mirror of https://github.com/Askill/optar.git
added S3 support
This commit is contained in:
parent 28df77ea2c
commit c3bb229011
@@ -0,0 +1,4 @@
+from src.Watcher import Watcher
+
+if __name__ == "__main__":
+    Watcher("./optar/sites.txt", "./optar/keywords.txt").watch()
requirements.txt
@@ -3,3 +3,4 @@ lxml==5.2.2
 requests==2.32.3
 trafilatura==1.11.0
 beautifulsoup4==4.12.3
+boto3==1.34.144
src/SiteReader.py
@@ -51,11 +51,9 @@ class SiteReader:
         downloaded_url = trafilatura.fetch_url(url)
         try:
-            a = trafilatura.extract(downloaded_url, output_format="json", with_metadata=True, include_comments=False,
-                                    date_extraction_params={'extensive_search': True, 'original_date': True})
+            a = trafilatura.extract(downloaded_url, output_format="json", with_metadata=True, include_comments=False)
         except AttributeError:
-            a = trafilatura.extract(downloaded_url, output_format="json", with_metadata=True,
-                                    date_extraction_params={'extensive_search': True, 'original_date': True})
+            a = trafilatura.extract(downloaded_url, output_format="json", with_metadata=True)
         if a:
             json_output = json.loads(a)
             return json_output['text']
 
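For context, the simplified extraction path above boils down to the following round trip (a minimal sketch; the URL is illustrative and error handling is elided):

import json

import trafilatura

# fetch the page and extract its main content as a JSON string
downloaded = trafilatura.fetch_url("https://example.com")
extracted = trafilatura.extract(downloaded, output_format="json",
                                with_metadata=True, include_comments=False)
if extracted:
    # the "text" field holds the extracted article body
    text = json.loads(extracted)["text"]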
src/SiteStoreS3.py
@@ -0,0 +1,36 @@
+import json
+import os
+from pathlib import Path
+from typing import List, Optional
+import boto3
+
+
+class SiteStoreS3:
+    def __init__(self, bucket):
+        self.bucket = bucket
+
+    def get_site_history(self, cache_path) -> Optional[list[str]]:
+        # make sure the prefix ends with a "/"
+        prefix = cache_path
+        if cache_path[-1] != "/":
+            prefix += "/"
+
+        s3 = boto3.client("s3")
+        result = s3.list_objects_v2(Bucket=self.bucket, Prefix=prefix, MaxKeys=21)
+        if "Contents" not in result:
+            return None
+        # return a sorted list of file names (keys), which are the creation dates; strip the prefix (len(cache_path)) and skip the first element, as it is only the prefix itself
+        return sorted([x["Key"][len(cache_path):] for x in result["Contents"][1:]])
+
+    def get_site_links(self, path):
+        s3 = boto3.resource('s3')
+        obj = s3.Object(self.bucket, path)
+        data = obj.get()['Body']
+        return json.load(data)
+
+    def persist(self, path, data):
+        s3 = boto3.resource('s3')
+        s3object = s3.Object(self.bucket, path)
+        s3object.put(
+            Body=(bytes(json.dumps(data).encode('UTF-8')))
+        )
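A minimal usage sketch of the new store, assuming valid AWS credentials and an existing bucket (the key names below are illustrative):

store = SiteStoreS3("optar-dev-cache")

# write one timestamped snapshot under a site-specific prefix
store.persist("example.com/2024-07-15_12-30-00.json", {"https://example.com/a": {}})

# list earlier snapshot names for that site; returns None when the prefix is empty
history = store.get_site_history("example.com/")

# load the newest snapshot back as a dict
if history:
    latest = store.get_site_links(f"example.com/{history[-1]}")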
src/Watcher.py
@@ -5,12 +5,12 @@ from deepdiff import DeepDiff
 from src.Crawler import Crawler
 from src.SiteReader import SiteReader
-from src.SiteStore import SiteStore
+from src.SiteStoreS3 import SiteStoreS3
 
 
 class Watcher:
     def __init__(self, sites_source_path, keywords_source_path) -> None:
-        self.site_store = SiteStore()
+        self.site_store = SiteStoreS3("optar-dev-cache")
         self.site_reader = SiteReader()
         self.keywords_source_path = keywords_source_path
         self.sites_source_path = sites_source_path
 
@@ -35,7 +35,7 @@ class Watcher:
         for site in sites:
             crawler = Crawler()
             crawler.run(site, 1)
-            crawler.persist(f"./cache/{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json")
+            self.site_store.persist(f"{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json", crawler.get_nodes())
 
         contents = [self.get_new_content(site) for site in sites]
         # TODO: improve handling of None
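The new persist call thus writes each crawl to an S3 key of the form {fqdn}/{timestamp}.json instead of a local ./cache path. A hypothetical illustration (remove_protocol is assumed to strip the URL scheme):

from datetime import datetime

def remove_protocol(url: str) -> str:
    # assumed behavior of Watcher.remove_protocol: drop the scheme, keep the rest
    return url.split("://", 1)[-1]

site = "https://example.com"
key = f"{remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json"
# e.g. "example.com/2024-07-15_12-30-00.json" — one snapshot per crawl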
@@ -56,16 +56,16 @@ class Watcher:
 
     def get_new_content(self, url) -> Dict[str, str]:
         """ get all past iterations of a site by the fully qualified domain name """
-        list_of_files = self.site_store.get_site_history(f"./cache/{self.remove_protocol(url)}/")
+        list_of_files = self.site_store.get_site_history(f"{self.remove_protocol(url)}/")
 
         if len(list_of_files) >= 2:
-            prev_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-2]}")
-            current_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")
+            prev_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-2]}")
+            current_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}")
             news = DeepDiff(prev_version, current_version, ignore_order=True)
         else:
-            news = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")
+            news = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}")
 
         if news:
-            sites_contents = self.site_reader.get_sites_content_static(list(news.keys()))
+            sites_contents = self.site_reader.get_sites_content_static([z.split("'")[1] for z in list(news["dictionary_item_added"])])
 
         return sites_contents
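The changed line parses the textual paths DeepDiff reports. A small sketch with hypothetical data shows why z.split("'")[1] recovers the newly added URLs:

from deepdiff import DeepDiff

prev = {"https://example.com/a": {}}
curr = {"https://example.com/a": {}, "https://example.com/b": {}}

news = DeepDiff(prev, curr, ignore_order=True)
# entries in "dictionary_item_added" look like "root['https://example.com/b']",
# so the substring between the single quotes is the added key itself
added = [z.split("'")[1] for z in list(news["dictionary_item_added"])]
# added == ["https://example.com/b"]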