diff --git a/dev.py b/dev.py
new file mode 100644
index 0000000..e57f248
--- /dev/null
+++ b/dev.py
@@ -0,0 +1,4 @@
+from src.Watcher import Watcher
+
+if __name__ == "__main__":
+    Watcher("./optar/sites.txt", "./optar/keywords.txt").watch()
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index c9044db..d1c12ed 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,4 +2,5 @@ deepdiff==7.0.1
 lxml==5.2.2
 requests==2.32.3
 trafilatura==1.11.0
-beautifulsoup4==4.12.3
\ No newline at end of file
+beautifulsoup4==4.12.3
+boto3==1.34.144
\ No newline at end of file
diff --git a/src/SiteReader.py b/src/SiteReader.py
index 9f6abae..b95696e 100644
--- a/src/SiteReader.py
+++ b/src/SiteReader.py
@@ -51,11 +51,9 @@ class SiteReader:
         downloaded_url = trafilatura.fetch_url(url)
 
         try:
-            a = trafilatura.extract(downloaded_url, output_format="json", with_metadata=True, include_comments=False,
-                                    date_extraction_params={'extensive_search': True, 'original_date': True})
+            a = trafilatura.extract(downloaded_url, output_format="json", with_metadata=True, include_comments=False)
         except AttributeError:
-            a = trafilatura.extract(downloaded_url, output_format="json", with_metadata=True,
-                                    date_extraction_params={'extensive_search': True, 'original_date': True})
+            a = trafilatura.extract(downloaded_url, output_format="json", with_metadata=True)
         if a:
             json_output = json.loads(a)
             return json_output['text']
diff --git a/src/SiteStoreS3.py b/src/SiteStoreS3.py
new file mode 100644
index 0000000..93f6f6c
--- /dev/null
+++ b/src/SiteStoreS3.py
@@ -0,0 +1,38 @@
+import json
+from typing import List, Optional
+
+import boto3
+
+
+class SiteStoreS3:
+    def __init__(self, bucket):
+        self.bucket = bucket
+
+    def get_site_history(self, cache_path) -> Optional[List[str]]:
+        # Normalize the prefix so it always ends with "/".
+        prefix = cache_path
+        if cache_path[-1] != "/":
+            prefix += "/"
+
+        s3 = boto3.client("s3")
+        # NOTE: MaxKeys caps the listing at 21 snapshots per site.
+        result = s3.list_objects_v2(Bucket=self.bucket, Prefix=prefix, MaxKeys=21)
+        if "Contents" not in result:
+            return None
+        # Return the sorted object names (the creation dates) with the prefix
+        # stripped; drop empty names such as a folder-marker object for the prefix itself.
+        names = [x["Key"][len(prefix):] for x in result["Contents"]]
+        return sorted(name for name in names if name)
+
+    def get_site_links(self, path):
+        s3 = boto3.resource('s3')
+        obj = s3.Object(self.bucket, path)
+        data = obj.get()['Body']
+        return json.load(data)
+
+    def persist(self, path, data):
+        s3 = boto3.resource('s3')
+        s3object = s3.Object(self.bucket, path)
+        s3object.put(
+            Body=json.dumps(data).encode('UTF-8')
+        )
\ No newline at end of file
diff --git a/src/Watcher.py b/src/Watcher.py
index e6c9db9..74bda2a 100644
--- a/src/Watcher.py
+++ b/src/Watcher.py
@@ -5,12 +5,12 @@ from deepdiff import DeepDiff
 
 from src.Crawler import Crawler
 from src.SiteReader import SiteReader
-from src.SiteStore import SiteStore
+from src.SiteStoreS3 import SiteStoreS3
 
 
 class Watcher:
     def __init__(self, sites_source_path, keywords_source_path) -> None:
-        self.site_store = SiteStore()
+        self.site_store = SiteStoreS3("optar-dev-cache")
         self.site_reader = SiteReader()
         self.keywords_source_path = keywords_source_path
         self.sites_source_path = sites_source_path
@@ -35,7 +35,7 @@
         for site in sites:
            crawler = Crawler()
            crawler.run(site, 1)
-            crawler.persist(f"./cache/{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json")
+            self.site_store.persist(f"{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json", crawler.get_nodes())
 
         contents = [self.get_new_content(site) for site in sites]
         # TODO: improve handleing of None
@@ -56,16 +56,22 @@
     def get_new_content(self, url) -> Dict[str, str]:
         """ get all past iterations of a site by the fully qualified domain name """
-        list_of_files = self.site_store.get_site_history(f"./cache/{self.remove_protocol(url)}/")
+        list_of_files = self.site_store.get_site_history(f"{self.remove_protocol(url)}/")
+        if not list_of_files:
+            return {}
         if len(list_of_files) >= 2:
-            prev_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-2]}")
-            current_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")
+            prev_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-2]}")
+            current_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}")
             news = DeepDiff(prev_version, current_version, ignore_order=True)
         else:
-            news = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")
-
-        sites_contents = self.site_reader.get_sites_content_static(list(news.keys()))
+            news = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}")
+        sites_contents = {}
+        # DeepDiff reports newly added links under "dictionary_item_added" as keys of
+        # the form "root['https://...']"; the split extracts the URL between the quotes.
+        if news and "dictionary_item_added" in news:
+            sites_contents = self.site_reader.get_sites_content_static(
+                [z.split("'")[1] for z in news["dictionary_item_added"]])
         return sites_contents
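
Note on the dictionary_item_added parsing in get_new_content: DeepDiff reports
added dictionary keys as strings of the form "root['<key>']", so splitting on
single quotes recovers the URL. A minimal standalone sketch (it only assumes
deepdiff as already pinned in requirements.txt; the example URLs are made up):

    from deepdiff import DeepDiff

    prev_version = {"https://example.com/a": "Page A"}
    current_version = {"https://example.com/a": "Page A",
                       "https://example.com/b": "Page B"}

    news = DeepDiff(prev_version, current_version, ignore_order=True)
    # Each entry iterates as a string like "root['https://example.com/b']".
    added = [z.split("'")[1] for z in news["dictionary_item_added"]]
    print(added)  # ['https://example.com/b']

One caveat: the split breaks if a URL itself contains a single quote.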
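
A quick smoke test for SiteStoreS3, mirroring how Watcher uses it. This is a
sketch, not part of the change: it assumes AWS credentials are available via
the usual boto3 lookup chain and that the named bucket exists and is writable
(any test bucket will do):

    from datetime import datetime

    from src.SiteStoreS3 import SiteStoreS3

    store = SiteStoreS3("optar-dev-cache")  # assumed test bucket
    site = "example.com/"
    snapshot = {"https://example.com/a": "Page A"}

    # Persist one snapshot keyed by timestamp, as Watcher.watch() does.
    store.persist(f"{site}{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json", snapshot)

    # History is the sorted list of snapshot names; the newest is last.
    history = store.get_site_history(site)
    print(history)
    print(store.get_site_links(f"{site}{history[-1]}"))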
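
One more caveat on get_site_history: MaxKeys=21 caps the listing, and S3
returns keys in ascending order, so once a site has more than 21 snapshots the
newest ones (the timestamps that sort last) are never returned. If long
histories are expected, a paginator avoids the truncation. A possible
replacement sketch (list_all_snapshots is a hypothetical name, not part of
this change):

    import boto3

    def list_all_snapshots(bucket, prefix):
        # Walk every page so histories longer than one listing are not cut off.
        s3 = boto3.client("s3")
        names = []
        pages = s3.get_paginator("list_objects_v2").paginate(Bucket=bucket, Prefix=prefix)
        for page in pages:
            names.extend(obj["Key"][len(prefix):] for obj in page.get("Contents", []))
        return sorted(name for name in names if name)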