mirror of https://github.com/Askill/optar.git
added S3 support
This commit is contained in:
parent 28df77ea2c
commit c3bb229011
@@ -0,0 +1,4 @@
+from src.Watcher import Watcher
+
+if __name__ == "__main__":
+    Watcher("./optar/sites.txt", "./optar/keywords.txt").watch()
@@ -3,3 +3,4 @@ lxml==5.2.2
 requests==2.32.3
 trafilatura==1.11.0
 beautifulsoup4==4.12.3
+boto3==1.34.144
@@ -51,11 +51,9 @@ class SiteReader:
 
         downloaded_url = trafilatura.fetch_url(url)
         try:
-            a = trafilatura.extract(downloaded_url, output_format="json", with_metadata=True, include_comments=False,
-                                    date_extraction_params={'extensive_search': True, 'original_date': True})
+            a = trafilatura.extract(downloaded_url, output_format="json", with_metadata=True, include_comments=False)
         except AttributeError:
-            a = trafilatura.extract(downloaded_url, output_format="json", with_metadata=True,
-                                    date_extraction_params={'extensive_search': True, 'original_date': True})
+            a = trafilatura.extract(downloaded_url, output_format="json", with_metadata=True)
         if a:
             json_output = json.loads(a)
             return json_output['text']
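
For reference, the extraction path this hunk keeps can be exercised on its own. A minimal sketch, assuming trafilatura is installed; the function name extract_text is illustrative, not part of the commit:

import json

import trafilatura


def extract_text(url):
    # fetch_url returns the page HTML as a string, or None on failure
    downloaded = trafilatura.fetch_url(url)
    if downloaded is None:
        return None
    # with output_format="json", extract returns a JSON string or None;
    # with_metadata=True includes title/author/date fields alongside "text"
    result = trafilatura.extract(downloaded, output_format="json",
                                 with_metadata=True, include_comments=False)
    return json.loads(result)["text"] if result else None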
@@ -0,0 +1,36 @@
+import json
+import os
+from pathlib import Path
+from typing import List, Optional
+import boto3
+
+
+class SiteStoreS3:
+    def __init__(self, bucket):
+        self.bucket = bucket
+
+    def get_site_history(self, cache_path) -> Optional[list[str]]:
+        # make sure the prefix ends with "/" so only keys under this folder match
+        prefix = cache_path
+        if cache_path[-1] != "/":
+            prefix += "/"
+
+        s3 = boto3.client("s3")
+        result = s3.list_objects_v2(Bucket=self.bucket, Prefix=prefix, MaxKeys=21)
+        if "Contents" not in result:
+            return None
+        # return a sorted list of file names (keys), which are the creation dates; strip the prefix and skip the first element, which is the prefix itself
+        return sorted([x["Key"][len(prefix):] for x in result["Contents"][1:]])
+
+    def get_site_links(self, path):
+        s3 = boto3.resource("s3")
+        obj = s3.Object(self.bucket, path)
+        data = obj.get()["Body"]
+        return json.load(data)
+
+    def persist(self, path, data):
+        s3 = boto3.resource("s3")
+        s3object = s3.Object(self.bucket, path)
+        s3object.put(
+            Body=json.dumps(data).encode("UTF-8")
+        )
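
A quick usage sketch of the new store. The bucket name comes from this commit; the site key and payload are made up for illustration:

store = SiteStoreS3("optar-dev-cache")

# snapshots are stored under "<fqdn>/<timestamp>.json"
store.persist("example.com/2024-07-14_12-00-00.json",
              {"https://example.com/a": "link text"})

# list the snapshot file names (timestamps) under the prefix, sorted ascending
history = store.get_site_history("example.com/")

if history:
    # load the most recent snapshot back as a dict
    latest = store.get_site_links(f"example.com/{history[-1]}")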
@@ -5,12 +5,12 @@ from deepdiff import DeepDiff
 
 from src.Crawler import Crawler
 from src.SiteReader import SiteReader
-from src.SiteStore import SiteStore
+from src.SiteStoreS3 import SiteStoreS3
 
 
 class Watcher:
     def __init__(self, sites_source_path, keywords_source_path) -> None:
-        self.site_store = SiteStore()
+        self.site_store = SiteStoreS3("optar-dev-cache")
         self.site_reader = SiteReader()
         self.keywords_source_path = keywords_source_path
         self.sites_source_path = sites_source_path
@@ -35,7 +35,7 @@ class Watcher:
         for site in sites:
             crawler = Crawler()
             crawler.run(site, 1)
-            crawler.persist(f"./cache/{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json")
+            self.site_store.persist(f"{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json", crawler.get_nodes())
 
         contents = [self.get_new_content(site) for site in sites]
         # TODO: improve handling of None
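
A detail the key layout relies on: the zero-padded, year-first timestamp makes lexicographic order equal chronological order, so get_site_history can sort the S3 key names directly. For illustration:

from datetime import datetime

# "%Y-%m-%d_%H-%M-%S" orders fields from most to least significant and
# zero-pads them, so plain string sorting of these keys is chronological
stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
key = f"example.com/{stamp}.json"  # e.g. "example.com/2024-07-14_12-00-00.json"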
@@ -56,16 +56,17 @@ class Watcher:
 
     def get_new_content(self, url) -> Dict[str, str]:
         """ get all past iterations of a site by the fully qualified domain name """
-        list_of_files = self.site_store.get_site_history(f"./cache/{self.remove_protocol(url)}/")
+        list_of_files = self.site_store.get_site_history(f"{self.remove_protocol(url)}/")
 
         if len(list_of_files) >= 2:
-            prev_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-2]}")
-            current_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")
+            prev_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-2]}")
+            current_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}")
             news = DeepDiff(prev_version, current_version, ignore_order=True)
+            new_links = [z.split("'")[1] for z in news.get("dictionary_item_added", [])]
         else:
-            news = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")
-
-            sites_contents = self.site_reader.get_sites_content_static(list(news.keys()))
+            # first snapshot: every stored link counts as new
+            new_links = list(self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}").keys())
+        sites_contents = self.site_reader.get_sites_content_static(new_links)
 
         return sites_contents
 
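
The z.split("'")[1] expression works because DeepDiff reports added dictionary keys as path strings of the form root['<key>']. A minimal sketch with made-up URLs:

from deepdiff import DeepDiff

prev = {"https://example.com/a": "old post"}
curr = {"https://example.com/a": "old post",
        "https://example.com/b": "new post"}

diff = DeepDiff(prev, curr, ignore_order=True)
# diff["dictionary_item_added"] contains "root['https://example.com/b']";
# splitting on the single quote recovers the URL itself
added = [z.split("'")[1] for z in diff.get("dictionary_item_added", [])]
print(added)  # ['https://example.com/b']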