From aa37a25f1c5e34583951c521edcf6634c4fec175 Mon Sep 17 00:00:00 2001
From: Askill <ofjokg@gmail.com>
Date: Wed, 17 Jul 2024 20:32:59 +0200
Subject: [PATCH] fixed tests

---
 dev.py                                        |  6 ++---
 keywords.txt                                  |  2 +-
 main.py                                       |  6 ++---
 prod.py                                       |  6 ++---
 sites.txt                                     |  2 +-
 src/SiteStore.py                              | 19 --------------
 src/SiteStoreS3.py                            |  2 +-
 src/Watcher.py                                | 23 ++++++++--------
 tests/MockSiteStore.py                        | 26 +++++++++++++++++++
 .../2024-07-15_16-30-47.json                  |  0
 .../2024-07-16_16-30-47.json                  |  0
 tests/watcher_test.py                         |  6 +++--
 12 files changed, 53 insertions(+), 45 deletions(-)
 delete mode 100644 src/SiteStore.py
 create mode 100644 tests/MockSiteStore.py
 rename tests/cache/{ => www.patricematz.de}/2024-07-15_16-30-47.json (100%)
 rename tests/cache/{ => www.patricematz.de}/2024-07-16_16-30-47.json (100%)

diff --git a/dev.py b/dev.py
index 6e7887f..101c559 100644
--- a/dev.py
+++ b/dev.py
@@ -1,6 +1,6 @@
-from optar.src.Crawler import Crawler
-from optar.src.SiteReader import SiteReader
-from optar.src.SiteStoreS3 import SiteStoreS3
+from src.Crawler import Crawler
+from src.SiteReader import SiteReader
+from src.SiteStoreS3 import SiteStoreS3
 from src.Watcher import Watcher
 
 if __name__ == "__main__":
diff --git a/keywords.txt b/keywords.txt
index 104fafc..f00bdb9 100644
--- a/keywords.txt
+++ b/keywords.txt
@@ -1 +1 @@
-Oktober
\ No newline at end of file
+Engineer
\ No newline at end of file
diff --git a/main.py b/main.py
index 78cfa9c..1f174a1 100644
--- a/main.py
+++ b/main.py
@@ -1,6 +1,6 @@
-from optar.src.Crawler import Crawler
-from optar.src.SiteReader import SiteReader
-from optar.src.SiteStoreS3 import SiteStoreS3
+from src.Crawler import Crawler
+from src.SiteReader import SiteReader
+from src.SiteStoreS3 import SiteStoreS3
 from src.Watcher import Watcher
 
 if __name__ == "__main__":
diff --git a/prod.py b/prod.py
index 6db38e0..44e5068 100644
--- a/prod.py
+++ b/prod.py
@@ -1,6 +1,6 @@
-from optar.src.Crawler import Crawler
-from optar.src.SiteReader import SiteReader
-from optar.src.SiteStoreS3 import SiteStoreS3
+from src.Crawler import Crawler
+from src.SiteReader import SiteReader
+from src.SiteStoreS3 import SiteStoreS3
 from src.Watcher import Watcher
 
 if __name__ == "__main__":
diff --git a/sites.txt b/sites.txt
index 6b14489..ad85345 100644
--- a/sites.txt
+++ b/sites.txt
@@ -1 +1 @@
-https://www.patricematz.de/
\ No newline at end of file
+https://www.patricematz.de/CV
\ No newline at end of file
diff --git a/src/SiteStore.py b/src/SiteStore.py
deleted file mode 100644
index 322d6cf..0000000
--- a/src/SiteStore.py
+++ /dev/null
@@ -1,19 +0,0 @@
-import json
-import os
-from typing import List, Optional
-
-
-class SiteStore:
-    def __init__(self):
-        pass
-
-    @staticmethod
-    def get_site_history(cache_path) -> Optional[list[str]]:
-        if not os.path.isdir(cache_path):
-            return None
-        return sorted(os.listdir(cache_path))
-
-    @staticmethod
-    def get_site_links(path):
-        with open(path, 'r') as fp:
-            return json.load(fp)
diff --git a/src/SiteStoreS3.py b/src/SiteStoreS3.py
index 6a05f56..74317c1 100644
--- a/src/SiteStoreS3.py
+++ b/src/SiteStoreS3.py
@@ -20,7 +20,7 @@ class SiteStoreS3:
         if "Contents"not in result:
             return None
         # return a sorted list of file names (key), which are the creation dates, ignore the prefix (len(cache_path)), ignore the first element, as this is only the prefix
-        return sorted([x["Key"][len(cache_path) :] for x in result["Contents"][1:]], reverse=True)
+        return sorted([x["Key"][len(cache_path) :] for x in result["Contents"]], reverse=True)
 
     def get_site_links(self, path):
         s3 = boto3.resource('s3')
diff --git a/src/Watcher.py b/src/Watcher.py
index e6be13e..4a40b22 100644
--- a/src/Watcher.py
+++ b/src/Watcher.py
@@ -1,17 +1,13 @@
 import time
 from datetime import datetime
-from typing import List, Dict, Optional
+from typing import List, Dict
 from deepdiff import DeepDiff
 
-from optar.src.Crawler import Crawler
-from optar.src.SiteReader import SiteReader
-from optar.src.SiteStoreS3 import SiteStoreS3
-
-
 class Watcher:
+    # TODO: type-hint site_store and site_reader against the interfaces they implement, for better autocompletion and developer experience
     def __init__(self, site_store, site_reader, sites_source_path, keywords_source_path) -> None:
-        self.site_store = SiteStoreS3("optar-dev-cache")
-        self.site_reader = SiteReader()
+        self.site_store = site_store
+        self.site_reader = site_reader
         self.keywords_source_path = keywords_source_path
         self.sites_source_path = sites_source_path
 
@@ -35,6 +31,8 @@ class Watcher:
             for site in sites:
                 crawler.run(site)
                 self.site_store.persist(f"{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json", crawler.get_nodes())
+                # do NOT overload the target
+                time.sleep(1)
 
             contents = [self.get_new_content(site) for site in sites]
             # TODO: improve handleing of None
@@ -62,10 +60,11 @@ class Watcher:
 
         if len(list_of_files) >= 2:
             prev_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-2]}")
-            current_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}")
-            news = DeepDiff(prev_version, current_version, ignore_order=True)
         else:
-            news = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}")
+            prev_version = {url: []}
+        current_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}")
+        news = DeepDiff(prev_version, current_version, ignore_order=True)
+
         if news:
             sites_contents = self.site_reader.get_sites_content_static(self.get_added_urls(news))
             return sites_contents
@@ -73,7 +72,7 @@ class Watcher:
 
     @staticmethod
     def get_added_urls( news):
-        return [z.split("'")[1] for z in list(news["dictionary_item_added"])]
+        return [z.split("'")[1] for z in news.get("iterable_item_added", [])]  # key is missing when the diff holds no list additions
 
     @staticmethod
     def search_sites(url, content, keywords: List[str]):
diff --git a/tests/MockSiteStore.py b/tests/MockSiteStore.py
new file mode 100644
index 0000000..712988f
--- /dev/null
+++ b/tests/MockSiteStore.py
@@ -0,0 +1,26 @@
+import json
+import os
+from pathlib import Path
+from typing import List, Optional
+
+
+class SiteStore:
+    """File-backed stand-in for the real site store; reads from ./cache/."""
+
+    @staticmethod
+    def get_site_history(in_path) -> Optional[list[str]]:
+        """Return the sorted entry names in ./cache/<in_path>, or [] if it is no directory."""
+        cache_path = "./cache/" + in_path
+        if not os.path.isdir(cache_path):
+            return []
+        return sorted(os.listdir(cache_path))
+
+    @staticmethod
+    def get_site_links(in_path):
+        """Parse and return the JSON stored in the file ./cache/<in_path>."""
+        with open("./cache/" + in_path, 'r') as fp:
+            return json.load(fp)
+
+    @staticmethod
+    def persist(path, data):
+        """Accept and discard a snapshot; this mock never writes anything."""
\ No newline at end of file
diff --git a/tests/cache/2024-07-15_16-30-47.json b/tests/cache/www.patricematz.de/2024-07-15_16-30-47.json
similarity index 100%
rename from tests/cache/2024-07-15_16-30-47.json
rename to tests/cache/www.patricematz.de/2024-07-15_16-30-47.json
diff --git a/tests/cache/2024-07-16_16-30-47.json b/tests/cache/www.patricematz.de/2024-07-16_16-30-47.json
similarity index 100%
rename from tests/cache/2024-07-16_16-30-47.json
rename to tests/cache/www.patricematz.de/2024-07-16_16-30-47.json
diff --git a/tests/watcher_test.py b/tests/watcher_test.py
index 047ec41..ae0d79e 100644
--- a/tests/watcher_test.py
+++ b/tests/watcher_test.py
@@ -1,6 +1,7 @@
+import os
 from optar.src.SiteReader import SiteReader
 from optar.src.Watcher import Watcher 
-from optar.src.SiteStore import SiteStore
+from optar.tests.MockSiteStore import SiteStore
 
 def test_search_sites__found():
 
@@ -31,7 +32,8 @@ def test_compare_sites():
             self._links[url] = [url]
         def get_nodes(self):
             return self._links
-
+    assert os.path.isdir("./cache/www.patricematz.de")
+    assert len(SiteStore.get_site_history("www.patricematz.de")) >= 2
     # the links given in this sites.txt should be to either local files, or a local mock server
     # this is not implemented, as it would be trivial but time consuming
     watcher = Watcher(SiteStore(), SiteReader(), "./sites.txt", "keywords.txt")