From aa37a25f1c5e34583951c521edcf6634c4fec175 Mon Sep 17 00:00:00 2001 From: Askill Date: Wed, 17 Jul 2024 20:32:59 +0200 Subject: [PATCH] fixed tests --- dev.py | 6 ++--- keywords.txt | 2 +- main.py | 6 ++--- prod.py | 6 ++--- sites.txt | 2 +- src/SiteStore.py | 19 -------------- src/SiteStoreS3.py | 2 +- src/Watcher.py | 23 ++++++++-------- tests/MockSiteStore.py | 26 +++++++++++++++++++ .../2024-07-15_16-30-47.json | 0 .../2024-07-16_16-30-47.json | 0 tests/watcher_test.py | 6 +++-- 12 files changed, 53 insertions(+), 45 deletions(-) delete mode 100644 src/SiteStore.py create mode 100644 tests/MockSiteStore.py rename tests/cache/{ => www.patricematz.de}/2024-07-15_16-30-47.json (100%) rename tests/cache/{ => www.patricematz.de}/2024-07-16_16-30-47.json (100%) diff --git a/dev.py b/dev.py index 6e7887f..101c559 100644 --- a/dev.py +++ b/dev.py @@ -1,6 +1,6 @@ -from optar.src.Crawler import Crawler -from optar.src.SiteReader import SiteReader -from optar.src.SiteStoreS3 import SiteStoreS3 +from src.Crawler import Crawler +from src.SiteReader import SiteReader +from src.SiteStoreS3 import SiteStoreS3 from src.Watcher import Watcher if __name__ == "__main__": diff --git a/keywords.txt b/keywords.txt index 104fafc..f00bdb9 100644 --- a/keywords.txt +++ b/keywords.txt @@ -1 +1 @@ -Oktober \ No newline at end of file +Engineer \ No newline at end of file diff --git a/main.py b/main.py index 78cfa9c..1f174a1 100644 --- a/main.py +++ b/main.py @@ -1,6 +1,6 @@ -from optar.src.Crawler import Crawler -from optar.src.SiteReader import SiteReader -from optar.src.SiteStoreS3 import SiteStoreS3 +from src.Crawler import Crawler +from src.SiteReader import SiteReader +from src.SiteStoreS3 import SiteStoreS3 from src.Watcher import Watcher if __name__ == "__main__": diff --git a/prod.py b/prod.py index 6db38e0..44e5068 100644 --- a/prod.py +++ b/prod.py @@ -1,6 +1,6 @@ -from optar.src.Crawler import Crawler -from optar.src.SiteReader import SiteReader -from optar.src.SiteStoreS3 import SiteStoreS3 +from src.Crawler import Crawler +from src.SiteReader import SiteReader +from src.SiteStoreS3 import SiteStoreS3 from src.Watcher import Watcher if __name__ == "__main__": diff --git a/sites.txt b/sites.txt index 6b14489..ad85345 100644 --- a/sites.txt +++ b/sites.txt @@ -1 +1 @@ -https://www.patricematz.de/ \ No newline at end of file +https://www.patricematz.de/CV \ No newline at end of file diff --git a/src/SiteStore.py b/src/SiteStore.py deleted file mode 100644 index 322d6cf..0000000 --- a/src/SiteStore.py +++ /dev/null @@ -1,19 +0,0 @@ -import json -import os -from typing import List, Optional - - -class SiteStore: - def __init__(self): - pass - - @staticmethod - def get_site_history(cache_path) -> Optional[list[str]]: - if not os.path.isdir(cache_path): - return None - return sorted(os.listdir(cache_path)) - - @staticmethod - def get_site_links(path): - with open(path, 'r') as fp: - return json.load(fp) diff --git a/src/SiteStoreS3.py b/src/SiteStoreS3.py index 6a05f56..74317c1 100644 --- a/src/SiteStoreS3.py +++ b/src/SiteStoreS3.py @@ -20,7 +20,7 @@ class SiteStoreS3: if "Contents"not in result: return None # return a sorted list of file names (key), which are the creation dates, ignore the prefix (len(cache_path)), ignore the first element, as this is only the prefix - return sorted([x["Key"][len(cache_path) :] for x in result["Contents"][1:]], reverse=True) + return sorted([x["Key"][len(cache_path) :] for x in result["Contents"]], reverse=True) def get_site_links(self, path): s3 = boto3.resource('s3') diff --git a/src/Watcher.py b/src/Watcher.py index e6be13e..4a40b22 100644 --- a/src/Watcher.py +++ b/src/Watcher.py @@ -1,17 +1,13 @@ import time from datetime import datetime -from typing import List, Dict, Optional +from typing import List, Dict from deepdiff import DeepDiff -from optar.src.Crawler import Crawler -from optar.src.SiteReader import SiteReader -from optar.src.SiteStoreS3 import SiteStoreS3 - - class Watcher: + # there should be a type hint for site_store and site_reader, referencing interfaces, which these implement, for better auto complete and DX def __init__(self, site_store, site_reader, sites_source_path, keywords_source_path) -> None: - self.site_store = SiteStoreS3("optar-dev-cache") - self.site_reader = SiteReader() + self.site_store = site_store + self.site_reader = site_reader self.keywords_source_path = keywords_source_path self.sites_source_path = sites_source_path @@ -35,6 +31,8 @@ class Watcher: for site in sites: crawler.run(site) self.site_store.persist(f"{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json", crawler.get_nodes()) + # do NOT overload the target + time.sleep(1) contents = [self.get_new_content(site) for site in sites] # TODO: improve handleing of None @@ -62,10 +60,11 @@ class Watcher: if len(list_of_files) >= 2: prev_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-2]}") - current_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}") - news = DeepDiff(prev_version, current_version, ignore_order=True) else: - news = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}") + prev_version = {url: []} + current_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}") + news = DeepDiff(prev_version, current_version, ignore_order=True) + if news: sites_contents = self.site_reader.get_sites_content_static(self.get_added_urls(news)) return sites_contents @@ -73,7 +72,7 @@ class Watcher: @staticmethod def get_added_urls( news): - return [z.split("'")[1] for z in list(news["dictionary_item_added"])] + return [z.split("'")[1] for z in list(news["iterable_item_added"])] @staticmethod def search_sites(url, content, keywords: List[str]): diff --git a/tests/MockSiteStore.py b/tests/MockSiteStore.py new file mode 100644 index 0000000..712988f --- /dev/null +++ b/tests/MockSiteStore.py @@ -0,0 +1,26 @@ +import json +import os +from pathlib import Path +from typing import List, Optional + + +class SiteStore: + def __init__(self): + pass + + @staticmethod + def get_site_history(in_path) -> Optional[list[str]]: + cache_path = "./cache/" + in_path + if not os.path.isdir(cache_path): + return [] + return sorted(os.listdir(cache_path)) + + @staticmethod + def get_site_links(in_path): + cache_path = "./cache/" + in_path + with open(cache_path, 'r') as fp: + return json.load(fp) + + @staticmethod + def persist(self, data): + return \ No newline at end of file diff --git a/tests/cache/2024-07-15_16-30-47.json b/tests/cache/www.patricematz.de/2024-07-15_16-30-47.json similarity index 100% rename from tests/cache/2024-07-15_16-30-47.json rename to tests/cache/www.patricematz.de/2024-07-15_16-30-47.json diff --git a/tests/cache/2024-07-16_16-30-47.json b/tests/cache/www.patricematz.de/2024-07-16_16-30-47.json similarity index 100% rename from tests/cache/2024-07-16_16-30-47.json rename to tests/cache/www.patricematz.de/2024-07-16_16-30-47.json diff --git a/tests/watcher_test.py b/tests/watcher_test.py index 047ec41..ae0d79e 100644 --- a/tests/watcher_test.py +++ b/tests/watcher_test.py @@ -1,6 +1,7 @@ +import os from optar.src.SiteReader import SiteReader from optar.src.Watcher import Watcher -from optar.src.SiteStore import SiteStore +from optar.tests.MockSiteStore import SiteStore def test_search_sites__found(): @@ -31,7 +32,8 @@ def test_compare_sites(): self._links[url] = [url] def get_nodes(self): return self._links - + assert os.path.isdir("./cache/www.patricematz.de") + assert len(SiteStore.get_site_history("www.patricematz.de")) >= 2 # the links given in this sites.txt should be to either local files, or a local mock server # this is not implemented, as it would be trivial but time consuming watcher = Watcher(SiteStore(), SiteReader(), "./sites.txt", "keywords.txt")