From 61dd1190b15de274ce9a448f425a73329e6b549b Mon Sep 17 00:00:00 2001 From: Askill Date: Wed, 17 Jul 2024 19:54:31 +0200 Subject: [PATCH 1/5] test --- readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/readme.md b/readme.md index 01b9d75..0fc21e4 100644 --- a/readme.md +++ b/readme.md @@ -2,5 +2,5 @@ This tool crawles all pages on a given website to the provided deapth and finds new pages by comparing the new site tree to the cached one. All new pages are then checked for containing any of the provided keywords. If there is a match the page will be higlighted for the reader. Default timeout 1h, list of keywords and sites can be changed while the software is running. - + Only retrieves static content, client side rendered content crawling is not implemented. From 4b4b5c4b66bfb41bdcf7d05dee31065595038147 Mon Sep 17 00:00:00 2001 From: Askill Date: Wed, 17 Jul 2024 19:55:36 +0200 Subject: [PATCH 2/5] simplified test output --- .github/workflows/pytest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 04ea076..cc5e51b 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -32,4 +32,4 @@ jobs: pip install -r requirements.txt pip install pytest==8.2.2 - pytest tests/watcher_test.py --doctest-modules --junitxml=junit/test-results.xml --cov=com --cov-report=xml --cov-report=html \ No newline at end of file + pytest tests/watcher_test.py \ No newline at end of file From 83c3e3b54a52b6ad43c80b9d80d9d61f789d35da Mon Sep 17 00:00:00 2001 From: Askill Date: Wed, 17 Jul 2024 19:56:14 +0200 Subject: [PATCH 3/5] named test pytest --- .github/workflows/pytest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index cc5e51b..3304c70 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -7,7 +7,7 @@ on: jobs: pytest: - name: 'Terraform' + name: 'Pytest' runs-on: ubuntu-latest strategy: matrix: From aa37a25f1c5e34583951c521edcf6634c4fec175 Mon Sep 17 00:00:00 2001 From: Askill Date: Wed, 17 Jul 2024 20:32:59 +0200 Subject: [PATCH 4/5] fixed tests --- dev.py | 6 ++--- keywords.txt | 2 +- main.py | 6 ++--- prod.py | 6 ++--- sites.txt | 2 +- src/SiteStore.py | 19 -------------- src/SiteStoreS3.py | 2 +- src/Watcher.py | 23 ++++++++-------- tests/MockSiteStore.py | 26 +++++++++++++++++++ .../2024-07-15_16-30-47.json | 0 .../2024-07-16_16-30-47.json | 0 tests/watcher_test.py | 6 +++-- 12 files changed, 53 insertions(+), 45 deletions(-) delete mode 100644 src/SiteStore.py create mode 100644 tests/MockSiteStore.py rename tests/cache/{ => www.patricematz.de}/2024-07-15_16-30-47.json (100%) rename tests/cache/{ => www.patricematz.de}/2024-07-16_16-30-47.json (100%) diff --git a/dev.py b/dev.py index 6e7887f..101c559 100644 --- a/dev.py +++ b/dev.py @@ -1,6 +1,6 @@ -from optar.src.Crawler import Crawler -from optar.src.SiteReader import SiteReader -from optar.src.SiteStoreS3 import SiteStoreS3 +from src.Crawler import Crawler +from src.SiteReader import SiteReader +from src.SiteStoreS3 import SiteStoreS3 from src.Watcher import Watcher if __name__ == "__main__": diff --git a/keywords.txt b/keywords.txt index 104fafc..f00bdb9 100644 --- a/keywords.txt +++ b/keywords.txt @@ -1 +1 @@ -Oktober \ No newline at end of file +Engineer \ No newline at end of file diff --git a/main.py b/main.py index 78cfa9c..1f174a1 100644 --- a/main.py +++ b/main.py @@ -1,6 +1,6 @@ -from optar.src.Crawler import Crawler -from optar.src.SiteReader import SiteReader -from optar.src.SiteStoreS3 import SiteStoreS3 +from src.Crawler import Crawler +from src.SiteReader import SiteReader +from src.SiteStoreS3 import SiteStoreS3 from src.Watcher import Watcher if __name__ == "__main__": diff --git a/prod.py b/prod.py index 6db38e0..44e5068 100644 --- a/prod.py +++ b/prod.py @@ -1,6 +1,6 @@ -from optar.src.Crawler import Crawler -from optar.src.SiteReader import SiteReader -from optar.src.SiteStoreS3 import SiteStoreS3 +from src.Crawler import Crawler +from src.SiteReader import SiteReader +from src.SiteStoreS3 import SiteStoreS3 from src.Watcher import Watcher if __name__ == "__main__": diff --git a/sites.txt b/sites.txt index 6b14489..ad85345 100644 --- a/sites.txt +++ b/sites.txt @@ -1 +1 @@ -https://www.patricematz.de/ \ No newline at end of file +https://www.patricematz.de/CV \ No newline at end of file diff --git a/src/SiteStore.py b/src/SiteStore.py deleted file mode 100644 index 322d6cf..0000000 --- a/src/SiteStore.py +++ /dev/null @@ -1,19 +0,0 @@ -import json -import os -from typing import List, Optional - - -class SiteStore: - def __init__(self): - pass - - @staticmethod - def get_site_history(cache_path) -> Optional[list[str]]: - if not os.path.isdir(cache_path): - return None - return sorted(os.listdir(cache_path)) - - @staticmethod - def get_site_links(path): - with open(path, 'r') as fp: - return json.load(fp) diff --git a/src/SiteStoreS3.py b/src/SiteStoreS3.py index 6a05f56..74317c1 100644 --- a/src/SiteStoreS3.py +++ b/src/SiteStoreS3.py @@ -20,7 +20,7 @@ class SiteStoreS3: if "Contents"not in result: return None # return a sorted list of file names (key), which are the creation dates, ignore the prefix (len(cache_path)), ignore the first element, as this is only the prefix - return sorted([x["Key"][len(cache_path) :] for x in result["Contents"][1:]], reverse=True) + return sorted([x["Key"][len(cache_path) :] for x in result["Contents"]], reverse=True) def get_site_links(self, path): s3 = boto3.resource('s3') diff --git a/src/Watcher.py b/src/Watcher.py index e6be13e..4a40b22 100644 --- a/src/Watcher.py +++ b/src/Watcher.py @@ -1,17 +1,13 @@ import time from datetime import datetime -from typing import List, Dict, Optional +from typing import List, Dict from deepdiff import DeepDiff -from optar.src.Crawler import Crawler -from optar.src.SiteReader import SiteReader -from optar.src.SiteStoreS3 import SiteStoreS3 - - class Watcher: + # there should be a type hint for site_store and site_reader, referencing interfaces, which these implement, for better auto complete and DX def __init__(self, site_store, site_reader, sites_source_path, keywords_source_path) -> None: - self.site_store = SiteStoreS3("optar-dev-cache") - self.site_reader = SiteReader() + self.site_store = site_store + self.site_reader = site_reader self.keywords_source_path = keywords_source_path self.sites_source_path = sites_source_path @@ -35,6 +31,8 @@ class Watcher: for site in sites: crawler.run(site) self.site_store.persist(f"{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json", crawler.get_nodes()) + # do NOT overload the target + time.sleep(1) contents = [self.get_new_content(site) for site in sites] # TODO: improve handleing of None @@ -62,10 +60,11 @@ class Watcher: if len(list_of_files) >= 2: prev_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-2]}") - current_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}") - news = DeepDiff(prev_version, current_version, ignore_order=True) else: - news = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}") + prev_version = {url: []} + current_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}") + news = DeepDiff(prev_version, current_version, ignore_order=True) + if news: sites_contents = self.site_reader.get_sites_content_static(self.get_added_urls(news)) return sites_contents @@ -73,7 +72,7 @@ class Watcher: @staticmethod def get_added_urls( news): - return [z.split("'")[1] for z in list(news["dictionary_item_added"])] + return [z.split("'")[1] for z in list(news["iterable_item_added"])] @staticmethod def search_sites(url, content, keywords: List[str]): diff --git a/tests/MockSiteStore.py b/tests/MockSiteStore.py new file mode 100644 index 0000000..712988f --- /dev/null +++ b/tests/MockSiteStore.py @@ -0,0 +1,26 @@ +import json +import os +from pathlib import Path +from typing import List, Optional + + +class SiteStore: + def __init__(self): + pass + + @staticmethod + def get_site_history(in_path) -> Optional[list[str]]: + cache_path = "./cache/" + in_path + if not os.path.isdir(cache_path): + return [] + return sorted(os.listdir(cache_path)) + + @staticmethod + def get_site_links(in_path): + cache_path = "./cache/" + in_path + with open(cache_path, 'r') as fp: + return json.load(fp) + + @staticmethod + def persist(self, data): + return \ No newline at end of file diff --git a/tests/cache/2024-07-15_16-30-47.json b/tests/cache/www.patricematz.de/2024-07-15_16-30-47.json similarity index 100% rename from tests/cache/2024-07-15_16-30-47.json rename to tests/cache/www.patricematz.de/2024-07-15_16-30-47.json diff --git a/tests/cache/2024-07-16_16-30-47.json b/tests/cache/www.patricematz.de/2024-07-16_16-30-47.json similarity index 100% rename from tests/cache/2024-07-16_16-30-47.json rename to tests/cache/www.patricematz.de/2024-07-16_16-30-47.json diff --git a/tests/watcher_test.py b/tests/watcher_test.py index 047ec41..ae0d79e 100644 --- a/tests/watcher_test.py +++ b/tests/watcher_test.py @@ -1,6 +1,7 @@ +import os from optar.src.SiteReader import SiteReader from optar.src.Watcher import Watcher -from optar.src.SiteStore import SiteStore +from optar.tests.MockSiteStore import SiteStore def test_search_sites__found(): @@ -31,7 +32,8 @@ def test_compare_sites(): self._links[url] = [url] def get_nodes(self): return self._links - + assert os.path.isdir("./cache/www.patricematz.de") + assert len(SiteStore.get_site_history("www.patricematz.de")) >= 2 # the links given in this sites.txt should be to either local files, or a local mock server # this is not implemented, as it would be trivial but time consuming watcher = Watcher(SiteStore(), SiteReader(), "./sites.txt", "keywords.txt") From 2d1c2b137085d93ff97e875cc784aa90c31f65c9 Mon Sep 17 00:00:00 2001 From: Askill Date: Wed, 17 Jul 2024 20:34:25 +0200 Subject: [PATCH 5/5] split action into 2, install deps and run test --- .github/workflows/pytest.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 3304c70..03b68c7 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -31,5 +31,6 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt pip install pytest==8.2.2 - + - name: run test + run: | pytest tests/watcher_test.py \ No newline at end of file