mirror of https://github.com/Askill/optar.git
commit 7ea7ca8125
@@ -7,7 +7,7 @@ on:

jobs:
  pytest:
    name: 'Terraform'
    name: 'Pytest'
    runs-on: ubuntu-latest
    strategy:
      matrix:
@@ -31,5 +31,6 @@ jobs:
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install pytest==8.2.2

          pytest tests/watcher_test.py --doctest-modules --junitxml=junit/test-results.xml --cov=com --cov-report=xml --cov-report=html
      - name: run test
        run: |
          pytest tests/watcher_test.py
dev.py
@@ -1,6 +1,6 @@
from optar.src.Crawler import Crawler
from optar.src.SiteReader import SiteReader
from optar.src.SiteStoreS3 import SiteStoreS3
from src.Crawler import Crawler
from src.SiteReader import SiteReader
from src.SiteStoreS3 import SiteStoreS3
from src.Watcher import Watcher

if __name__ == "__main__":
@@ -1 +1 @@
Oktober
Engineer
main.py
@@ -1,6 +1,6 @@
from optar.src.Crawler import Crawler
from optar.src.SiteReader import SiteReader
from optar.src.SiteStoreS3 import SiteStoreS3
from src.Crawler import Crawler
from src.SiteReader import SiteReader
from src.SiteStoreS3 import SiteStoreS3
from src.Watcher import Watcher

if __name__ == "__main__":
prod.py
@@ -1,6 +1,6 @@
from optar.src.Crawler import Crawler
from optar.src.SiteReader import SiteReader
from optar.src.SiteStoreS3 import SiteStoreS3
from src.Crawler import Crawler
from src.SiteReader import SiteReader
from src.SiteStoreS3 import SiteStoreS3
from src.Watcher import Watcher

if __name__ == "__main__":
@@ -2,5 +2,5 @@

This tool crawls all pages on a given website down to the provided depth and finds new pages by comparing the new site tree to the cached one. All new pages are then checked for any of the provided keywords; if there is a match, the page is highlighted for the reader.
The default timeout is 1 h; the list of keywords and sites can be changed while the software is running.

Only static content is retrieved; crawling client-side rendered content is not implemented.
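A minimal sketch of the comparison step described above (the data shapes and the helper name are assumptions for illustration, not the repository's actual interfaces):

# Hypothetical illustration of "compare the new site tree to the cached one, then check keywords".
from deepdiff import DeepDiff

def highlight_new_pages(previous_tree: dict, current_tree: dict, page_texts: dict, keywords: list) -> list:
    # previous_tree / current_tree: site URL -> list of discovered page links
    # page_texts: page URL -> fetched text of that page
    diff = DeepDiff(previous_tree, current_tree, ignore_order=True)
    added_links = diff["iterable_item_added"].values() if "iterable_item_added" in diff else []
    return [url for url in added_links
            if any(kw.lower() in page_texts.get(url, "").lower() for kw in keywords)]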
@@ -1,19 +0,0 @@
import json
import os
from typing import List, Optional


class SiteStore:
    def __init__(self):
        pass

    @staticmethod
    def get_site_history(cache_path) -> Optional[list[str]]:
        if not os.path.isdir(cache_path):
            return None
        return sorted(os.listdir(cache_path))

    @staticmethod
    def get_site_links(path):
        with open(path, 'r') as fp:
            return json.load(fp)
@@ -20,7 +20,7 @@ class SiteStoreS3:
        if "Contents" not in result:
            return None
        # return a sorted list of file names (keys); these are the creation dates. Drop the prefix
        # (len(cache_path)) and skip the first element, which is only the prefix itself.
        return sorted([x["Key"][len(cache_path):] for x in result["Contents"][1:]], reverse=True)
        return sorted([x["Key"][len(cache_path):] for x in result["Contents"]], reverse=True)

    def get_site_links(self, path):
        s3 = boto3.resource('s3')
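For reference, a rough sketch of how such a prefix listing behaves (the bucket name is taken from the constructor call elsewhere in this commit; the prefix value and the exact boto3 call are assumptions):

import boto3

s3 = boto3.client("s3")
result = s3.list_objects_v2(Bucket="optar-dev-cache", Prefix="www.example.com/")
if "Contents" not in result:
    history = None  # nothing has been cached for this site yet
else:
    # keys look like "<prefix><YYYY-MM-DD_HH-MM-SS>.json", so sorting the trimmed names
    # lexicographically also sorts them chronologically
    history = sorted((obj["Key"][len("www.example.com/"):] for obj in result["Contents"]), reverse=True)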
@@ -1,17 +1,13 @@
import time
from datetime import datetime
from typing import List, Dict, Optional
from typing import List, Dict
from deepdiff import DeepDiff

from optar.src.Crawler import Crawler
from optar.src.SiteReader import SiteReader
from optar.src.SiteStoreS3 import SiteStoreS3


class Watcher:
    # there should be type hints for site_store and site_reader, referencing interfaces which these implement, for better autocomplete and DX
    def __init__(self, site_store, site_reader, sites_source_path, keywords_source_path) -> None:
        self.site_store = SiteStoreS3("optar-dev-cache")
        self.site_reader = SiteReader()
        self.site_store = site_store
        self.site_reader = site_reader
        self.keywords_source_path = keywords_source_path
        self.sites_source_path = sites_source_path
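One way to add the interface hints that the comment asks for would be typing.Protocol; this is only a sketch, and the protocol names and method signatures are assumptions inferred from the calls visible in this diff:

from typing import Iterable, Optional, Protocol

class SiteStoreLike(Protocol):
    def get_site_history(self, path: str) -> Optional[list]: ...
    def get_site_links(self, path: str) -> dict: ...
    def persist(self, path: str, data: dict) -> None: ...

class SiteReaderLike(Protocol):
    def get_sites_content_static(self, urls: Iterable[str]): ...

# the constructor could then be annotated as
# def __init__(self, site_store: SiteStoreLike, site_reader: SiteReaderLike, ...) -> None: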
@@ -35,6 +31,8 @@ class Watcher:
        for site in sites:
            crawler.run(site)
            self.site_store.persist(f"{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json", crawler.get_nodes())
            # do NOT overload the target
            time.sleep(1)

        contents = [self.get_new_content(site) for site in sites]
        # TODO: improve handling of None
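A quick illustration of the snapshot key built above (example values only; the scheme-stripping line stands in for remove_protocol, which is not shown in this diff):

from datetime import datetime

site = "https://www.example.com"
host = site.split("://", 1)[1]  # stand-in for self.remove_protocol(site)
key = f"{host}/{datetime(2024, 6, 1, 12, 0, 0).strftime('%Y-%m-%d_%H-%M-%S')}.json"
# -> "www.example.com/2024-06-01_12-00-00.json": one JSON snapshot per crawl, named so that
#    lexicographic order equals chronological order (which the history listing relies on)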
@@ -62,10 +60,11 @@ class Watcher:

        if len(list_of_files) >= 2:
            prev_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-2]}")
            current_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}")
            news = DeepDiff(prev_version, current_version, ignore_order=True)
        else:
            news = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}")
            prev_version = {url: []}
            current_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}")
        news = DeepDiff(prev_version, current_version, ignore_order=True)

        if news:
            sites_contents = self.site_reader.get_sites_content_static(self.get_added_urls(news))
            return sites_contents
@@ -73,7 +72,7 @@ class Watcher:

    @staticmethod
    def get_added_urls(news):
        return [z.split("'")[1] for z in list(news["dictionary_item_added"])]
        return [z.split("'")[1] for z in list(news["iterable_item_added"])]

    @staticmethod
    def search_sites(url, content, keywords: List[str]):
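For context, a small example of the DeepDiff result that get_added_urls parses (the data is illustrative; the real site trees come from the crawler):

from deepdiff import DeepDiff

prev = {"https://example.com": ["https://example.com/a"]}
curr = {"https://example.com": ["https://example.com/a", "https://example.com/b"]}

news = DeepDiff(prev, curr, ignore_order=True)
# news is roughly: {'iterable_item_added': {"root['https://example.com'][1]": 'https://example.com/b'}}
# get_added_urls() iterates these path strings and splits on "'" to pull out the quoted URL.
print([z.split("'")[1] for z in list(news["iterable_item_added"])])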
@@ -0,0 +1,26 @@
import json
import os
from pathlib import Path
from typing import List, Optional


class SiteStore:
    def __init__(self):
        pass

    @staticmethod
    def get_site_history(in_path) -> Optional[list[str]]:
        cache_path = "./cache/" + in_path
        if not os.path.isdir(cache_path):
            return []
        return sorted(os.listdir(cache_path))

    @staticmethod
    def get_site_links(in_path):
        cache_path = "./cache/" + in_path
        with open(cache_path, 'r') as fp:
            return json.load(fp)

    @staticmethod
    def persist(path, data):
        return
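A short usage sketch of this mock against the cache layout the tests assume (the snapshot file name is an example in the spirit of the fixtures, not guaranteed contents of the repo):

# expected layout: ./cache/www.patricematz.de/2024-06-01_12-00-00.json  (one JSON snapshot per crawl)
history = SiteStore.get_site_history("www.patricematz.de")              # e.g. ["2024-06-01_12-00-00.json", ...]
links = SiteStore.get_site_links(f"www.patricematz.de/{history[-1]}")   # parsed JSON of the newest snapshot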
@@ -1,6 +1,7 @@
import os
from optar.src.SiteReader import SiteReader
from optar.src.Watcher import Watcher
from optar.src.SiteStore import SiteStore
from optar.tests.MockSiteStore import SiteStore

def test_search_sites__found():
@@ -31,7 +32,8 @@ def test_compare_sites():
            self._links[url] = [url]
        def get_nodes(self):
            return self._links

    assert os.path.isdir("./cache/www.patricematz.de")
    assert len(SiteStore.get_site_history("www.patricematz.de")) >= 2
    # the links given in this sites.txt should point to either local files or a local mock server;
    # this is not implemented here, as it would be trivial but time-consuming
    watcher = Watcher(SiteStore(), SiteReader(), "./sites.txt", "keywords.txt")
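A possible shape for the local mock server mentioned in the comment above (entirely hypothetical; nothing like this exists in the repository yet):

# serve a directory of static HTML fixtures so sites.txt can point at http://127.0.0.1:8000/...
import threading
from functools import partial
from http.server import HTTPServer, SimpleHTTPRequestHandler

def start_fixture_server(directory="tests/fixtures", port=8000):
    handler = partial(SimpleHTTPRequestHandler, directory=directory)
    server = HTTPServer(("127.0.0.1", port), handler)
    threading.Thread(target=server.serve_forever, daemon=True).start()
    return server  # call server.shutdown() in test teardown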