refactored to use dependency injection to make the code more testable; added some tests

Askill 2024-07-17 19:45:06 +02:00
parent 19c05d4820
commit 413f070304
16 changed files with 81 additions and 20 deletions
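In short, the commit changes Watcher's constructor to accept its site store and site reader, and watch() to accept the crawler, so callers and tests can pass in their own implementations. A minimal before/after sketch of the wiring, taken from the dev.py change below (bucket name and paths as in that file):

    # before: Watcher created its own collaborators internally
    Watcher("./optar/sites.txt", "./optar/keywords.txt").watch()

    # after: store, reader and crawler are injected by the caller
    from optar.src.Crawler import Crawler
    from optar.src.SiteReader import SiteReader
    from optar.src.SiteStoreS3 import SiteStoreS3
    from src.Watcher import Watcher

    store = SiteStoreS3("optar-dev-cache")   # S3-backed cache of crawled site trees
    reader = SiteReader()                    # fetches page content
    Watcher(store, reader, "./optar/sites.txt", "./optar/keywords.txt").watch(crawler=Crawler(1))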

0
__init__.py Normal file

5
dev.py

@@ -1,4 +1,7 @@
from optar.src.Crawler import Crawler
from optar.src.SiteReader import SiteReader
from optar.src.SiteStoreS3 import SiteStoreS3
from src.Watcher import Watcher
if __name__ == "__main__":
Watcher("./optar/sites.txt", "./optar/keywords.txt").watch()
Watcher(SiteStoreS3("optar-dev-cache"), SiteReader(),"./optar/sites.txt", "./optar/keywords.txt").watch(crawler=Crawler(1))

View File

@@ -1,4 +1,7 @@
from optar.src.Crawler import Crawler
from optar.src.SiteReader import SiteReader
from optar.src.SiteStoreS3 import SiteStoreS3
from src.Watcher import Watcher
if __name__ == "__main__":
Watcher("./sites.txt", "./keywords.txt").watch(3600)
Watcher(SiteStoreS3("optar-dev-cache"), SiteReader(),"./sites.txt", "./keywords.txt").watch(crawler=Crawler(1), sleep=3600)

View File

@@ -1,4 +1,7 @@
from optar.src.Crawler import Crawler
from optar.src.SiteReader import SiteReader
from optar.src.SiteStoreS3 import SiteStoreS3
from src.Watcher import Watcher
if __name__ == "__main__":
Watcher("./sites.txt", "./keywords.txt").watch()
Watcher(SiteStoreS3("optar-dev-cache"), SiteReader(), "./sites.txt", "./keywords.txt").watch(crawler=Crawler(1))

View File

@@ -1,5 +1,6 @@
# Optar
This tool crawls all pages on a given website down to the provided depth and finds new pages by comparing the new site tree to the cached one. All new pages are then checked for any of the provided keywords. If there is a match, the page will be highlighted for the reader.
The default interval between runs is 1 h (3600 s); the lists of keywords and sites can be changed while the software is running.
Only static content is retrieved; crawling client-side rendered content is not implemented.
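For illustration, the "comparing the new site tree to the cached one" step boils down to diffing two snapshot dicts like the ones stored under tests/cache further down. A rough sketch with made-up example.com trees, using DeepDiff the same way Watcher does (the key parsing mirrors Watcher.get_added_urls):

    from deepdiff import DeepDiff

    old_tree = {"https://example.com/": [[1, "https://example.com/a"]],
                "https://example.com/a": []}
    new_tree = {"https://example.com/": [[1, "https://example.com/a"], [1, "https://example.com/b"]],
                "https://example.com/a": [],
                "https://example.com/b": []}

    diff = DeepDiff(old_tree, new_tree)
    # added pages show up as keys reported like "root['https://example.com/b']";
    # the URL sits between the single quotes
    new_pages = [entry.split("'")[1] for entry in diff.get("dictionary_item_added", [])]
    # new_pages == ["https://example.com/b"]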

View File

@@ -3,4 +3,5 @@ lxml==5.2.2
requests==2.32.3
trafilatura==1.11.0
beautifulsoup4==4.12.3
boto3==1.34.144
pytest==8.2.2

View File

@@ -8,7 +8,7 @@ from pathlib import Path
class Crawler:
url = "" # the url of the website to be checked
_links = dict() # dic. with all sites and urls on those sites
_links = dict() # dict with all sites and urls on those sites
header_values = {
'Connection:': 'Keep-alive',
'name': 'Michael Foord',
@@ -19,15 +19,16 @@ class Crawler:
exclude = [
]
def __init__(self, logger=None, exclude=None):
def __init__(self, depth=1, logger=None, exclude=None):
if exclude:
self.exclude += exclude
if logger:
self.logger = logger
else:
self.logger = logging.Logger(
name="star_crawler", level=logging.INFO)
name="optar", level=logging.INFO)
self._links = dict()
self._depth = depth
def get_nodes(self):
return self._links
@@ -41,7 +42,7 @@ class Crawler:
with open(path, 'r') as fp:
self._links = json.load(fp)
def run(self, root, limit, sleep_time=0):
def run(self, root, sleep_time=0):
self.url = root
unchecked = [(0, root)]
@@ -72,7 +73,7 @@ class Crawler:
n_links = []
for link in _links:
if link not in n_links and level < limit:
if link not in n_links and level < self._depth:
if link.startswith("http"):
n_links.append((level+1, link))
else:
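Net effect of the Crawler changes: the crawl depth now lives on the instance (depth=1 by default) instead of being passed to run() as limit. A small sketch of the new call pattern (the URL is a placeholder; the node format matches the cached snapshots further down):

    from optar.src.Crawler import Crawler

    crawler = Crawler(depth=2)            # follow links up to two levels below the root
    crawler.run("https://example.com/")   # no limit argument any more; sleep_time defaults to 0
    tree = crawler.get_nodes()            # dict mapping each page URL to the (level, link) pairs found on it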

View File

@@ -20,7 +20,7 @@ class SiteStoreS3:
if "Contents" not in result:
return None
# return a sorted list of file names (keys), which are the creation dates; strip the prefix (len(cache_path)) and skip the first element, which is only the prefix itself
return sorted([x["Key"][len(cache_path) :] for x in result["Contents"][1:]])
return sorted([x["Key"][len(cache_path) :] for x in result["Contents"][1:]], reverse=True)
def get_site_links(self, path):
s3 = boto3.resource('s3')
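The only functional change here is reverse=True: the newest snapshot name now comes first in the returned list. For illustration, with plain strings standing in for the S3 "Contents" entries (hypothetical keys under a cache prefix):

    cache_path = "www.example.com/"
    keys = ["www.example.com/",                             # the prefix entry itself, skipped via [1:]
            "www.example.com/2024-07-15_16-30-47.json",
            "www.example.com/2024-07-16_16-30-47.json"]
    names = sorted([k[len(cache_path):] for k in keys[1:]], reverse=True)
    # names == ["2024-07-16_16-30-47.json", "2024-07-15_16-30-47.json"]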

View File

@ -3,13 +3,13 @@ from datetime import datetime
from typing import List, Dict, Optional
from deepdiff import DeepDiff
from src.Crawler import Crawler
from src.SiteReader import SiteReader
from src.SiteStoreS3 import SiteStoreS3
from optar.src.Crawler import Crawler
from optar.src.SiteReader import SiteReader
from optar.src.SiteStoreS3 import SiteStoreS3
class Watcher:
def __init__(self, sites_source_path, keywords_source_path) -> None:
def __init__(self, site_store, site_reader, sites_source_path, keywords_source_path) -> None:
self.site_store = SiteStoreS3("optar-dev-cache")
self.site_reader = SiteReader()
self.keywords_source_path = keywords_source_path
@@ -19,7 +19,7 @@ class Watcher:
with open(path) as f:
return f.read().splitlines()
def watch(self, sleep=-1):
def watch(self, crawler, sleep=-1):
"""start the watcher with the given interval
:param sleep: seconds between runs, -1 for a single run
@@ -33,8 +33,7 @@ class Watcher:
for site in sites:
crawler = Crawler()
crawler.run(site, 1)
crawler.run(site)
self.site_store.persist(f"{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json", crawler.get_nodes())
contents = [self.get_new_content(site) for site in sites]
@ -47,11 +46,14 @@ class Watcher:
print(matches)
if sleep == -1:
return
return matches
time.sleep(sleep)
@staticmethod
def remove_protocol(site):
# every protocol should have //
if "//" not in site:
return site
return site.split('/')[2]
def get_new_content(self, url) -> Dict[str, str]:
@ -65,10 +67,14 @@ class Watcher:
else:
news = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}")
if news:
sites_contents = self.site_reader.get_sites_content_static([z.split("'")[1] for z in list(news["dictionary_item_added"])])
sites_contents = self.site_reader.get_sites_content_static(self.get_added_urls(news))
return sites_contents
return {}
@staticmethod
def get_added_urls( news):
return [z.split("'")[1] for z in list(news["dictionary_item_added"])]
@staticmethod
def search_sites(url, content, keywords: List[str]):
if content is None:
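Two smaller behavioural changes in this file are easy to miss: watch() now returns the matches after a single run (so tests can assert on the result), and remove_protocol() tolerates inputs without a protocol prefix. Illustrative calls, mirroring the tests below (URLs are placeholders):

    Watcher.remove_protocol("https://www.google.com")   # -> "www.google.com"
    Watcher.remove_protocol("www.google.com")            # -> "www.google.com" (no "//", returned unchanged)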

0
src/__init__.py Normal file

0
tests/__init__.py Normal file

1
tests/cache/2024-07-15_16-30-47.json vendored Normal file

@@ -0,0 +1 @@
{"https://www.patricematz.de/": [[1, "https://www.patricematz.de/Projects"], [1, "https://www.patricematz.de/Links"]], "https://www.patricematz.de/Links": [], "https://www.patricematz.de/Projects": [], "https://www.patricematz.de/CV": []}

1
tests/cache/2024-07-16_16-30-47.json vendored Normal file

@@ -0,0 +1 @@
{"https://www.patricematz.de/": [[1, "https://www.patricematz.de/"], [1, "https://www.patricematz.de/CV"], [1, "https://www.patricematz.de/Projects"], [1, "https://www.patricematz.de/Links"]], "https://www.patricematz.de/Links": [], "https://www.patricematz.de/Projects": [], "https://www.patricematz.de/CV": []}

1
tests/keywords.txt Normal file

@@ -0,0 +1 @@
Consultant

1
tests/sites.txt Normal file

@@ -0,0 +1 @@
https://www.patricematz.de/

39
tests/watcher_test.py Normal file

@@ -0,0 +1,39 @@
from optar.src.SiteReader import SiteReader
from optar.src.Watcher import Watcher
from optar.src.SiteStore import SiteStore
def test_search_sites__found():
    x = Watcher.search_sites("test.com", "dfjgbnsdigubsdofgliusdbgsdiugbTESTfjgnsdgosd\n\nsdfboiuasdgf!0980", ["TEST"])
    assert x == [("test.com", "TEST")]

def test_search_sites__not_found():
    x = Watcher.search_sites("test.com", "dfjgbnsdigubsdofgliusdbgsdiugbfjgnsdgosd\n\nsdfboiuasdgf!0980", ["TEST", "testing"])
    assert x == []

def test_remove_protocol__https():
    res = Watcher.remove_protocol("https://www.google.com")
    assert res == "www.google.com"

def test_remove_protocol__http():
    res = Watcher.remove_protocol("http://www.google.com")
    assert res == "www.google.com"

def test_remove_protocol__none():
    res = Watcher.remove_protocol("www.google.com")
    assert res == "www.google.com"

def test_compare_sites():
    class MockCrawler:
        _links = {}

        def run(self, url):
            self._links[url] = [url]

        def get_nodes(self):
            return self._links

    # the links given in this sites.txt should point to either local files or a local mock server;
    # this is not implemented, as it would be trivial but time-consuming
    watcher = Watcher(SiteStore(), SiteReader(), "./sites.txt", "keywords.txt")
    assert [] == watcher.watch(MockCrawler())