From 413f0703045b919727e77258cfe6b7e72d894198 Mon Sep 17 00:00:00 2001
From: Askill
Date: Wed, 17 Jul 2024 19:45:06 +0200
Subject: [PATCH] refactored to utilize dependency injection to make code more
 testable, added some tests

---
 __init__.py                          |  0
 dev.py                               |  7 +++++--
 main.py                              |  7 +++++--
 prod.py                              |  7 +++++--
 readme.md                            |  3 ++-
 requirements.txt                     |  3 ++-
 src/Crawler.py                       | 12 ++++++-----
 src/SiteStoreS3.py                   |  4 ++--
 src/Watcher.py                       | 31 ++++++++++++++++-----------
 src/__init__.py                      |  0
 tests/__init__.py                    |  0
 tests/cache/2024-07-15_16-30-47.json |  1 +
 tests/cache/2024-07-16_16-30-47.json |  1 +
 tests/keywords.txt                   |  1 +
 tests/sites.txt                      |  1 +
 tests/watcher_test.py                | 39 ++++++++++++++++++++++++++++
 16 files changed, 90 insertions(+), 27 deletions(-)
 create mode 100644 __init__.py
 create mode 100644 src/__init__.py
 create mode 100644 tests/__init__.py
 create mode 100644 tests/cache/2024-07-15_16-30-47.json
 create mode 100644 tests/cache/2024-07-16_16-30-47.json
 create mode 100644 tests/keywords.txt
 create mode 100644 tests/sites.txt
 create mode 100644 tests/watcher_test.py

diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/dev.py b/dev.py
index e57f248..6e7887f 100644
--- a/dev.py
+++ b/dev.py
@@ -1,4 +1,7 @@
-from src.Watcher import Watcher
+from optar.src.Crawler import Crawler
+from optar.src.SiteReader import SiteReader
+from optar.src.SiteStoreS3 import SiteStoreS3
+from optar.src.Watcher import Watcher
 
 if __name__ == "__main__":
-    Watcher("./optar/sites.txt", "./optar/keywords.txt").watch()
\ No newline at end of file
+    Watcher(SiteStoreS3("optar-dev-cache"), SiteReader(), "./optar/sites.txt", "./optar/keywords.txt").watch(crawler=Crawler(1))
\ No newline at end of file
diff --git a/main.py b/main.py
index c208853..78cfa9c 100644
--- a/main.py
+++ b/main.py
@@ -1,4 +1,7 @@
-from src.Watcher import Watcher
+from optar.src.Crawler import Crawler
+from optar.src.SiteReader import SiteReader
+from optar.src.SiteStoreS3 import SiteStoreS3
+from optar.src.Watcher import Watcher
 
 if __name__ == "__main__":
-    Watcher("./sites.txt", "./keywords.txt").watch(3600)
\ No newline at end of file
+    Watcher(SiteStoreS3("optar-dev-cache"), SiteReader(), "./sites.txt", "./keywords.txt").watch(crawler=Crawler(1), sleep=3600)
\ No newline at end of file
diff --git a/prod.py b/prod.py
index 688b247..6db38e0 100644
--- a/prod.py
+++ b/prod.py
@@ -1,4 +1,7 @@
-from src.Watcher import Watcher
+from optar.src.Crawler import Crawler
+from optar.src.SiteReader import SiteReader
+from optar.src.SiteStoreS3 import SiteStoreS3
+from optar.src.Watcher import Watcher
 
 if __name__ == "__main__":
-    Watcher("./sites.txt", "./keywords.txt").watch()
\ No newline at end of file
+    Watcher(SiteStoreS3("optar-dev-cache"), SiteReader(), "./sites.txt", "./keywords.txt").watch(crawler=Crawler(1))
\ No newline at end of file
diff --git a/readme.md b/readme.md
index 02635e9..4010f78 100644
--- a/readme.md
+++ b/readme.md
@@ -1,5 +1,6 @@
 # Optar
-This tool crawles all pages on a given website to the provided deapth and finds new pages by comparing the new site tree to the cached one. All new pages are then checked for containing any of the provided keywords. If there is a match the page will be higlighted for the reader.
+This tool crawls all pages on a given website down to the configured depth and finds new pages by comparing the fresh site tree to the cached one. All new pages are then checked for any of the provided keywords. If there is a match, the page is highlighted for the reader. The default interval is 1 h; the lists of keywords and sites can be changed while the software is running.
+Only static content is retrieved; crawling client-side rendered content is not implemented.
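
For reference, the three entry points above differ only in what they inject. A composition like the following (the bucket name and input paths are placeholders, not values from this patch) is all that is needed to run a deeper, hourly crawl:

```python
from optar.src.Crawler import Crawler
from optar.src.SiteReader import SiteReader
from optar.src.SiteStoreS3 import SiteStoreS3
from optar.src.Watcher import Watcher

if __name__ == "__main__":
    # Wire the object graph by hand: the store, the reader, and the input
    # files are all injected, so swapping any of them needs no code changes.
    watcher = Watcher(
        SiteStoreS3("my-cache-bucket"),  # placeholder bucket name
        SiteReader(),
        "./sites.txt",
        "./keywords.txt",
    )
    # depth=2 follows links two levels deep; sleep=3600 re-runs hourly.
    watcher.watch(crawler=Crawler(depth=2), sleep=3600)
```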
diff --git a/requirements.txt b/requirements.txt
index d1c12ed..84cc3ca 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,4 +3,5 @@ lxml==5.2.2
 requests==2.32.3
 trafilatura==1.11.0
 beautifulsoup4==4.12.3
-boto3==1.34.144
\ No newline at end of file
+boto3==1.34.144
+pytest==8.2.2
\ No newline at end of file
diff --git a/src/Crawler.py b/src/Crawler.py
index bcd4c1d..63a7ff9 100644
--- a/src/Crawler.py
+++ b/src/Crawler.py
@@ -8,7 +8,7 @@ from pathlib import Path
 
 class Crawler:
     url = "" # the url of the website to be checked
-    _links = dict() # dic. with all sites and urls on those sites
+    _links = dict() # dict with all sites and the urls found on those sites
     header_values = {
         'Connection:': 'Keep-alive',
         'name': 'Michael Foord',
@@ -19,15 +19,16 @@
     exclude = [
     ]
 
-    def __init__(self, logger=None, exclude=None):
+    def __init__(self, depth=1, logger=None, exclude=None):
         if exclude:
             self.exclude += exclude
         if logger:
             self.logger = logger
         else:
             self.logger = logging.Logger(
-                name="star_crawler", level=logging.INFO)
+                name="optar", level=logging.INFO)
         self._links = dict()
+        self._depth = depth
 
     def get_nodes(self):
         return self._links
@@ -41,7 +42,8 @@
         with open(path, 'r') as fp:
             self._links = json.load(fp)
 
-    def run(self, root, limit, sleep_time=0):
+    def run(self, root, sleep_time=0):
+        self._links = dict()  # reset so one injected crawler can be reused across sites
         self.url = root
 
         unchecked = [(0, root)]
@@ -72,7 +74,7 @@
 
             n_links = []
             for link in _links:
-                if link not in n_links and level < limit:
+                if link not in n_links and level < self._depth:
                     if link.startswith("http"):
                         n_links.append((level+1, link))
                     else:
diff --git a/src/SiteStoreS3.py b/src/SiteStoreS3.py
index 93f6f6c..6a05f56 100644
--- a/src/SiteStoreS3.py
+++ b/src/SiteStoreS3.py
@@ -20,7 +20,7 @@
         if "Contents"not in result:
             return None
-        # return a sorted list of file names (key), which are the creation dates, ignore the prefix (len(cache_path)), ignore the first element, as this is only the prefix
-        return sorted([x["Key"][len(cache_path) :] for x in result["Contents"][1:]])
+        # return the file names (keys, which are creation timestamps) newest first; strip the prefix (len(cache_path)) and skip the first Contents entry, which is the prefix itself
+        return sorted([x["Key"][len(cache_path) :] for x in result["Contents"][1:]], reverse=True)
 
     def get_site_links(self, path):
         s3 = boto3.resource('s3')
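
A note on the `reverse=True` change above: the cache keys are `%Y-%m-%d_%H-%M-%S` timestamps (see the `persist` call in `Watcher` below), and zero-padded timestamps sort lexicographically in chronological order, so the store now returns the newest snapshot first. A quick sanity check with the snapshot names used in the test fixtures:

```python
# Zero-padded "%Y-%m-%d_%H-%M-%S" names sort lexicographically in
# chronological order, so reverse=True puts the newest snapshot first.
snapshots = ["2024-07-15_16-30-47.json", "2024-07-16_16-30-47.json"]
print(sorted(snapshots, reverse=True))
# ['2024-07-16_16-30-47.json', '2024-07-15_16-30-47.json']
```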
diff --git a/src/Watcher.py b/src/Watcher.py
index b54dcfd..e6be13e 100644
--- a/src/Watcher.py
+++ b/src/Watcher.py
@@ -3,13 +3,13 @@ from datetime import datetime
 from typing import List, Dict, Optional
 from deepdiff import DeepDiff
 
-from src.Crawler import Crawler
-from src.SiteReader import SiteReader
-from src.SiteStoreS3 import SiteStoreS3
+from optar.src.Crawler import Crawler
+from optar.src.SiteReader import SiteReader
+from optar.src.SiteStoreS3 import SiteStoreS3
 
 
 class Watcher:
-    def __init__(self, sites_source_path, keywords_source_path) -> None:
-        self.site_store = SiteStoreS3("optar-dev-cache")
-        self.site_reader = SiteReader()
+    def __init__(self, site_store, site_reader, sites_source_path, keywords_source_path) -> None:
+        self.site_store = site_store
+        self.site_reader = site_reader
         self.keywords_source_path = keywords_source_path
@@ -19,7 +19,8 @@
         with open(path) as f:
             return f.read().splitlines()
 
-    def watch(self, sleep=-1):
+    def watch(self, crawler, sleep=-1):
         """start the watcher with the given interval
 
-        :param arg: seconds between runs, -1 for single run
+        :param crawler: crawler used to fetch each site's link tree
+        :param sleep: seconds between runs, -1 for a single run
@@ -33,8 +34,7 @@
 
 
         for site in sites:
-            crawler = Crawler()
-            crawler.run(site, 1)
+            crawler.run(site)
            self.site_store.persist(f"{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json", crawler.get_nodes())
 
         contents = [self.get_new_content(site) for site in sites]
@@ -47,11 +47,14 @@
 
             print(matches)
 
             if sleep == -1:
-                return
+                return matches
             time.sleep(sleep)
 
 
     @staticmethod
     def remove_protocol(site):
+        # a URL with a protocol contains "//"; otherwise return the input unchanged
+        if "//" not in site:
+            return site
         return site.split('/')[2]
@@ -65,10 +68,14 @@
         else:
             news = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}")
         if news:
-            sites_contents = self.site_reader.get_sites_content_static([z.split("'")[1] for z in list(news["dictionary_item_added"])])
+            sites_contents = self.site_reader.get_sites_content_static(self.get_added_urls(news))
             return sites_contents
         return {}
 
+    @staticmethod
+    def get_added_urls(news):
+        return [z.split("'")[1] for z in news["dictionary_item_added"]]
+
     @staticmethod
     def search_sites(url, content, keywords: List[str]):
         if content is None:
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/cache/2024-07-15_16-30-47.json b/tests/cache/2024-07-15_16-30-47.json
new file mode 100644
index 0000000..596bd0e
--- /dev/null
+++ b/tests/cache/2024-07-15_16-30-47.json
@@ -0,0 +1 @@
+{"https://www.patricematz.de/": [[1, "https://www.patricematz.de/Projects"], [1, "https://www.patricematz.de/Links"]], "https://www.patricematz.de/Links": [], "https://www.patricematz.de/Projects": [], "https://www.patricematz.de/CV": []}
\ No newline at end of file
diff --git a/tests/cache/2024-07-16_16-30-47.json b/tests/cache/2024-07-16_16-30-47.json
new file mode 100644
index 0000000..37f41bc
--- /dev/null
+++ b/tests/cache/2024-07-16_16-30-47.json
@@ -0,0 +1 @@
+{"https://www.patricematz.de/": [[1, "https://www.patricematz.de/"], [1, "https://www.patricematz.de/CV"], [1, "https://www.patricematz.de/Projects"], [1, "https://www.patricematz.de/Links"]], "https://www.patricematz.de/Links": [], "https://www.patricematz.de/Projects": [], "https://www.patricematz.de/CV": []}
\ No newline at end of file
diff --git a/tests/keywords.txt b/tests/keywords.txt
new file mode 100644
index 0000000..553b856
--- /dev/null
+++ b/tests/keywords.txt
@@ -0,0 +1 @@
+Consultant
\ No newline at end of file
diff --git a/tests/sites.txt b/tests/sites.txt
new file mode 100644
index 0000000..6b14489
--- /dev/null
+++ b/tests/sites.txt
@@ -0,0 +1 @@
+https://www.patricematz.de/
\ No newline at end of file
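
For context, `get_new_content` and the new `get_added_urls` helper lean on `DeepDiff`'s `dictionary_item_added` report, whose entries are path strings like `root['https://…']`. A minimal sketch of that mechanism, with made-up URLs rather than the fixture data above:

```python
from deepdiff import DeepDiff

# Two site-tree snapshots; the new one contains one extra page key.
old = {"https://example.com/": [[1, "https://example.com/a"]],
       "https://example.com/a": []}
new = {"https://example.com/": [[1, "https://example.com/a"],
                                [1, "https://example.com/b"]],
       "https://example.com/a": [],
       "https://example.com/b": []}

diff = DeepDiff(old, new)
# Added keys are reported as strings like "root['https://example.com/b']";
# splitting on the single quote extracts the URL, as get_added_urls does.
added = [entry.split("'")[1] for entry in diff["dictionary_item_added"]]
print(added)  # ['https://example.com/b']
```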
diff --git a/tests/watcher_test.py b/tests/watcher_test.py
new file mode 100644
index 0000000..047ec41
--- /dev/null
+++ b/tests/watcher_test.py
@@ -0,0 +1,39 @@
+from optar.src.SiteReader import SiteReader
+from optar.src.Watcher import Watcher
+from optar.src.SiteStore import SiteStore
+
+
+def test_search_sites__found():
+    x = Watcher.search_sites("test.com", "dfjgbnsdigubsdofgliusdbgsdiugbTESTfjgnsdgosd\n\nsdfboiuasdgf!0980", ["TEST"])
+    assert x == [("test.com", "TEST")]
+
+def test_search_sites__not_found():
+    x = Watcher.search_sites("test.com", "dfjgbnsdigubsdofgliusdbgsdiugbfjgnsdgosd\n\nsdfboiuasdgf!0980", ["TEST", "testing"])
+    assert x == []
+
+def test_remove_protocol__https():
+    res = Watcher.remove_protocol("https://www.google.com")
+    assert res == "www.google.com"
+
+def test_remove_protocol__http():
+    res = Watcher.remove_protocol("http://www.google.com")
+    assert res == "www.google.com"
+
+def test_remove_protocol__none():
+    res = Watcher.remove_protocol("www.google.com")
+    assert res == "www.google.com"
+
+def test_compare_sites():
+    class MockCrawler:
+        _links = {}
+        def run(self, url):
+            self._links[url] = [url]
+        def get_nodes(self):
+            return self._links
+
+    # the links in this sites.txt should point to local files or a local mock
+    # server; that is not implemented here, as it would be straightforward
+    # but time-consuming
+    watcher = Watcher(SiteStore(), SiteReader(), "./sites.txt", "keywords.txt")
+    assert [] == watcher.watch(MockCrawler())
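
Note that the test imports `SiteStore` from `src/SiteStore.py`, which is not included in this patch. If that module does not exist yet, a stand-in exposing the methods `Watcher` actually calls is enough for the test to run; the sketch below assumes the `persist`/`get_file_names`/`get_site_links` interface implied by `SiteStoreS3` and is not part of this patch:

```python
# Hypothetical in-memory stand-in for the store; method names are assumed
# from how Watcher and SiteStoreS3 use them, and are not part of this patch.
class InMemorySiteStore:
    def __init__(self):
        self._files = {}  # path -> crawled node dict

    def persist(self, path, nodes):
        self._files[path] = nodes

    def get_file_names(self, cache_path):
        # Mirror SiteStoreS3: newest snapshot name first, prefix stripped.
        names = [p[len(cache_path):] for p in self._files if p.startswith(cache_path)]
        return sorted(names, reverse=True) or None

    def get_site_links(self, path):
        return self._files.get(path)
```

Since `Watcher.__init__` stores the injected dependencies, the suite runs without touching S3: `pytest tests/` from the directory containing the `optar` package.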