From 413f0703045b919727e77258cfe6b7e72d894198 Mon Sep 17 00:00:00 2001
From: Askill
Date: Wed, 17 Jul 2024 19:45:06 +0200
Subject: [PATCH] refactored to utilize dependency injection to make code more
 testable, added some tests

---
 __init__.py                          |  0
 dev.py                               |  7 +++++--
 main.py                              |  7 +++++--
 prod.py                              |  7 +++++--
 readme.md                            |  3 ++-
 requirements.txt                     |  3 ++-
 src/Crawler.py                       | 12 ++++++-----
 src/SiteStoreS3.py                   |  4 ++--
 src/Watcher.py                       | 31 ++++++++++++++++-----------
 src/__init__.py                      |  0
 tests/__init__.py                    |  0
 tests/cache/2024-07-15_16-30-47.json |  1 +
 tests/cache/2024-07-16_16-30-47.json |  1 +
 tests/keywords.txt                   |  1 +
 tests/sites.txt                      |  1 +
 tests/watcher_test.py                | 39 ++++++++++++++++++++++++++++
 16 files changed, 90 insertions(+), 27 deletions(-)
 create mode 100644 __init__.py
 create mode 100644 src/__init__.py
 create mode 100644 tests/__init__.py
 create mode 100644 tests/cache/2024-07-15_16-30-47.json
 create mode 100644 tests/cache/2024-07-16_16-30-47.json
 create mode 100644 tests/keywords.txt
 create mode 100644 tests/sites.txt
 create mode 100644 tests/watcher_test.py

diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/dev.py b/dev.py
index e57f248..6e7887f 100644
--- a/dev.py
+++ b/dev.py
@@ -1,4 +1,7 @@
-from src.Watcher import Watcher
+from optar.src.Crawler import Crawler
+from optar.src.SiteReader import SiteReader
+from optar.src.SiteStoreS3 import SiteStoreS3
+from optar.src.Watcher import Watcher
 
 if __name__ == "__main__":
-    Watcher("./optar/sites.txt", "./optar/keywords.txt").watch()
\ No newline at end of file
+    Watcher(SiteStoreS3("optar-dev-cache"), SiteReader(), "./optar/sites.txt", "./optar/keywords.txt").watch(crawler=Crawler(1))
\ No newline at end of file
diff --git a/main.py b/main.py
index c208853..78cfa9c 100644
--- a/main.py
+++ b/main.py
@@ -1,4 +1,7 @@
-from src.Watcher import Watcher
+from optar.src.Crawler import Crawler
+from optar.src.SiteReader import SiteReader
+from optar.src.SiteStoreS3 import SiteStoreS3
+from optar.src.Watcher import Watcher
 
 if __name__ == "__main__":
-    Watcher("./sites.txt", "./keywords.txt").watch(3600)
\ No newline at end of file
+    Watcher(SiteStoreS3("optar-dev-cache"), SiteReader(), "./sites.txt", "./keywords.txt").watch(crawler=Crawler(1), sleep=3600)
\ No newline at end of file
diff --git a/prod.py b/prod.py
index 688b247..6db38e0 100644
--- a/prod.py
+++ b/prod.py
@@ -1,4 +1,7 @@
-from src.Watcher import Watcher
+from optar.src.Crawler import Crawler
+from optar.src.SiteReader import SiteReader
+from optar.src.SiteStoreS3 import SiteStoreS3
+from optar.src.Watcher import Watcher
 
 if __name__ == "__main__":
-    Watcher("./sites.txt", "./keywords.txt").watch()
\ No newline at end of file
+    Watcher(SiteStoreS3("optar-dev-cache"), SiteReader(), "./sites.txt", "./keywords.txt").watch(crawler=Crawler(1))
\ No newline at end of file
diff --git a/readme.md b/readme.md
index 02635e9..4010f78 100644
--- a/readme.md
+++ b/readme.md
@@ -1,5 +1,6 @@
 # Optar
-This tool crawles all pages on a given website to the provided deapth and finds new pages by comparing the new site tree to the cached one. All new pages are then checked for containing any of the provided keywords. If there is a match the page will be higlighted for the reader.
+This tool crawls all pages on a given website down to the configured depth and finds new pages by comparing the fresh site tree to the cached one. All new pages are then checked for any of the provided keywords. If there is a match, the page is highlighted for the reader. The default interval is 1 h; the lists of keywords and sites can be changed while the software is running.
+Only static content is retrieved; crawling client-side rendered content is not implemented.
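
For reference, the three entry points above differ only in what they inject. A composition like the following (the bucket name and input paths are placeholders, not values from this patch) is all that is needed to run a deeper, hourly crawl:

```python
from optar.src.Crawler import Crawler
from optar.src.SiteReader import SiteReader
from optar.src.SiteStoreS3 import SiteStoreS3
from optar.src.Watcher import Watcher

if __name__ == "__main__":
    # Wire the object graph by hand: the store, the reader, and the input
    # files are all injected, so swapping any of them needs no code changes.
    watcher = Watcher(
        SiteStoreS3("my-cache-bucket"),  # placeholder bucket name
        SiteReader(),
        "./sites.txt",
        "./keywords.txt",
    )
    # depth=2 follows links two levels deep; sleep=3600 re-runs hourly.
    watcher.watch(crawler=Crawler(depth=2), sleep=3600)
```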
diff --git a/requirements.txt b/requirements.txt
index d1c12ed..84cc3ca 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,4 +3,5 @@ lxml==5.2.2
 requests==2.32.3
 trafilatura==1.11.0
 beautifulsoup4==4.12.3
-boto3==1.34.144
\ No newline at end of file
+boto3==1.34.144
+pytest==8.2.2
\ No newline at end of file
diff --git a/src/Crawler.py b/src/Crawler.py
index bcd4c1d..63a7ff9 100644
--- a/src/Crawler.py
+++ b/src/Crawler.py
@@ -8,7 +8,7 @@ from pathlib import Path
 
 class Crawler:
     url = "" # the url of the website to be checked
-    _links = dict() # dic. with all sites and urls on those sites
+    _links = dict() # dict with all sites and the urls found on those sites
     header_values = {
         'Connection:': 'Keep-alive',
         'name': 'Michael Foord',
@@ -19,15 +19,16 @@
     exclude = [
     ]
 
-    def __init__(self, logger=None, exclude=None):
+    def __init__(self, depth=1, logger=None, exclude=None):
         if exclude:
             self.exclude += exclude
         if logger:
             self.logger = logger
         else:
             self.logger = logging.Logger(
-                name="star_crawler", level=logging.INFO)
+                name="optar", level=logging.INFO)
         self._links = dict()
+        self._depth = depth
 
     def get_nodes(self):
         return self._links
@@ -41,7 +42,8 @@
         with open(path, 'r') as fp:
             self._links = json.load(fp)
 
-    def run(self, root, limit, sleep_time=0):
+    def run(self, root, sleep_time=0):
+        self._links = dict()  # reset so one injected crawler can be reused across sites
         self.url = root
 
         unchecked = [(0, root)]
@@ -72,7 +74,7 @@
 
             n_links = []
             for link in _links:
-                if link not in n_links and level < limit:
+                if link not in n_links and level < self._depth:
                     if link.startswith("http"):
                         n_links.append((level+1, link))
                     else:
diff --git a/src/SiteStoreS3.py b/src/SiteStoreS3.py
index 93f6f6c..6a05f56 100644
--- a/src/SiteStoreS3.py
+++ b/src/SiteStoreS3.py
@@ -20,7 +20,7 @@
         if "Contents"not in result:
             return None
-        # return a sorted list of file names (key), which are the creation dates, ignore the prefix (len(cache_path)), ignore the first element, as this is only the prefix
-        return sorted([x["Key"][len(cache_path) :] for x in result["Contents"][1:]])
+        # return the file names (keys, which are creation timestamps) newest first; strip the prefix (len(cache_path)) and skip the first Contents entry, which is the prefix itself
+        return sorted([x["Key"][len(cache_path) :] for x in result["Contents"][1:]], reverse=True)
 
     def get_site_links(self, path):
         s3 = boto3.resource('s3')
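
A note on the `reverse=True` change above: the cache keys are `%Y-%m-%d_%H-%M-%S` timestamps (see the `persist` call in `Watcher` below), and zero-padded timestamps sort lexicographically in chronological order, so the store now returns the newest snapshot first. A quick sanity check with the snapshot names used in the test fixtures:

```python
# Zero-padded "%Y-%m-%d_%H-%M-%S" names sort lexicographically in
# chronological order, so reverse=True puts the newest snapshot first.
snapshots = ["2024-07-15_16-30-47.json", "2024-07-16_16-30-47.json"]
print(sorted(snapshots, reverse=True))
# ['2024-07-16_16-30-47.json', '2024-07-15_16-30-47.json']
```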
diff --git a/src/Watcher.py b/src/Watcher.py
index b54dcfd..e6be13e 100644
--- a/src/Watcher.py
+++ b/src/Watcher.py
@@ -3,13 +3,13 @@ from datetime import datetime
 from typing import List, Dict, Optional
 from deepdiff import DeepDiff
 
-from src.Crawler import Crawler
-from src.SiteReader import SiteReader
-from src.SiteStoreS3 import SiteStoreS3
+from optar.src.Crawler import Crawler
+from optar.src.SiteReader import SiteReader
+from optar.src.SiteStoreS3 import SiteStoreS3
 
 
 class Watcher:
-    def __init__(self, sites_source_path, keywords_source_path) -> None:
-        self.site_store = SiteStoreS3("optar-dev-cache")
-        self.site_reader = SiteReader()
+    def __init__(self, site_store, site_reader, sites_source_path, keywords_source_path) -> None:
+        self.site_store = site_store
+        self.site_reader = site_reader
         self.keywords_source_path = keywords_source_path
@@ -19,7 +19,8 @@
         with open(path) as f:
             return f.read().splitlines()
 
-    def watch(self, sleep=-1):
+    def watch(self, crawler, sleep=-1):
         """start the watcher with the given interval
 
-        :param arg: seconds between runs, -1 for single run
+        :param crawler: crawler used to fetch each site's link tree
+        :param sleep: seconds between runs, -1 for a single run
@@ -33,8 +34,7 @@
 
 
         for site in sites:
-            crawler = Crawler()
-            crawler.run(site, 1)
+            crawler.run(site)
            self.site_store.persist(f"{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json", crawler.get_nodes())
 
         contents = [self.get_new_content(site) for site in sites]
@@ -47,11 +47,14 @@
 
             print(matches)
 
             if sleep == -1:
-                return
+                return matches
             time.sleep(sleep)
 
 
     @staticmethod
     def remove_protocol(site):
+        # a URL with a protocol contains "//"; otherwise return the input unchanged
+        if "//" not in site:
+            return site
         return site.split('/')[2]
@@ -65,10 +68,14 @@
         else:
             news = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}")
         if news:
-            sites_contents = self.site_reader.get_sites_content_static([z.split("'")[1] for z in list(news["dictionary_item_added"])])
+            sites_contents = self.site_reader.get_sites_content_static(self.get_added_urls(news))
             return sites_contents
         return {}
 
+    @staticmethod
+    def get_added_urls(news):
+        return [z.split("'")[1] for z in news["dictionary_item_added"]]
+
     @staticmethod
     def search_sites(url, content, keywords: List[str]):
         if content is None:
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/cache/2024-07-15_16-30-47.json b/tests/cache/2024-07-15_16-30-47.json
new file mode 100644
index 0000000..596bd0e
--- /dev/null
+++ b/tests/cache/2024-07-15_16-30-47.json
@@ -0,0 +1 @@
+{"https://www.patricematz.de/": [[1, "https://www.patricematz.de/Projects"], [1, "https://www.patricematz.de/Links"]], "https://www.patricematz.de/Links": [], "https://www.patricematz.de/Projects": [], "https://www.patricematz.de/CV": []}
\ No newline at end of file
diff --git a/tests/cache/2024-07-16_16-30-47.json b/tests/cache/2024-07-16_16-30-47.json
new file mode 100644
index 0000000..37f41bc
--- /dev/null
+++ b/tests/cache/2024-07-16_16-30-47.json
@@ -0,0 +1 @@
+{"https://www.patricematz.de/": [[1, "https://www.patricematz.de/"], [1, "https://www.patricematz.de/CV"], [1, "https://www.patricematz.de/Projects"], [1, "https://www.patricematz.de/Links"]], "https://www.patricematz.de/Links": [], "https://www.patricematz.de/Projects": [], "https://www.patricematz.de/CV": []}
\ No newline at end of file
diff --git a/tests/keywords.txt b/tests/keywords.txt
new file mode 100644
index 0000000..553b856
--- /dev/null
+++ b/tests/keywords.txt
@@ -0,0 +1 @@
+Consultant
\ No newline at end of file
diff --git a/tests/sites.txt b/tests/sites.txt
new file mode 100644
index 0000000..6b14489
--- /dev/null
+++ b/tests/sites.txt
@@ -0,0 +1 @@
+https://www.patricematz.de/
\ No newline at end of file
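
For context, `get_new_content` and the new `get_added_urls` helper lean on `DeepDiff`'s `dictionary_item_added` report, whose entries are path strings like `root['https://…']`. A minimal sketch of that mechanism, with made-up URLs rather than the fixture data above:

```python
from deepdiff import DeepDiff

# Two site-tree snapshots; the new one contains one extra page key.
old = {"https://example.com/": [[1, "https://example.com/a"]],
       "https://example.com/a": []}
new = {"https://example.com/": [[1, "https://example.com/a"],
                                [1, "https://example.com/b"]],
       "https://example.com/a": [],
       "https://example.com/b": []}

diff = DeepDiff(old, new)
# Added keys are reported as strings like "root['https://example.com/b']";
# splitting on the single quote extracts the URL, as get_added_urls does.
added = [entry.split("'")[1] for entry in diff["dictionary_item_added"]]
print(added)  # ['https://example.com/b']
```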
diff --git a/tests/watcher_test.py b/tests/watcher_test.py
new file mode 100644
index 0000000..047ec41
--- /dev/null
+++ b/tests/watcher_test.py
@@ -0,0 +1,39 @@
+from optar.src.SiteReader import SiteReader
+from optar.src.Watcher import Watcher
+from optar.src.SiteStore import SiteStore
+
+
+def test_search_sites__found():
+    x = Watcher.search_sites("test.com", "dfjgbnsdigubsdofgliusdbgsdiugbTESTfjgnsdgosd\n\nsdfboiuasdgf!0980", ["TEST"])
+    assert x == [("test.com", "TEST")]
+
+def test_search_sites__not_found():
+    x = Watcher.search_sites("test.com", "dfjgbnsdigubsdofgliusdbgsdiugbfjgnsdgosd\n\nsdfboiuasdgf!0980", ["TEST", "testing"])
+    assert x == []
+
+def test_remove_protocol__https():
+    res = Watcher.remove_protocol("https://www.google.com")
+    assert res == "www.google.com"
+
+def test_remove_protocol__http():
+    res = Watcher.remove_protocol("http://www.google.com")
+    assert res == "www.google.com"
+
+def test_remove_protocol__none():
+    res = Watcher.remove_protocol("www.google.com")
+    assert res == "www.google.com"
+
+def test_compare_sites():
+    class MockCrawler:
+        _links = {}
+        def run(self, url):
+            self._links[url] = [url]
+        def get_nodes(self):
+            return self._links
+
+    # the links in this sites.txt should point to local files or a local mock
+    # server; that is not implemented here, as it would be straightforward
+    # but time-consuming
+    watcher = Watcher(SiteStore(), SiteReader(), "./sites.txt", "keywords.txt")
+    assert [] == watcher.watch(MockCrawler())
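
Note that the test imports `SiteStore` from `src/SiteStore.py`, which is not included in this patch. If that module does not exist yet, a stand-in exposing the methods `Watcher` actually calls is enough for the test to run; the sketch below assumes the `persist`/`get_file_names`/`get_site_links` interface implied by `SiteStoreS3` and is not part of this patch:

```python
# Hypothetical in-memory stand-in for the store; method names are assumed
# from how Watcher and SiteStoreS3 use them, and are not part of this patch.
class InMemorySiteStore:
    def __init__(self):
        self._files = {}  # path -> crawled node dict

    def persist(self, path, nodes):
        self._files[path] = nodes

    def get_file_names(self, cache_path):
        # Mirror SiteStoreS3: newest snapshot name first, prefix stripped.
        names = [p[len(cache_path):] for p in self._files if p.startswith(cache_path)]
        return sorted(names, reverse=True) or None

    def get_site_links(self, path):
        return self._files.get(path)
```

Since `Watcher.__init__` stores the injected dependencies, the suite runs without touching S3: `pytest tests/` from the directory containing the `optar` package.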