refactored to use dependency injection to make the code more testable; added some tests

Askill 2024-07-17 19:45:06 +02:00
parent 19c05d4820
commit 413f070304
16 changed files with 81 additions and 20 deletions
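In short, the commit changes Watcher's constructor to accept its site store and site reader, and watch() to accept the crawler, so callers and tests can pass in their own implementations. A minimal before/after sketch of the wiring, taken from the dev.py change below (bucket name and paths as in that file):

    # before: Watcher created its own collaborators internally
    Watcher("./optar/sites.txt", "./optar/keywords.txt").watch()

    # after: store, reader and crawler are injected by the caller
    from optar.src.Crawler import Crawler
    from optar.src.SiteReader import SiteReader
    from optar.src.SiteStoreS3 import SiteStoreS3
    from src.Watcher import Watcher

    store = SiteStoreS3("optar-dev-cache")   # S3-backed cache of crawled site trees
    reader = SiteReader()                    # fetches page content
    Watcher(store, reader, "./optar/sites.txt", "./optar/keywords.txt").watch(crawler=Crawler(1))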

0
__init__.py Normal file

5
dev.py

@@ -1,4 +1,7 @@
from optar.src.Crawler import Crawler
from optar.src.SiteReader import SiteReader
from optar.src.SiteStoreS3 import SiteStoreS3
from src.Watcher import Watcher
if __name__ == "__main__":
Watcher("./optar/sites.txt", "./optar/keywords.txt").watch()
Watcher(SiteStoreS3("optar-dev-cache"), SiteReader(),"./optar/sites.txt", "./optar/keywords.txt").watch(crawler=Crawler(1))

View File

@@ -1,4 +1,7 @@
from optar.src.Crawler import Crawler
from optar.src.SiteReader import SiteReader
from optar.src.SiteStoreS3 import SiteStoreS3
from src.Watcher import Watcher
if __name__ == "__main__":
Watcher("./sites.txt", "./keywords.txt").watch(3600)
Watcher(SiteStoreS3("optar-dev-cache"), SiteReader(),"./sites.txt", "./keywords.txt").watch(crawler=Crawler(1), sleep=3600)

View File

@@ -1,4 +1,7 @@
from optar.src.Crawler import Crawler
from optar.src.SiteReader import SiteReader
from optar.src.SiteStoreS3 import SiteStoreS3
from src.Watcher import Watcher
if __name__ == "__main__":
Watcher("./sites.txt", "./keywords.txt").watch()
Watcher(SiteStoreS3("optar-dev-cache"), SiteReader(), "./sites.txt", "./keywords.txt").watch(crawler=Crawler(1))

View File

@@ -1,5 +1,6 @@
# Optar
This tool crawls all pages on a given website down to the provided depth and finds new pages by comparing the new site tree to the cached one. All new pages are then checked for any of the provided keywords. If there is a match, the page will be highlighted for the reader.
The default interval between runs is 1 h (3600 s); the lists of keywords and sites can be changed while the software is running.
Only static content is retrieved; crawling client-side rendered content is not implemented.
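For illustration, the "comparing the new site tree to the cached one" step boils down to diffing two snapshot dicts like the ones stored under tests/cache further down. A rough sketch with made-up example.com trees, using DeepDiff the same way Watcher does (the key parsing mirrors Watcher.get_added_urls):

    from deepdiff import DeepDiff

    old_tree = {"https://example.com/": [[1, "https://example.com/a"]],
                "https://example.com/a": []}
    new_tree = {"https://example.com/": [[1, "https://example.com/a"], [1, "https://example.com/b"]],
                "https://example.com/a": [],
                "https://example.com/b": []}

    diff = DeepDiff(old_tree, new_tree)
    # added pages show up as keys reported like "root['https://example.com/b']";
    # the URL sits between the single quotes
    new_pages = [entry.split("'")[1] for entry in diff.get("dictionary_item_added", [])]
    # new_pages == ["https://example.com/b"]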

View File

@@ -3,4 +3,5 @@ lxml==5.2.2
requests==2.32.3
trafilatura==1.11.0
beautifulsoup4==4.12.3
boto3==1.34.144
pytest==8.2.2

View File

@@ -8,7 +8,7 @@ from pathlib import Path
class Crawler:
url = "" # the url of the website to be checked
_links = dict() # dic. with all sites and urls on those sites
_links = dict() # dict with all sites and urls on those sites
header_values = {
'Connection:': 'Keep-alive',
'name': 'Michael Foord',
@@ -19,15 +19,16 @@ class Crawler:
exclude = [
]
def __init__(self, logger=None, exclude=None):
def __init__(self, depth=1, logger=None, exclude=None):
if exclude:
self.exclude += exclude
if logger:
self.logger = logger
else:
self.logger = logging.Logger(
name="star_crawler", level=logging.INFO)
name="optar", level=logging.INFO)
self._links = dict()
self._depth = depth
def get_nodes(self):
return self._links
@@ -41,7 +42,7 @@ class Crawler:
with open(path, 'r') as fp:
self._links = json.load(fp)
def run(self, root, limit, sleep_time=0):
def run(self, root, sleep_time=0):
self.url = root
unchecked = [(0, root)]
@@ -72,7 +73,7 @@ class Crawler:
n_links = []
for link in _links:
if link not in n_links and level < limit:
if link not in n_links and level < self._depth:
if link.startswith("http"):
n_links.append((level+1, link))
else:
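Net effect of the Crawler changes: the crawl depth now lives on the instance (depth=1 by default) instead of being passed to run() as limit. A small sketch of the new call pattern (the URL is a placeholder; the node format matches the cached snapshots further down):

    from optar.src.Crawler import Crawler

    crawler = Crawler(depth=2)            # follow links up to two levels below the root
    crawler.run("https://example.com/")   # no limit argument any more; sleep_time defaults to 0
    tree = crawler.get_nodes()            # dict mapping each page URL to the (level, link) pairs found on it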

View File

@@ -20,7 +20,7 @@ class SiteStoreS3:
if "Contents" not in result:
return None
# return a sorted list of file names (keys), which are the creation dates; strip the prefix (len(cache_path)) and skip the first element, which is only the prefix itself
return sorted([x["Key"][len(cache_path) :] for x in result["Contents"][1:]])
return sorted([x["Key"][len(cache_path) :] for x in result["Contents"][1:]], reverse=True)
def get_site_links(self, path):
s3 = boto3.resource('s3')
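The only functional change here is reverse=True: the newest snapshot name now comes first in the returned list. For illustration, with plain strings standing in for the S3 "Contents" entries (hypothetical keys under a cache prefix):

    cache_path = "www.example.com/"
    keys = ["www.example.com/",                             # the prefix entry itself, skipped via [1:]
            "www.example.com/2024-07-15_16-30-47.json",
            "www.example.com/2024-07-16_16-30-47.json"]
    names = sorted([k[len(cache_path):] for k in keys[1:]], reverse=True)
    # names == ["2024-07-16_16-30-47.json", "2024-07-15_16-30-47.json"]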

View File

@ -3,13 +3,13 @@ from datetime import datetime
from typing import List, Dict, Optional
from deepdiff import DeepDiff
from src.Crawler import Crawler
from src.SiteReader import SiteReader
from src.SiteStoreS3 import SiteStoreS3
from optar.src.Crawler import Crawler
from optar.src.SiteReader import SiteReader
from optar.src.SiteStoreS3 import SiteStoreS3
class Watcher:
def __init__(self, sites_source_path, keywords_source_path) -> None:
def __init__(self, site_store, site_reader, sites_source_path, keywords_source_path) -> None:
self.site_store = SiteStoreS3("optar-dev-cache")
self.site_reader = SiteReader()
self.keywords_source_path = keywords_source_path
@@ -19,7 +19,7 @@ class Watcher:
with open(path) as f:
return f.read().splitlines()
def watch(self, sleep=-1):
def watch(self, crawler, sleep=-1):
"""start the watcher with the given interval
:param sleep: seconds between runs, -1 for a single run
@@ -33,8 +33,7 @@ class Watcher:
for site in sites:
crawler = Crawler()
crawler.run(site, 1)
crawler.run(site)
self.site_store.persist(f"{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json", crawler.get_nodes())
contents = [self.get_new_content(site) for site in sites]
@ -47,11 +46,14 @@ class Watcher:
print(matches)
if sleep == -1:
return
return matches
time.sleep(sleep)
@staticmethod
def remove_protocol(site):
# every protocol should have //
if "//" not in site:
return site
return site.split('/')[2]
def get_new_content(self, url) -> Dict[str, str]:
@ -65,10 +67,14 @@ class Watcher:
else:
news = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}")
if news:
sites_contents = self.site_reader.get_sites_content_static([z.split("'")[1] for z in list(news["dictionary_item_added"])])
sites_contents = self.site_reader.get_sites_content_static(self.get_added_urls(news))
return sites_contents
return {}
@staticmethod
def get_added_urls( news):
return [z.split("'")[1] for z in list(news["dictionary_item_added"])]
@staticmethod
def search_sites(url, content, keywords: List[str]):
if content is None:
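Two smaller behavioural changes in this file are easy to miss: watch() now returns the matches after a single run (so tests can assert on the result), and remove_protocol() tolerates inputs without a protocol prefix. Illustrative calls, mirroring the tests below (URLs are placeholders):

    Watcher.remove_protocol("https://www.google.com")   # -> "www.google.com"
    Watcher.remove_protocol("www.google.com")            # -> "www.google.com" (no "//", returned unchanged)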

0
src/__init__.py Normal file

0
tests/__init__.py Normal file

1
tests/cache/2024-07-15_16-30-47.json vendored Normal file

@@ -0,0 +1 @@
{"https://www.patricematz.de/": [[1, "https://www.patricematz.de/Projects"], [1, "https://www.patricematz.de/Links"]], "https://www.patricematz.de/Links": [], "https://www.patricematz.de/Projects": [], "https://www.patricematz.de/CV": []}

1
tests/cache/2024-07-16_16-30-47.json vendored Normal file

@@ -0,0 +1 @@
{"https://www.patricematz.de/": [[1, "https://www.patricematz.de/"], [1, "https://www.patricematz.de/CV"], [1, "https://www.patricematz.de/Projects"], [1, "https://www.patricematz.de/Links"]], "https://www.patricematz.de/Links": [], "https://www.patricematz.de/Projects": [], "https://www.patricematz.de/CV": []}

1
tests/keywords.txt Normal file

@@ -0,0 +1 @@
Consultant

1
tests/sites.txt Normal file

@@ -0,0 +1 @@
https://www.patricematz.de/

39
tests/watcher_test.py Normal file

@@ -0,0 +1,39 @@
from optar.src.SiteReader import SiteReader
from optar.src.Watcher import Watcher
from optar.src.SiteStore import SiteStore
def test_search_sites__found():
    x = Watcher.search_sites("test.com", "dfjgbnsdigubsdofgliusdbgsdiugbTESTfjgnsdgosd\n\nsdfboiuasdgf!0980", ["TEST"])
    assert x == [("test.com", "TEST")]

def test_search_sites__not_found():
    x = Watcher.search_sites("test.com", "dfjgbnsdigubsdofgliusdbgsdiugbfjgnsdgosd\n\nsdfboiuasdgf!0980", ["TEST", "testing"])
    assert x == []

def test_remove_protocol__https():
    res = Watcher.remove_protocol("https://www.google.com")
    assert res == "www.google.com"

def test_remove_protocol__http():
    res = Watcher.remove_protocol("http://www.google.com")
    assert res == "www.google.com"

def test_remove_protocol__none():
    res = Watcher.remove_protocol("www.google.com")
    assert res == "www.google.com"

def test_compare_sites():
    class MockCrawler:
        _links = {}

        def run(self, url):
            self._links[url] = [url]

        def get_nodes(self):
            return self._links

    # the links given in this sites.txt should point to either local files or a local mock server;
    # this is not implemented, as it would be trivial but time-consuming
    watcher = Watcher(SiteStore(), SiteReader(), "./sites.txt", "keywords.txt")
    assert [] == watcher.watch(MockCrawler())