mirror of https://github.com/Askill/optar.git
refactored to utilize dependency injection to make code more testable, added some tests
parent 19c05d4820
commit 413f070304
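The call-site changes below show the shape of the refactor: Watcher now receives its site store and site reader through the constructor, and the Crawler is handed to watch(). A minimal sketch of why that helps testing, mirroring the test file added at the bottom of this commit (the fake crawler and the file paths here are illustrative, not part of the commit):

    from optar.src.SiteReader import SiteReader
    from optar.src.SiteStore import SiteStore
    from optar.src.Watcher import Watcher

    class FakeCrawler:
        """Test double for Crawler: records one link per site instead of fetching anything."""
        def __init__(self):
            self._links = {}

        def run(self, url):
            self._links[url] = [url]

        def get_nodes(self):
            return self._links

    def test_watch_with_injected_crawler():
        # real store and reader, fake crawler; a single run (sleep=-1) returns the list of matches
        watcher = Watcher(SiteStore(), SiteReader(), "./sites.txt", "./keywords.txt")
        assert watcher.watch(crawler=FakeCrawler()) == []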
dev.py (5 changed lines)
@@ -1,4 +1,7 @@
+from optar.src.Crawler import Crawler
+from optar.src.SiteReader import SiteReader
+from optar.src.SiteStoreS3 import SiteStoreS3
 from src.Watcher import Watcher
 
 if __name__ == "__main__":
-    Watcher("./optar/sites.txt", "./optar/keywords.txt").watch()
+    Watcher(SiteStoreS3("optar-dev-cache"), SiteReader(), "./optar/sites.txt", "./optar/keywords.txt").watch(crawler=Crawler(1))

main.py (5 changed lines)
@@ -1,4 +1,7 @@
+from optar.src.Crawler import Crawler
+from optar.src.SiteReader import SiteReader
+from optar.src.SiteStoreS3 import SiteStoreS3
 from src.Watcher import Watcher
 
 if __name__ == "__main__":
-    Watcher("./sites.txt", "./keywords.txt").watch(3600)
+    Watcher(SiteStoreS3("optar-dev-cache"), SiteReader(), "./sites.txt", "./keywords.txt").watch(crawler=Crawler(1), sleep=3600)

prod.py (5 changed lines)
@@ -1,4 +1,7 @@
+from optar.src.Crawler import Crawler
+from optar.src.SiteReader import SiteReader
+from optar.src.SiteStoreS3 import SiteStoreS3
 from src.Watcher import Watcher
 
 if __name__ == "__main__":
-    Watcher("./sites.txt", "./keywords.txt").watch()
+    Watcher(SiteStoreS3("optar-dev-cache"), SiteReader(), "./sites.txt", "./keywords.txt").watch(crawler=Crawler(1))

@@ -3,3 +3,4 @@
 This tool crawls all pages on a given website down to the provided depth and finds new pages by comparing the new site tree to the cached one. All new pages are then checked for any of the provided keywords. If there is a match, the page is highlighted for the reader.
 Default timeout is 1h; the list of keywords and sites can be changed while the software is running.
 
+Only retrieves static content; crawling client-side rendered content is not implemented.

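The crawl-compare-search loop described in the README is what Watcher does with DeepDiff: the cached site tree and the freshly crawled one are plain dicts, newly added pages show up under dictionary_item_added, and only those pages are scanned for keywords. A self-contained sketch of that idea (the URLs, keywords, and page text are made up):

    from deepdiff import DeepDiff

    cached_tree = {"https://example.com/": [[1, "https://example.com/blog"]]}
    new_tree = {
        "https://example.com/": [[1, "https://example.com/blog"]],
        "https://example.com/blog/release-notes": [],
    }

    diff = DeepDiff(cached_tree, new_tree)
    # added pages are reported as strings like "root['https://example.com/blog/release-notes']"
    added_urls = [entry.split("'")[1] for entry in diff.get("dictionary_item_added", [])]

    keywords = ["release"]
    page_text = {"https://example.com/blog/release-notes": "new release notes for v2"}
    matches = [(url, kw) for url in added_urls for kw in keywords if kw in page_text.get(url, "")]
    print(matches)  # [('https://example.com/blog/release-notes', 'release')]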
@@ -4,3 +4,4 @@ requests==2.32.3
 trafilatura==1.11.0
 beautifulsoup4==4.12.3
 boto3==1.34.144
+pytest==8.2.2

@@ -8,7 +8,7 @@ from pathlib import Path
 
 class Crawler:
     url = ""  # the url of the website to be checked
-    _links = dict()  # dic. with all sites and urls on those sites
+    _links = dict()  # dict with all sites and urls on those sites
     header_values = {
         'Connection:': 'Keep-alive',
         'name': 'Michael Foord',
@@ -19,15 +19,16 @@ class Crawler:
     exclude = [
     ]
 
-    def __init__(self, logger=None, exclude=None):
+    def __init__(self, depth=1, logger=None, exclude=None):
         if exclude:
             self.exclude += exclude
         if logger:
             self.logger = logger
         else:
             self.logger = logging.Logger(
-                name="star_crawler", level=logging.INFO)
+                name="optar", level=logging.INFO)
         self._links = dict()
+        self._depth = depth
 
     def get_nodes(self):
         return self._links
@@ -41,7 +42,7 @@ class Crawler:
         with open(path, 'r') as fp:
             self._links = json.load(fp)
 
-    def run(self, root, limit, sleep_time=0):
+    def run(self, root, sleep_time=0):
         self.url = root
         unchecked = [(0, root)]
 
@@ -72,7 +73,7 @@ class Crawler:
 
         n_links = []
         for link in _links:
-            if link not in n_links and level < limit:
+            if link not in n_links and level < self._depth:
                 if link.startswith("http"):
                     n_links.append((level+1, link))
                 else:

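With the crawl depth fixed at construction time, run() loses its limit parameter; a call site now looks roughly like this (the URL is illustrative):

    from optar.src.Crawler import Crawler

    crawler = Crawler(depth=1)          # before this change: Crawler(), with the depth passed to run()
    crawler.run("https://example.com")  # before: crawler.run("https://example.com", 1)
    tree = crawler.get_nodes()          # dict mapping each visited page to the (level, link) pairs found on it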
@@ -20,7 +20,7 @@ class SiteStoreS3:
         if "Contents" not in result:
             return None
         # return a sorted list of file names (key), which are the creation dates, ignore the prefix (len(cache_path)), ignore the first element, as this is only the prefix
-        return sorted([x["Key"][len(cache_path) :] for x in result["Contents"][1:]])
+        return sorted([x["Key"][len(cache_path) :] for x in result["Contents"][1:]], reverse=True)
 
     def get_site_links(self, path):
         s3 = boto3.resource('s3')

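The cached file names are %Y-%m-%d_%H-%M-%S timestamps (see the persist() call in Watcher below), so a lexical sort is also chronological; with reverse=True the listing now starts with the newest snapshot. A quick illustration with made-up keys:

    keys = ["2024-07-01_10-00-00.json", "2024-07-03_10-00-00.json", "2024-07-02_10-00-00.json"]
    print(sorted(keys, reverse=True))
    # ['2024-07-03_10-00-00.json', '2024-07-02_10-00-00.json', '2024-07-01_10-00-00.json']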
@@ -3,13 +3,13 @@ from datetime import datetime
 from typing import List, Dict, Optional
 from deepdiff import DeepDiff
 
-from src.Crawler import Crawler
-from src.SiteReader import SiteReader
-from src.SiteStoreS3 import SiteStoreS3
+from optar.src.Crawler import Crawler
+from optar.src.SiteReader import SiteReader
+from optar.src.SiteStoreS3 import SiteStoreS3
 
 
 class Watcher:
-    def __init__(self, sites_source_path, keywords_source_path) -> None:
+    def __init__(self, site_store, site_reader, sites_source_path, keywords_source_path) -> None:
         self.site_store = SiteStoreS3("optar-dev-cache")
         self.site_reader = SiteReader()
         self.keywords_source_path = keywords_source_path
@@ -19,7 +19,7 @@ class Watcher:
         with open(path) as f:
             return f.read().splitlines()
 
-    def watch(self, sleep=-1):
+    def watch(self, crawler, sleep=-1):
         """start the watcher with the given interval
 
         :param arg: seconds between runs, -1 for single run
@@ -33,8 +33,7 @@ class Watcher:
 
 
             for site in sites:
-                crawler = Crawler()
-                crawler.run(site, 1)
+                crawler.run(site)
                 self.site_store.persist(f"{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json", crawler.get_nodes())
 
             contents = [self.get_new_content(site) for site in sites]
@@ -47,11 +46,14 @@ class Watcher:
             print(matches)
 
             if sleep == -1:
-                return
+                return matches
             time.sleep(sleep)
 
     @staticmethod
     def remove_protocol(site):
+        # every protocol should have //
+        if "//" not in site:
+            return site
         return site.split('/')[2]
 
     def get_new_content(self, url) -> Dict[str, str]:
@@ -65,10 +67,14 @@ class Watcher:
         else:
             news = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}")
             if news:
-                sites_contents = self.site_reader.get_sites_content_static([z.split("'")[1] for z in list(news["dictionary_item_added"])])
+                sites_contents = self.site_reader.get_sites_content_static(self.get_added_urls(news))
                return sites_contents
        return {}
 
+    @staticmethod
+    def get_added_urls(news):
+        return [z.split("'")[1] for z in list(news["dictionary_item_added"])]
+
     @staticmethod
     def search_sites(url, content, keywords: List[str]):
         if content is None:

@@ -0,0 +1 @@
+{"https://www.patricematz.de/": [[1, "https://www.patricematz.de/Projects"], [1, "https://www.patricematz.de/Links"]], "https://www.patricematz.de/Links": [], "https://www.patricematz.de/Projects": [], "https://www.patricematz.de/CV": []}

@@ -0,0 +1 @@
+{"https://www.patricematz.de/": [[1, "https://www.patricematz.de/"], [1, "https://www.patricematz.de/CV"], [1, "https://www.patricematz.de/Projects"], [1, "https://www.patricematz.de/Links"]], "https://www.patricematz.de/Links": [], "https://www.patricematz.de/Projects": [], "https://www.patricematz.de/CV": []}

@@ -0,0 +1 @@
+Consultant

@@ -0,0 +1 @@
+https://www.patricematz.de/

@@ -0,0 +1,39 @@
+from optar.src.SiteReader import SiteReader
+from optar.src.Watcher import Watcher
+from optar.src.SiteStore import SiteStore
+
+
+def test_search_sites__found():
+    x = Watcher.search_sites("test.com", "dfjgbnsdigubsdofgliusdbgsdiugbTESTfjgnsdgosd\n\nsdfboiuasdgf!0980", ["TEST"])
+    assert x == [("test.com", "TEST")]
+
+def test_search_sites__not_found():
+    x = Watcher.search_sites("test.com", "dfjgbnsdigubsdofgliusdbgsdiugbfjgnsdgosd\n\nsdfboiuasdgf!0980", ["TEST", "testing"])
+    assert x == []
+
+def test_remove_protocol__https():
+    res = Watcher.remove_protocol("https://www.google.com")
+    assert res == "www.google.com"
+
+def test_remove_protocol__http():
+    res = Watcher.remove_protocol("http://www.google.com")
+    assert res == "www.google.com"
+
+def test_remove_protocol__none():
+    res = Watcher.remove_protocol("www.google.com")
+    assert res == "www.google.com"
+
+def test_compare_sites():
+    class MockCrawler:
+        _links = {}
+
+        def run(self, url):
+            self._links[url] = [url]
+
+        def get_nodes(self):
+            return self._links
+
+    # the links given in this sites.txt should be to either local files, or a local mock server
+    # this is not implemented, as it would be trivial but time consuming
+    watcher = Watcher(SiteStore(), SiteReader(), "./sites.txt", "keywords.txt")
+    assert [] == watcher.watch(MockCrawler())