mirror of https://github.com/Askill/optar.git
fixed tests
This commit is contained in:
parent
83c3e3b54a
commit
aa37a25f1c
6
dev.py
6
dev.py
|
|
@ -1,6 +1,6 @@
|
|||
from optar.src.Crawler import Crawler
|
||||
from optar.src.SiteReader import SiteReader
|
||||
from optar.src.SiteStoreS3 import SiteStoreS3
|
||||
from src.Crawler import Crawler
|
||||
from src.SiteReader import SiteReader
|
||||
from src.SiteStoreS3 import SiteStoreS3
|
||||
from src.Watcher import Watcher
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
|
|
@ -1 +1 @@
|
|||
Oktober
|
||||
Engineer
|
||||
6
main.py
6
main.py
|
|
@ -1,6 +1,6 @@
|
|||
from optar.src.Crawler import Crawler
|
||||
from optar.src.SiteReader import SiteReader
|
||||
from optar.src.SiteStoreS3 import SiteStoreS3
|
||||
from src.Crawler import Crawler
|
||||
from src.SiteReader import SiteReader
|
||||
from src.SiteStoreS3 import SiteStoreS3
|
||||
from src.Watcher import Watcher
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
6
prod.py
6
prod.py
|
|
@ -1,6 +1,6 @@
|
|||
from optar.src.Crawler import Crawler
|
||||
from optar.src.SiteReader import SiteReader
|
||||
from optar.src.SiteStoreS3 import SiteStoreS3
|
||||
from src.Crawler import Crawler
|
||||
from src.SiteReader import SiteReader
|
||||
from src.SiteStoreS3 import SiteStoreS3
|
||||
from src.Watcher import Watcher
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
|
|
@ -1,19 +0,0 @@
|
|||
import json
|
||||
import os
|
||||
from typing import List, Optional
|
||||
|
||||
|
||||
class SiteStore:
    """Local-filesystem store for crawled site snapshots."""

    def __init__(self):
        pass

    @staticmethod
    def get_site_history(cache_path) -> Optional[list[str]]:
        """Return the sorted snapshot file names under *cache_path*.

        Returns None when *cache_path* is not an existing directory.
        """
        if os.path.isdir(cache_path):
            entries = os.listdir(cache_path)
            entries.sort()
            return entries
        return None

    @staticmethod
    def get_site_links(path):
        """Load and return the JSON document stored at *path*."""
        with open(path, 'r') as fp:
            payload = json.load(fp)
        return payload
|
||||
|
|
@ -20,7 +20,7 @@ class SiteStoreS3:
|
|||
if "Contents"not in result:
|
||||
return None
|
||||
# return a sorted list of file names (key), which are the creation dates, ignore the prefix (len(cache_path)), ignore the first element, as this is only the prefix
|
||||
return sorted([x["Key"][len(cache_path) :] for x in result["Contents"][1:]], reverse=True)
|
||||
return sorted([x["Key"][len(cache_path) :] for x in result["Contents"]], reverse=True)
|
||||
|
||||
def get_site_links(self, path):
|
||||
s3 = boto3.resource('s3')
|
||||
|
|
|
|||
|
|
@ -1,17 +1,13 @@
|
|||
import time
|
||||
from datetime import datetime
|
||||
from typing import List, Dict, Optional
|
||||
from typing import List, Dict
|
||||
from deepdiff import DeepDiff
|
||||
|
||||
from optar.src.Crawler import Crawler
|
||||
from optar.src.SiteReader import SiteReader
|
||||
from optar.src.SiteStoreS3 import SiteStoreS3
|
||||
|
||||
|
||||
class Watcher:
|
||||
# there should be a type hint for site_store and site_reader, referencing interfaces, which these implement, for better auto complete and DX
|
||||
def __init__(self, site_store, site_reader, sites_source_path, keywords_source_path) -> None:
|
||||
self.site_store = SiteStoreS3("optar-dev-cache")
|
||||
self.site_reader = SiteReader()
|
||||
self.site_store = site_store
|
||||
self.site_reader = site_reader
|
||||
self.keywords_source_path = keywords_source_path
|
||||
self.sites_source_path = sites_source_path
|
||||
|
||||
|
|
@ -35,6 +31,8 @@ class Watcher:
|
|||
for site in sites:
|
||||
crawler.run(site)
|
||||
self.site_store.persist(f"{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json", crawler.get_nodes())
|
||||
# do NOT overload the target
|
||||
time.sleep(1)
|
||||
|
||||
contents = [self.get_new_content(site) for site in sites]
|
||||
# TODO: improve handleing of None
|
||||
|
|
@ -62,10 +60,11 @@ class Watcher:
|
|||
|
||||
if len(list_of_files) >= 2:
|
||||
prev_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-2]}")
|
||||
current_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}")
|
||||
news = DeepDiff(prev_version, current_version, ignore_order=True)
|
||||
else:
|
||||
news = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}")
|
||||
prev_version = {url: []}
|
||||
current_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}")
|
||||
news = DeepDiff(prev_version, current_version, ignore_order=True)
|
||||
|
||||
if news:
|
||||
sites_contents = self.site_reader.get_sites_content_static(self.get_added_urls(news))
|
||||
return sites_contents
|
||||
|
|
@ -73,7 +72,7 @@ class Watcher:
|
|||
|
||||
@staticmethod
|
||||
def get_added_urls( news):
|
||||
return [z.split("'")[1] for z in list(news["dictionary_item_added"])]
|
||||
return [z.split("'")[1] for z in list(news["iterable_item_added"])]
|
||||
|
||||
@staticmethod
|
||||
def search_sites(url, content, keywords: List[str]):
|
||||
|
|
|
|||
|
|
@ -0,0 +1,26 @@
|
|||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
|
||||
class SiteStore:
    """File-system backed site store (test double).

    Reads snapshots from the local ``./cache/`` directory and silently
    discards writes, so tests can run without a real storage backend.
    """

    def __init__(self):
        pass

    @staticmethod
    def get_site_history(in_path) -> list[str]:
        """Return the sorted snapshot file names under ``./cache/<in_path>``.

        Returns an empty list when the directory does not exist — unlike a
        ``None`` return, this lets callers take ``len()`` without a guard.
        (Annotation fixed: the original said ``Optional[list[str]]`` although
        ``None`` is never returned.)
        """
        cache_path = "./cache/" + in_path
        if not os.path.isdir(cache_path):
            return []
        return sorted(os.listdir(cache_path))

    @staticmethod
    def get_site_links(in_path):
        """Load and return the JSON snapshot stored at ``./cache/<in_path>``."""
        cache_path = "./cache/" + in_path
        with open(cache_path, 'r') as fp:
            return json.load(fp)

    @staticmethod
    def persist(path, data):
        """No-op persist: accept ``(path, data)`` like the real store, discard both.

        Fix: the original declared this ``@staticmethod`` with a first
        parameter named ``self``, which is misleading — instance calls
        ``store.persist(path, data)`` actually bind the storage *path* to it.
        Renamed to ``path``; positional call sites are unaffected.
        """
        return
|
||||
|
|
@ -1,6 +1,7 @@
|
|||
import os
|
||||
from optar.src.SiteReader import SiteReader
|
||||
from optar.src.Watcher import Watcher
|
||||
from optar.src.SiteStore import SiteStore
|
||||
from optar.tests.MockSiteStore import SiteStore
|
||||
|
||||
def test_search_sites__found():
|
||||
|
||||
|
|
@ -31,7 +32,8 @@ def test_compare_sites():
|
|||
self._links[url] = [url]
|
||||
def get_nodes(self):
|
||||
return self._links
|
||||
|
||||
assert os.path.isdir("./cache/www.patricematz.de")
|
||||
assert len(SiteStore.get_site_history("www.patricematz.de")) >= 2
|
||||
# the links given in this sites.txt should be to either local files, or a local mock server
|
||||
# this is not implemented, as it would be trivial but time consuming
|
||||
watcher = Watcher(SiteStore(), SiteReader(), "./sites.txt", "keywords.txt")
|
||||
|
|
|
|||
Loading…
Reference in New Issue