mirror of https://github.com/Askill/optar.git
commit
7ea7ca8125
|
|
@ -7,7 +7,7 @@ on:
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
pytest:
|
pytest:
|
||||||
name: 'Terraform'
|
name: 'Pytest'
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
|
|
@ -31,5 +31,6 @@ jobs:
|
||||||
python -m pip install --upgrade pip
|
python -m pip install --upgrade pip
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
pip install pytest==8.2.2
|
pip install pytest==8.2.2
|
||||||
|
- name: run test
|
||||||
pytest tests/watcher_test.py --doctest-modules --junitxml=junit/test-results.xml --cov=com --cov-report=xml --cov-report=html
|
run: |
|
||||||
|
pytest tests/watcher_test.py
|
||||||
6
dev.py
6
dev.py
|
|
@ -1,6 +1,6 @@
|
||||||
from optar.src.Crawler import Crawler
|
from src.Crawler import Crawler
|
||||||
from optar.src.SiteReader import SiteReader
|
from src.SiteReader import SiteReader
|
||||||
from optar.src.SiteStoreS3 import SiteStoreS3
|
from src.SiteStoreS3 import SiteStoreS3
|
||||||
from src.Watcher import Watcher
|
from src.Watcher import Watcher
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
|
|
@ -1 +1 @@
|
||||||
Oktober
|
Engineer
|
||||||
6
main.py
6
main.py
|
|
@ -1,6 +1,6 @@
|
||||||
from optar.src.Crawler import Crawler
|
from src.Crawler import Crawler
|
||||||
from optar.src.SiteReader import SiteReader
|
from src.SiteReader import SiteReader
|
||||||
from optar.src.SiteStoreS3 import SiteStoreS3
|
from src.SiteStoreS3 import SiteStoreS3
|
||||||
from src.Watcher import Watcher
|
from src.Watcher import Watcher
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
6
prod.py
6
prod.py
|
|
@ -1,6 +1,6 @@
|
||||||
from optar.src.Crawler import Crawler
|
from src.Crawler import Crawler
|
||||||
from optar.src.SiteReader import SiteReader
|
from src.SiteReader import SiteReader
|
||||||
from optar.src.SiteStoreS3 import SiteStoreS3
|
from src.SiteStoreS3 import SiteStoreS3
|
||||||
from src.Watcher import Watcher
|
from src.Watcher import Watcher
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
|
|
@ -1,19 +0,0 @@
|
||||||
import json
|
|
||||||
import os
|
|
||||||
from typing import List, Optional
|
|
||||||
|
|
||||||
|
|
||||||
class SiteStore:
|
|
||||||
def __init__(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def get_site_history(cache_path) -> Optional[list[str]]:
|
|
||||||
if not os.path.isdir(cache_path):
|
|
||||||
return None
|
|
||||||
return sorted(os.listdir(cache_path))
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def get_site_links(path):
|
|
||||||
with open(path, 'r') as fp:
|
|
||||||
return json.load(fp)
|
|
||||||
|
|
@ -20,7 +20,7 @@ class SiteStoreS3:
|
||||||
if "Contents"not in result:
|
if "Contents"not in result:
|
||||||
return None
|
return None
|
||||||
# return a sorted list of file names (key), which are the creation dates, ignore the prefix (len(cache_path)), ignore the first element, as this is only the prefix
|
# return a sorted list of file names (key), which are the creation dates, ignore the prefix (len(cache_path)), ignore the first element, as this is only the prefix
|
||||||
return sorted([x["Key"][len(cache_path) :] for x in result["Contents"][1:]], reverse=True)
|
return sorted([x["Key"][len(cache_path) :] for x in result["Contents"]], reverse=True)
|
||||||
|
|
||||||
def get_site_links(self, path):
|
def get_site_links(self, path):
|
||||||
s3 = boto3.resource('s3')
|
s3 = boto3.resource('s3')
|
||||||
|
|
|
||||||
|
|
@ -1,17 +1,13 @@
|
||||||
import time
|
import time
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import List, Dict, Optional
|
from typing import List, Dict
|
||||||
from deepdiff import DeepDiff
|
from deepdiff import DeepDiff
|
||||||
|
|
||||||
from optar.src.Crawler import Crawler
|
|
||||||
from optar.src.SiteReader import SiteReader
|
|
||||||
from optar.src.SiteStoreS3 import SiteStoreS3
|
|
||||||
|
|
||||||
|
|
||||||
class Watcher:
|
class Watcher:
|
||||||
|
# TODO: add type hints for site_store and site_reader that reference the interfaces they implement, for better autocomplete and developer experience (DX)
|
||||||
def __init__(self, site_store, site_reader, sites_source_path, keywords_source_path) -> None:
|
def __init__(self, site_store, site_reader, sites_source_path, keywords_source_path) -> None:
|
||||||
self.site_store = SiteStoreS3("optar-dev-cache")
|
self.site_store = site_store
|
||||||
self.site_reader = SiteReader()
|
self.site_reader = site_reader
|
||||||
self.keywords_source_path = keywords_source_path
|
self.keywords_source_path = keywords_source_path
|
||||||
self.sites_source_path = sites_source_path
|
self.sites_source_path = sites_source_path
|
||||||
|
|
||||||
|
|
@ -35,6 +31,8 @@ class Watcher:
|
||||||
for site in sites:
|
for site in sites:
|
||||||
crawler.run(site)
|
crawler.run(site)
|
||||||
self.site_store.persist(f"{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json", crawler.get_nodes())
|
self.site_store.persist(f"{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json", crawler.get_nodes())
|
||||||
|
# do NOT overload the target
|
||||||
|
time.sleep(1)
|
||||||
|
|
||||||
contents = [self.get_new_content(site) for site in sites]
|
contents = [self.get_new_content(site) for site in sites]
|
||||||
# TODO: improve handling of None
|
# TODO: improve handling of None
|
||||||
|
|
@ -62,10 +60,11 @@ class Watcher:
|
||||||
|
|
||||||
if len(list_of_files) >= 2:
|
if len(list_of_files) >= 2:
|
||||||
prev_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-2]}")
|
prev_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-2]}")
|
||||||
current_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}")
|
|
||||||
news = DeepDiff(prev_version, current_version, ignore_order=True)
|
|
||||||
else:
|
else:
|
||||||
news = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}")
|
prev_version = {url: []}
|
||||||
|
current_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}")
|
||||||
|
news = DeepDiff(prev_version, current_version, ignore_order=True)
|
||||||
|
|
||||||
if news:
|
if news:
|
||||||
sites_contents = self.site_reader.get_sites_content_static(self.get_added_urls(news))
|
sites_contents = self.site_reader.get_sites_content_static(self.get_added_urls(news))
|
||||||
return sites_contents
|
return sites_contents
|
||||||
|
|
@ -73,7 +72,7 @@ class Watcher:
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_added_urls( news):
|
def get_added_urls( news):
|
||||||
return [z.split("'")[1] for z in list(news["dictionary_item_added"])]
|
return [z.split("'")[1] for z in list(news["iterable_item_added"])]
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def search_sites(url, content, keywords: List[str]):
|
def search_sites(url, content, keywords: List[str]):
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,26 @@
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
|
||||||
|
class SiteStore:
|
||||||
|
def __init__(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_site_history(in_path) -> Optional[list[str]]:
|
||||||
|
cache_path = "./cache/" + in_path
|
||||||
|
if not os.path.isdir(cache_path):
|
||||||
|
return []
|
||||||
|
return sorted(os.listdir(cache_path))
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_site_links(in_path):
|
||||||
|
cache_path = "./cache/" + in_path
|
||||||
|
with open(cache_path, 'r') as fp:
|
||||||
|
return json.load(fp)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def persist(self, data):
|
||||||
|
return
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
|
import os
|
||||||
from optar.src.SiteReader import SiteReader
|
from optar.src.SiteReader import SiteReader
|
||||||
from optar.src.Watcher import Watcher
|
from optar.src.Watcher import Watcher
|
||||||
from optar.src.SiteStore import SiteStore
|
from optar.tests.MockSiteStore import SiteStore
|
||||||
|
|
||||||
def test_search_sites__found():
|
def test_search_sites__found():
|
||||||
|
|
||||||
|
|
@ -31,7 +32,8 @@ def test_compare_sites():
|
||||||
self._links[url] = [url]
|
self._links[url] = [url]
|
||||||
def get_nodes(self):
|
def get_nodes(self):
|
||||||
return self._links
|
return self._links
|
||||||
|
assert os.path.isdir("./cache/www.patricematz.de")
|
||||||
|
assert len(SiteStore.get_site_history("www.patricematz.de")) >= 2
|
||||||
# the links given in this sites.txt should be to either local files, or a local mock server
|
# the links given in this sites.txt should be to either local files, or a local mock server
|
||||||
# this is not implemented, as it would be trivial but time consuming
|
# this is not implemented, as it would be trivial but time consuming
|
||||||
watcher = Watcher(SiteStore(), SiteReader(), "./sites.txt", "keywords.txt")
|
watcher = Watcher(SiteStore(), SiteReader(), "./sites.txt", "keywords.txt")
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue