Merge pull request #1 from Askill/test

test
Patrice Matz 2024-07-17 20:34:56 +02:00 committed by GitHub
commit 7ea7ca8125
14 changed files with 58 additions and 49 deletions

View File

@@ -7,7 +7,7 @@ on:
 jobs:
   pytest:
-    name: 'Terraform'
+    name: 'Pytest'
     runs-on: ubuntu-latest
     strategy:
       matrix:
@@ -31,5 +31,6 @@ jobs:
         python -m pip install --upgrade pip
         pip install -r requirements.txt
         pip install pytest==8.2.2
-        pytest tests/watcher_test.py --doctest-modules --junitxml=junit/test-results.xml --cov=com --cov-report=xml --cov-report=html
+    - name: run test
+      run: |
+        pytest tests/watcher_test.py

dev.py
View File

@@ -1,6 +1,6 @@
-from optar.src.Crawler import Crawler
-from optar.src.SiteReader import SiteReader
-from optar.src.SiteStoreS3 import SiteStoreS3
+from src.Crawler import Crawler
+from src.SiteReader import SiteReader
+from src.SiteStoreS3 import SiteStoreS3
 from src.Watcher import Watcher
 
 if __name__ == "__main__":

View File

@@ -1 +1 @@
-Oktober
+Engineer

View File

@@ -1,6 +1,6 @@
-from optar.src.Crawler import Crawler
-from optar.src.SiteReader import SiteReader
-from optar.src.SiteStoreS3 import SiteStoreS3
+from src.Crawler import Crawler
+from src.SiteReader import SiteReader
+from src.SiteStoreS3 import SiteStoreS3
 from src.Watcher import Watcher
 
 if __name__ == "__main__":

View File

@@ -1,6 +1,6 @@
-from optar.src.Crawler import Crawler
-from optar.src.SiteReader import SiteReader
-from optar.src.SiteStoreS3 import SiteStoreS3
+from src.Crawler import Crawler
+from src.SiteReader import SiteReader
+from src.SiteStoreS3 import SiteStoreS3
 from src.Watcher import Watcher
 
 if __name__ == "__main__":

View File

@@ -2,5 +2,5 @@
This tool crawls all pages on a given website down to the provided depth and finds new pages by comparing the new site tree to the cached one. All new pages are then checked for any of the provided keywords; if there is a match, the page is highlighted for the reader.
The default timeout is 1 h; the list of keywords and sites can be changed while the software is running.
Only static content is retrieved; crawling client-side rendered content is not implemented.
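For context, the comparison and keyword check described above can be pictured with a short sketch; the function names and the site-tree shape (a dict mapping each page URL to its outgoing links) are illustrative assumptions, not the project's actual API:

def find_new_pages(cached_tree: dict[str, list[str]], current_tree: dict[str, list[str]]) -> list[str]:
    # A page counts as new if its URL was not present in the cached crawl.
    return [url for url in current_tree if url not in cached_tree]

def matches_keywords(page_text: str, keywords: list[str]) -> bool:
    # Simple case-insensitive substring match; pages that match would be highlighted for the reader.
    text = page_text.lower()
    return any(keyword.lower() in text for keyword in keywords)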

View File

@@ -1 +1 @@
-https://www.patricematz.de/
+https://www.patricematz.de/CV

View File

@@ -1,19 +0,0 @@
-import json
-import os
-from typing import List, Optional
-
-
-class SiteStore:
-    def __init__(self):
-        pass
-
-    @staticmethod
-    def get_site_history(cache_path) -> Optional[list[str]]:
-        if not os.path.isdir(cache_path):
-            return None
-        return sorted(os.listdir(cache_path))
-
-    @staticmethod
-    def get_site_links(path):
-        with open(path, 'r') as fp:
-            return json.load(fp)

View File

@@ -20,7 +20,7 @@ class SiteStoreS3:
         if "Contents" not in result:
             return None
         # return a sorted list of file names (keys), which are the creation dates; ignore the prefix (len(cache_path)) and the first element, as this is only the prefix
-        return sorted([x["Key"][len(cache_path) :] for x in result["Contents"][1:]], reverse=True)
+        return sorted([x["Key"][len(cache_path) :] for x in result["Contents"]], reverse=True)
 
     def get_site_links(self, path):
         s3 = boto3.resource('s3')
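For context, the call that produces result is outside this hunk; a minimal sketch of the same idea, assuming the listing comes from an S3 client call such as list_objects_v2 (the bucket and prefix are placeholders):

import boto3
from typing import Optional

def list_cache_entries(bucket: str, cache_path: str) -> Optional[list[str]]:
    # List every object under the prefix, strip the prefix so only the
    # timestamped file names remain, and return them newest first.
    s3 = boto3.client("s3")
    result = s3.list_objects_v2(Bucket=bucket, Prefix=cache_path)
    if "Contents" not in result:
        return None
    return sorted((obj["Key"][len(cache_path):] for obj in result["Contents"]), reverse=True)

The [1:] removed by this commit had skipped the first listed key, which, per the original comment, was only the prefix entry itself.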

View File

@@ -1,17 +1,13 @@
 import time
 from datetime import datetime
-from typing import List, Dict, Optional
+from typing import List, Dict
 from deepdiff import DeepDiff
-from optar.src.Crawler import Crawler
-from optar.src.SiteReader import SiteReader
-from optar.src.SiteStoreS3 import SiteStoreS3
 
 class Watcher:
     # there should be a type hint for site_store and site_reader, referencing interfaces, which these implement, for better auto complete and DX
     def __init__(self, site_store, site_reader, sites_source_path, keywords_source_path) -> None:
-        self.site_store = SiteStoreS3("optar-dev-cache")
-        self.site_reader = SiteReader()
+        self.site_store = site_store
+        self.site_reader = site_reader
         self.keywords_source_path = keywords_source_path
         self.sites_source_path = sites_source_path
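The comment above asks for interface-style hints on site_store and site_reader. One way to express that is sketched below with typing.Protocol; the method signatures are inferred from the calls visible elsewhere in this diff and are assumptions, not the project's definitions:

from typing import Optional, Protocol

class SiteStoreProtocol(Protocol):
    # Methods the Watcher calls on its store, judging from this diff.
    def get_site_history(self, cache_path: str) -> Optional[list[str]]: ...
    def get_site_links(self, path: str) -> dict: ...
    def persist(self, path: str, data: dict) -> None: ...

class SiteReaderProtocol(Protocol):
    def get_sites_content_static(self, urls: list[str]) -> dict: ...

Annotating __init__ with site_store: SiteStoreProtocol and site_reader: SiteReaderProtocol would give the auto-complete the comment asks for without coupling Watcher to SiteStoreS3 or the test mock.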
@ -35,6 +31,8 @@ class Watcher:
for site in sites:
crawler.run(site)
self.site_store.persist(f"{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json", crawler.get_nodes())
# do NOT overload the target
time.sleep(1)
contents = [self.get_new_content(site) for site in sites]
# TODO: improve handleing of None
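On the TODO above: one minimal way to keep None entries out of the downstream steps, as a sketch (the helper name and result shapes are illustrative, not the project's code):

from typing import Dict, List, Optional

def drop_missing(sites: List[str], contents: List[Optional[dict]]) -> Dict[str, dict]:
    # Pair each site with its result and drop the ones that came back as None,
    # e.g. sites where nothing new was found.
    return {site: content for site, content in zip(sites, contents) if content is not None}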
@@ -62,10 +60,11 @@ class Watcher:
         if len(list_of_files) >= 2:
             prev_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-2]}")
             current_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}")
-            news = DeepDiff(prev_version, current_version, ignore_order=True)
         else:
-            news = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}")
+            prev_version = {url: []}
+            current_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}")
+        news = DeepDiff(prev_version, current_version, ignore_order=True)
 
         if news:
             sites_contents = self.site_reader.get_sites_content_static(self.get_added_urls(news))
         return sites_contents
@@ -73,7 +72,7 @@ class Watcher:
     @staticmethod
     def get_added_urls(news):
-        return [z.split("'")[1] for z in list(news["dictionary_item_added"])]
+        return [z.split("'")[1] for z in list(news["iterable_item_added"])]
 
     @staticmethod
     def search_sites(url, content, keywords: List[str]):
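For reference, DeepDiff labels additions differently depending on where they occur; a small illustration of the two result keys used here (output shown in comments; exact formatting varies slightly by deepdiff version):

from deepdiff import DeepDiff

# A brand-new top-level key is reported under "dictionary_item_added":
print(DeepDiff({"a": ["x"]}, {"a": ["x"], "b": ["y"]}, ignore_order=True))
# {'dictionary_item_added': [root['b']]}

# New items inside an existing list are reported under "iterable_item_added":
print(DeepDiff({"a": []}, {"a": ["x", "y"]}, ignore_order=True))
# {'iterable_item_added': {"root['a'][0]": 'x', "root['a'][1]": 'y'}}

With prev_version = {url: []} on the first run, newly found links land inside an existing list, which is why they surface under "iterable_item_added" rather than "dictionary_item_added".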

tests/MockSiteStore.py (new file)
View File

@@ -0,0 +1,26 @@
+import json
+import os
+from pathlib import Path
+from typing import List, Optional
+
+
+class SiteStore:
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def get_site_history(in_path) -> Optional[list[str]]:
+        cache_path = "./cache/" + in_path
+        if not os.path.isdir(cache_path):
+            return []
+        return sorted(os.listdir(cache_path))
+
+    @staticmethod
+    def get_site_links(in_path):
+        cache_path = "./cache/" + in_path
+        with open(cache_path, 'r') as fp:
+            return json.load(fp)
+
+    @staticmethod
+    def persist(self, data):
+        return

View File

@@ -1,6 +1,7 @@
+import os
 from optar.src.SiteReader import SiteReader
 from optar.src.Watcher import Watcher
-from optar.src.SiteStore import SiteStore
+from optar.tests.MockSiteStore import SiteStore
 
 def test_search_sites__found():
@@ -31,7 +32,8 @@ def test_compare_sites():
             self._links[url] = [url]
 
         def get_nodes(self):
            return self._links
 
+    assert os.path.isdir("./cache/www.patricematz.de")
+    assert len(SiteStore.get_site_history("www.patricematz.de")) >= 2
     # the links given in this sites.txt should be to either local files, or a local mock server
     # this is not implemented, as it would be trivial but time consuming
     watcher = Watcher(SiteStore(), SiteReader(), "./sites.txt", "keywords.txt")
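On the comment about pointing sites.txt at local files or a local mock server: a minimal sketch of such a mock using only the standard library (the directory, port, and fixture layout are placeholder assumptions, not anything the project defines):

import threading
from functools import partial
from http.server import HTTPServer, SimpleHTTPRequestHandler

def start_mock_site(directory: str, port: int = 8765) -> HTTPServer:
    # Serve static HTML fixtures from `directory` so sites.txt can list
    # http://localhost:8765/ instead of a live website.
    handler = partial(SimpleHTTPRequestHandler, directory=directory)
    server = HTTPServer(("localhost", port), handler)
    threading.Thread(target=server.serve_forever, daemon=True).start()
    return server

# Usage in a test:
#   server = start_mock_site("tests/fixtures")
#   ... run the Watcher against http://localhost:8765/ ...
#   server.shutdown()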