mirror of https://github.com/Askill/optar.git
commit 7ea7ca8125
@@ -7,7 +7,7 @@ on:

jobs:
  pytest:
    name: 'Terraform'
    name: 'Pytest'
    runs-on: ubuntu-latest
    strategy:
      matrix:
@@ -31,5 +31,6 @@ jobs:
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install pytest==8.2.2

          pytest tests/watcher_test.py --doctest-modules --junitxml=junit/test-results.xml --cov=com --cov-report=xml --cov-report=html
      - name: run test
        run: |
          pytest tests/watcher_test.py
dev.py
@@ -1,6 +1,6 @@
from optar.src.Crawler import Crawler
from optar.src.SiteReader import SiteReader
from optar.src.SiteStoreS3 import SiteStoreS3
from src.Crawler import Crawler
from src.SiteReader import SiteReader
from src.SiteStoreS3 import SiteStoreS3
from src.Watcher import Watcher

if __name__ == "__main__":
@@ -1 +1 @@
Oktober
Engineer
main.py
@@ -1,6 +1,6 @@
from optar.src.Crawler import Crawler
from optar.src.SiteReader import SiteReader
from optar.src.SiteStoreS3 import SiteStoreS3
from src.Crawler import Crawler
from src.SiteReader import SiteReader
from src.SiteStoreS3 import SiteStoreS3
from src.Watcher import Watcher

if __name__ == "__main__":
prod.py
@@ -1,6 +1,6 @@
from optar.src.Crawler import Crawler
from optar.src.SiteReader import SiteReader
from optar.src.SiteStoreS3 import SiteStoreS3
from src.Crawler import Crawler
from src.SiteReader import SiteReader
from src.SiteStoreS3 import SiteStoreS3
from src.Watcher import Watcher

if __name__ == "__main__":
@@ -2,5 +2,5 @@

This tool crawls all pages on a given website down to the provided depth and finds new pages by comparing the new site tree to the cached one. All new pages are then checked for any of the provided keywords; if there is a match, the page is highlighted for the reader.
The default timeout is 1 h; the list of keywords and sites can be changed while the software is running.

Only static content is retrieved; crawling client-side rendered content is not implemented.
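A minimal sketch of the comparison step described above (the data shapes and the helper name are assumptions for illustration, not the repository's actual interfaces):

# Hypothetical illustration of "compare the new site tree to the cached one, then check keywords".
from deepdiff import DeepDiff

def highlight_new_pages(previous_tree: dict, current_tree: dict, page_texts: dict, keywords: list) -> list:
    # previous_tree / current_tree: site URL -> list of discovered page links
    # page_texts: page URL -> fetched text of that page
    diff = DeepDiff(previous_tree, current_tree, ignore_order=True)
    added_links = diff["iterable_item_added"].values() if "iterable_item_added" in diff else []
    return [url for url in added_links
            if any(kw.lower() in page_texts.get(url, "").lower() for kw in keywords)]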
@@ -1,19 +0,0 @@
import json
import os
from typing import List, Optional


class SiteStore:
    def __init__(self):
        pass

    @staticmethod
    def get_site_history(cache_path) -> Optional[list[str]]:
        if not os.path.isdir(cache_path):
            return None
        return sorted(os.listdir(cache_path))

    @staticmethod
    def get_site_links(path):
        with open(path, 'r') as fp:
            return json.load(fp)
@@ -20,7 +20,7 @@ class SiteStoreS3:
        if "Contents" not in result:
            return None
        # return a sorted list of file names (keys); these are the creation dates. Drop the prefix
        # (len(cache_path)) and skip the first element, which is only the prefix itself.
        return sorted([x["Key"][len(cache_path):] for x in result["Contents"][1:]], reverse=True)
        return sorted([x["Key"][len(cache_path):] for x in result["Contents"]], reverse=True)

    def get_site_links(self, path):
        s3 = boto3.resource('s3')
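For reference, a rough sketch of how such a prefix listing behaves (the bucket name is taken from the constructor call elsewhere in this commit; the prefix value and the exact boto3 call are assumptions):

import boto3

s3 = boto3.client("s3")
result = s3.list_objects_v2(Bucket="optar-dev-cache", Prefix="www.example.com/")
if "Contents" not in result:
    history = None  # nothing has been cached for this site yet
else:
    # keys look like "<prefix><YYYY-MM-DD_HH-MM-SS>.json", so sorting the trimmed names
    # lexicographically also sorts them chronologically
    history = sorted((obj["Key"][len("www.example.com/"):] for obj in result["Contents"]), reverse=True)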
@@ -1,17 +1,13 @@
import time
from datetime import datetime
from typing import List, Dict, Optional
from typing import List, Dict
from deepdiff import DeepDiff

from optar.src.Crawler import Crawler
from optar.src.SiteReader import SiteReader
from optar.src.SiteStoreS3 import SiteStoreS3


class Watcher:
    # there should be type hints for site_store and site_reader, referencing interfaces which these implement, for better autocomplete and DX
    def __init__(self, site_store, site_reader, sites_source_path, keywords_source_path) -> None:
        self.site_store = SiteStoreS3("optar-dev-cache")
        self.site_reader = SiteReader()
        self.site_store = site_store
        self.site_reader = site_reader
        self.keywords_source_path = keywords_source_path
        self.sites_source_path = sites_source_path
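One way to add the interface hints that the comment asks for would be typing.Protocol; this is only a sketch, and the protocol names and method signatures are assumptions inferred from the calls visible in this diff:

from typing import Iterable, Optional, Protocol

class SiteStoreLike(Protocol):
    def get_site_history(self, path: str) -> Optional[list]: ...
    def get_site_links(self, path: str) -> dict: ...
    def persist(self, path: str, data: dict) -> None: ...

class SiteReaderLike(Protocol):
    def get_sites_content_static(self, urls: Iterable[str]): ...

# the constructor could then be annotated as
# def __init__(self, site_store: SiteStoreLike, site_reader: SiteReaderLike, ...) -> None: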
@@ -35,6 +31,8 @@ class Watcher:
        for site in sites:
            crawler.run(site)
            self.site_store.persist(f"{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json", crawler.get_nodes())
            # do NOT overload the target
            time.sleep(1)

        contents = [self.get_new_content(site) for site in sites]
        # TODO: improve handling of None
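A quick illustration of the snapshot key built above (example values only; the scheme-stripping line stands in for remove_protocol, which is not shown in this diff):

from datetime import datetime

site = "https://www.example.com"
host = site.split("://", 1)[1]  # stand-in for self.remove_protocol(site)
key = f"{host}/{datetime(2024, 6, 1, 12, 0, 0).strftime('%Y-%m-%d_%H-%M-%S')}.json"
# -> "www.example.com/2024-06-01_12-00-00.json": one JSON snapshot per crawl, named so that
#    lexicographic order equals chronological order (which the history listing relies on)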
@@ -62,10 +60,11 @@ class Watcher:

        if len(list_of_files) >= 2:
            prev_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-2]}")
            current_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}")
            news = DeepDiff(prev_version, current_version, ignore_order=True)
        else:
            news = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}")
            prev_version = {url: []}
            current_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}")
        news = DeepDiff(prev_version, current_version, ignore_order=True)

        if news:
            sites_contents = self.site_reader.get_sites_content_static(self.get_added_urls(news))
            return sites_contents
@@ -73,7 +72,7 @@ class Watcher:

    @staticmethod
    def get_added_urls(news):
        return [z.split("'")[1] for z in list(news["dictionary_item_added"])]
        return [z.split("'")[1] for z in list(news["iterable_item_added"])]

    @staticmethod
    def search_sites(url, content, keywords: List[str]):
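For context, a small example of the DeepDiff result that get_added_urls parses (the data is illustrative; the real site trees come from the crawler):

from deepdiff import DeepDiff

prev = {"https://example.com": ["https://example.com/a"]}
curr = {"https://example.com": ["https://example.com/a", "https://example.com/b"]}

news = DeepDiff(prev, curr, ignore_order=True)
# news is roughly: {'iterable_item_added': {"root['https://example.com'][1]": 'https://example.com/b'}}
# get_added_urls() iterates these path strings and splits on "'" to pull out the quoted URL.
print([z.split("'")[1] for z in list(news["iterable_item_added"])])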
@@ -0,0 +1,26 @@
import json
import os
from pathlib import Path
from typing import List, Optional


class SiteStore:
    def __init__(self):
        pass

    @staticmethod
    def get_site_history(in_path) -> Optional[list[str]]:
        cache_path = "./cache/" + in_path
        if not os.path.isdir(cache_path):
            return []
        return sorted(os.listdir(cache_path))

    @staticmethod
    def get_site_links(in_path):
        cache_path = "./cache/" + in_path
        with open(cache_path, 'r') as fp:
            return json.load(fp)

    @staticmethod
    def persist(path, data):
        return
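A short usage sketch of this mock against the cache layout the tests assume (the snapshot file name is an example in the spirit of the fixtures, not guaranteed contents of the repo):

# expected layout: ./cache/www.patricematz.de/2024-06-01_12-00-00.json  (one JSON snapshot per crawl)
history = SiteStore.get_site_history("www.patricematz.de")              # e.g. ["2024-06-01_12-00-00.json", ...]
links = SiteStore.get_site_links(f"www.patricematz.de/{history[-1]}")   # parsed JSON of the newest snapshot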
@@ -1,6 +1,7 @@
import os
from optar.src.SiteReader import SiteReader
from optar.src.Watcher import Watcher
from optar.src.SiteStore import SiteStore
from optar.tests.MockSiteStore import SiteStore

def test_search_sites__found():
@@ -31,7 +32,8 @@ def test_compare_sites():
            self._links[url] = [url]
        def get_nodes(self):
            return self._links

    assert os.path.isdir("./cache/www.patricematz.de")
    assert len(SiteStore.get_site_history("www.patricematz.de")) >= 2
    # the links given in this sites.txt should point to either local files or a local mock server;
    # this is not implemented here, as it would be trivial but time-consuming
    watcher = Watcher(SiteStore(), SiteReader(), "./sites.txt", "keywords.txt")
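A possible shape for the local mock server mentioned in the comment above (entirely hypothetical; nothing like this exists in the repository yet):

# serve a directory of static HTML fixtures so sites.txt can point at http://127.0.0.1:8000/...
import threading
from functools import partial
from http.server import HTTPServer, SimpleHTTPRequestHandler

def start_fixture_server(directory="tests/fixtures", port=8000):
    handler = partial(SimpleHTTPRequestHandler, directory=directory)
    server = HTTPServer(("127.0.0.1", port), handler)
    threading.Thread(target=server.serve_forever, daemon=True).start()
    return server  # call server.shutdown() in test teardown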