optar/src/Watcher.py

81 lines
2.9 KiB
Python
Raw Normal View History

2022-10-15 13:38:58 +00:00
import time
from datetime import datetime
from typing import List, Dict, Optional
2022-10-16 12:57:55 +00:00
from deepdiff import DeepDiff
2022-10-14 21:04:13 +00:00
2022-10-15 13:38:58 +00:00
from src.Crawler import Crawler
2022-10-14 21:04:13 +00:00
from src.SiteReader import SiteReader
2024-07-15 14:12:40 +00:00
from src.SiteStoreS3 import SiteStoreS3
2022-10-14 21:04:13 +00:00
class Watcher:
    """Periodically crawls a list of sites and searches newly discovered
    pages for a list of keywords.

    Each run persists a timestamped snapshot of a site's links through the
    site store, so consecutive runs can be diffed (via DeepDiff) to find
    newly added links; the content behind those links is then fetched and
    searched for the configured keywords.
    """

    def __init__(self, sites_source_path, keywords_source_path) -> None:
        """
        :param sites_source_path: path to a text file, one site URL per line
        :param keywords_source_path: path to a text file, one keyword per line
        """
        self.site_store = SiteStoreS3("optar-dev-cache")
        self.site_reader = SiteReader()
        self.keywords_source_path = keywords_source_path
        self.sites_source_path = sites_source_path

    def read_txt_file(self, path):
        """Return the lines of the text file at *path* as a list of strings."""
        with open(path) as f:
            return f.read().splitlines()

    def watch(self, sleep=-1):
        """Start the watcher with the given interval.

        :param sleep: seconds to wait between runs, -1 for a single run
        :type sleep: int
        :return: None
        :rtype: None
        """
        while True:
            keywords = self.read_txt_file(self.keywords_source_path)
            sites = self.read_txt_file(self.sites_source_path)

            # Crawl every site and persist a timestamped snapshot of its links.
            for site in sites:
                crawler = Crawler()
                crawler.run(site, 1)
                self.site_store.persist(
                    f"{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json",
                    crawler.get_nodes(),
                )

            contents = [self.get_new_content(site) for site in sites]
            # BUG FIX: the original filter used `x is not {}`, an identity
            # comparison against a fresh dict literal that is always True,
            # so empty dicts were never dropped. Truthiness filtering drops
            # both None and empty mappings.
            contents = [x for x in contents if x]

            matches = []
            for content in contents:
                for url, c in content.items():
                    # extend (not append) so `matches` is a flat list of
                    # (url, keyword) tuples instead of a list of lists
                    matches.extend(self.search_sites(url, c, keywords))
            print(matches)

            if sleep == -1:
                return
            time.sleep(sleep)

    @staticmethod
    def remove_protocol(site):
        """Return the host part of a URL, e.g. 'https://a.com/x' -> 'a.com'."""
        return site.split('/')[2]

    def get_new_content(self, url) -> Optional[Dict[str, str]]:
        """Fetch the content of pages that are new since the previous crawl.

        Diffs the two most recent stored snapshots of *url*; on the very
        first crawl every stored link counts as new.

        :param url: full site URL (with protocol)
        :return: mapping of page URL -> page content, or None when no
            snapshot exists yet or nothing new was found
        """
        host = self.remove_protocol(url)
        list_of_files = self.site_store.get_site_history(f"{host}/")
        if not list_of_files:
            # BUG FIX: original indexed list_of_files[-1] unconditionally
            # and raised IndexError when no snapshot had been persisted yet.
            return None

        if len(list_of_files) >= 2:
            prev_version = self.site_store.get_site_links(f"{host}/{list_of_files[-2]}")
            current_version = self.site_store.get_site_links(f"{host}/{list_of_files[-1]}")
            diff = DeepDiff(prev_version, current_version, ignore_order=True)
            # DeepDiff reports added keys as strings like "root['<link>']";
            # split on the quote to extract <link>. `.get` guards against
            # diffs with no added items (the original raised KeyError there).
            new_links = [entry.split("'")[1] for entry in diff.get("dictionary_item_added", [])]
        else:
            # First crawl: every stored link is new.
            # NOTE(review): assumes get_site_links returns a mapping keyed by
            # link URL — the original code crashed on this branch with
            # KeyError("dictionary_item_added"); verify against SiteStoreS3.
            links = self.site_store.get_site_links(f"{host}/{list_of_files[-1]}")
            new_links = list(links) if links else []

        if not new_links:
            return None
        return self.site_reader.get_sites_content_static(new_links)

    @staticmethod
    def search_sites(url, content, keywords: List[str]):
        """Return (url, keyword) tuples for every keyword found in *content*.

        :param url: URL the content came from
        :param content: page text to search; None yields no matches
        :param keywords: keywords to look for
        :return: list of (url, keyword) tuples, in keyword order
        """
        if content is None:
            return []
        results = []
        for keyword in keywords:
            if keyword in content:
                results.append((url, keyword))
        return results