mirror of https://github.com/Askill/optar.git
46 lines
1.6 KiB
Python
46 lines
1.6 KiB
Python
|
|
|
||
|
|
from typing import List, Dict
|
||
|
|
|
||
|
|
from src.SiteReader import SiteReader
|
||
|
|
from src.SiteStore import SiteStore
|
||
|
|
|
||
|
|
|
||
|
|
class Watcher:
    """Report which keywords occur in the changed content of watched sites.

    The list of sites and the list of keywords are read from two plain-text
    files (one entry per line) each time a pass is made, so edits to those
    files are picked up without restarting.
    """

    def __init__(self, sites_source_path: str = "", keywords_source_path: str = "") -> None:
        """Create the store/reader collaborators and remember the input paths.

        Both paths default to the empty string; they can also be assigned on
        the instance after construction.
        """
        self.site_store = SiteStore()
        self.site_reader = SiteReader()
        self.keywords_source_path = keywords_source_path
        self.sites_source_path = sites_source_path

    def read_txt_file(self, path) -> List[str]:
        """Return the lines of the text file at ``path`` without line endings.

        Blank lines are preserved; the file is decoded as UTF-8 rather than
        with the platform default so results do not vary between machines.
        """
        with open(path, encoding="utf-8") as f:
            return f.read().splitlines()

    def watch(self):
        """Loop forever, printing the matches found on every pass.

        NOTE(review): there is no delay between passes, so this loop runs
        back-to-back; confirm whether a pause or scheduler is intended.
        """
        while True:
            print(self._collect_matches())

    def _collect_matches(self) -> list:
        """Run one pass: read both input files and search every site once.

        Returns one list of ``(site, keyword)`` tuples per distinct site, in
        the order the sites appear in the sites file.
        """
        # Blank lines would otherwise become an empty site / empty keyword.
        keywords = [
            line
            for line in self.read_txt_file(self.keywords_source_path)
            if line.strip()
        ]
        sites = [
            line
            for line in self.read_txt_file(self.sites_source_path)
            if line.strip()
        ]

        # A mapping (not a list) is required because it is iterated as
        # (site, content) pairs below.
        contents = {site: self.get_new_content(site) for site in sites}
        return [
            self.search_sites(url, content, keywords)
            for url, content in contents.items()
        ]

    def get_new_content(self, fqdm) -> Dict:
        """Fetch content for the entries that differ between the two latest versions of a site.

        ``fqdm`` is passed unchanged to ``site_store.get_site_history``. The
        last two elements of the returned history are compared as mappings:
        an entry counts as a difference when its key exists in only one
        version or its value changed (the current value wins in that case).

        Returns whatever ``site_reader.get_sites_content_static`` returns, or
        an empty dict when there is no history or nothing differs. With a
        single stored version, every entry of that version is treated as new.
        """
        history = self.site_store.get_site_history(fqdm)
        if not history:
            return {}

        current_version = history[-1]
        prev_version = history[-2] if len(history) > 1 else {}

        # Same key set as a symmetric difference of the item sets, but
        # deterministic for changed keys and without requiring hashable values.
        news = {
            key: value
            for key, value in prev_version.items()
            if key not in current_version
        }
        news.update(
            (key, value)
            for key, value in current_version.items()
            if key not in prev_version or prev_version[key] != value
        )
        if not news:
            return {}

        # The original summed the (key, value) tuples without a start value,
        # which always raised TypeError; flatten the pairs explicitly instead.
        # NOTE(review): confirm get_sites_content_static wants keys AND values
        # interleaved rather than only the keys.
        flattened = [part for pair in news.items() for part in pair]
        return self.site_reader.get_sites_content_static(flattened)

    def search_sites(self, url, content, keywords: List[str]):
        """Return ``(url, keyword)`` for every keyword found in ``content``.

        ``content`` is a mapping whose values are searched. A keyword matches
        when it equals a value or, for string values, occurs inside it.
        Empty or ``None`` content yields an empty list.
        """
        if not content:
            return []
        values = list(content.values())
        return [
            (url, keyword)
            for keyword in keywords
            if any(self._value_matches(keyword, value) for value in values)
        ]

    @staticmethod
    def _value_matches(keyword, value) -> bool:
        """Tell whether ``keyword`` equals ``value`` or is a non-empty substring of it."""
        if value == keyword:
            return True
        # An empty keyword is a substring of everything; never report it.
        return bool(keyword) and isinstance(value, str) and keyword in value
|