2022-10-15 13:38:58 +00:00
|
|
|
import time
|
|
|
|
|
from datetime import datetime
|
|
|
|
|
from typing import List, Dict, Optional
|
2022-10-16 12:57:55 +00:00
|
|
|
from deepdiff import DeepDiff
|
2022-10-14 21:04:13 +00:00
|
|
|
|
2022-10-15 13:38:58 +00:00
|
|
|
from src.Crawler import Crawler
|
2022-10-14 21:04:13 +00:00
|
|
|
from src.SiteReader import SiteReader
|
|
|
|
|
from src.SiteStore import SiteStore
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Watcher:
    """Periodically crawl a configured list of sites and print keyword matches
    found in content that is new since the previous crawl."""

    def __init__(self, sites_source_path, keywords_source_path) -> None:
        # Paths to plain-text files holding one site URL / one keyword per line.
        # They are re-read on every watch iteration, so they can be edited live.
        self.site_store = SiteStore()
        self.site_reader = SiteReader()
        self.keywords_source_path = keywords_source_path
        self.sites_source_path = sites_source_path

    def read_txt_file(self, path):
        """Return the lines of the text file at *path* as a list, without newlines."""
        with open(path) as f:
            return f.read().splitlines()

    def watch(self, sleep):
        """Run the watch loop forever.

        Each iteration: re-read the keyword and site lists, crawl and snapshot
        every site, diff against the previous snapshot, search the new content
        for keywords, print the matches, then sleep *sleep* seconds.
        """
        while True:
            keywords = self.read_txt_file(self.keywords_source_path)
            sites = self.read_txt_file(self.sites_source_path)

            for site in sites:
                crawler = Crawler()
                crawler.run(site, 1)
                # Snapshot the crawl under ./cache/<host>/<timestamp>.json so
                # get_new_content() can diff consecutive versions later.
                crawler.persist(f"./cache/{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json")

            # TODO: improve handling of None
            contents = [self.get_new_content(site) for site in sites]
            # Drop sites that produced no new content.  BUGFIX: the original
            # `x is not {}` compared identity against a fresh dict literal and
            # was therefore always True, so empty dicts slipped through; a
            # plain truthiness test filters both None and {} as intended.
            contents = [x for x in contents if x]

            matches = []
            for content in contents:
                for url, c in content.items():
                    # Note: appends one result-list per URL, so `matches` is a
                    # list of lists (kept as-is; only used for printing here).
                    matches.append(self.search_sites(url, c, keywords))

            print(matches)
            time.sleep(sleep)

    @staticmethod
    def remove_protocol(site):
        """Return the host part of a URL, e.g. 'https://a.b/c' -> 'a.b'.

        Assumes *site* has the form '<scheme>://<host>[/...]'; a bare host
        without a protocol raises IndexError.
        """
        return site.split('/')[2]

    def get_new_content(self, url) -> Dict[str, str]:
        """Return the fetched content of links that are new since the previous crawl of *url*.

        Looks up all cached snapshots for the site's host.  With two or more
        snapshots, the two most recent link sets are compared with DeepDiff;
        with a single snapshot, every link in it counts as new.
        """
        # get all past iterations of a site by the fully qualified domain name
        list_of_files = self.site_store.get_site_history(f"./cache/{self.remove_protocol(url)}/")

        if len(list_of_files) >= 2:
            prev_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-2]}")
            current_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")
            news = DeepDiff(prev_version, current_version, ignore_order=True)
        else:
            news = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")

        # NOTE(review): `news.keys()` mixes two shapes — DeepDiff yields change
        # categories while get_site_links presumably yields URLs; assumes the
        # reader copes with both. TODO confirm against SiteReader.
        sites_contents = self.site_reader.get_sites_content_static(list(news.keys()))

        return sites_contents

    @staticmethod
    def search_sites(url, content, keywords: List[str]):
        """Return [(url, keyword), ...] for every keyword contained in *content*.

        *content* may be None (site unreadable), in which case the result is [].
        """
        if content is None:
            return []
        results = []
        for keyword in keywords:
            if keyword in content:
                results.append((url, keyword))
        return results
|