optar/src/Watcher.py

import time
from datetime import datetime
from typing import List, Dict, Optional
from deepdiff import DeepDiff

from src.Crawler import Crawler
from src.SiteReader import SiteReader
from src.SiteStore import SiteStore


class Watcher:
    def __init__(self, sites_source_path, keywords_source_path) -> None:
        self.site_store = SiteStore()
        self.site_reader = SiteReader()
        self.keywords_source_path = keywords_source_path
        self.sites_source_path = sites_source_path

    def read_txt_file(self, path):
        with open(path) as f:
            return f.read().splitlines()

    def watch(self, sleep):
        while True:
            keywords = self.read_txt_file(self.keywords_source_path)
            sites = self.read_txt_file(self.sites_source_path)


            for site in sites:
                crawler = Crawler()
                crawler.run(site, 1)
                crawler.persist(f"./cache/{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json")

            contents = [self.get_new_content(site) for site in sites]
            # TODO: improve handleing of None
            contents = [x for x in contents if x is not None and x is not {}]
            matches = []
            for content in contents:
                for url, c in content.items():
                    matches.append(self.search_sites(url, c, keywords))
            print(matches)
            time.sleep(sleep)

    @staticmethod
    def remove_protocol(site):
        return site.split('/')[2]

    def get_new_content(self, url) -> Dict[str, str]:
        """ get all past iterations of a site by the fully qualified domain name """
        list_of_files = self.site_store.get_site_history(f"./cache/{self.remove_protocol(url)}/")

        if len(list_of_files) >= 2:
            prev_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-2]}")
            current_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")
            news = DeepDiff(prev_version, current_version, ignore_order=True)
        else:
            news = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")

        sites_contents = self.site_reader.get_sites_content_static(list(news.keys()))

        return sites_contents

    @staticmethod
    def search_sites(url, content, keywords: List[str]):
        if content is None:
            return []
        results = []
        for keyword in keywords:
            if keyword in content:
                results.append((url, keyword))
        return results