optar/src/Watcher.py

import time
from datetime import datetime
from typing import List, Dict, Optional

from src.Crawler import Crawler
from src.SiteReader import SiteReader
from src.SiteStore import SiteStore


class Watcher:
    """Periodically crawl configured sites and report keyword hits in changed content.

    Each cycle re-reads the site and keyword files, crawls and persists a
    snapshot per site under ``./cache/<host>/``, diffs the two newest
    snapshots of every site, fetches the content behind the changed links and
    prints the ``(url, keyword)`` pairs that were found.
    """

    # Depth argument handed to ``Crawler.run`` for every configured site.
    CRAWL_DEPTH = 10
    # Pause between two watch cycles, in seconds.
    POLL_INTERVAL_SECONDS = 3600

    def __init__(self, sites_source_path, keywords_source_path) -> None:
        """Create the store/reader helpers and remember the input file paths.

        Args:
            sites_source_path: Path of a text file holding one site per line.
            keywords_source_path: Path of a text file holding one keyword per line.
        """
        self.site_store = SiteStore()
        self.site_reader = SiteReader()
        self.keywords_source_path = keywords_source_path
        self.sites_source_path = sites_source_path

    def read_txt_file(self, path) -> List[str]:
        """Return the lines of the text file at ``path`` without line endings."""
        with open(path) as f:
            return f.read().splitlines()

    def _read_entries(self, path) -> List[str]:
        """Return the stripped, non-blank lines of the text file at ``path``."""
        # A blank keyword is a substring of every text and a blank site has no
        # host, so empty lines must never reach the crawl or the search.
        stripped = (line.strip() for line in self.read_txt_file(path))
        return [entry for entry in stripped if entry]

    def watch(self):
        """Run the crawl / diff / search cycle forever; never returns.

        The site and keyword files are re-read at the start of every cycle.
        The matches of a cycle are printed as a flat list of
        ``(url, keyword)`` tuples before sleeping ``POLL_INTERVAL_SECONDS``.
        """
        while True:
            keywords = self._read_entries(self.keywords_source_path)
            sites = self._read_entries(self.sites_source_path)

            crawler = Crawler()
            for site in sites:
                crawler.run(site, self.CRAWL_DEPTH)
                timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
                crawler.persist(f"./cache/{self.remove_protocol(site)}/{timestamp}.json")

            # Snapshots are stored per configured site (keyed by its host), so
            # one diff per site is enough; diffing per crawled node would only
            # repeat the same host-level comparison.
            matches = []
            for site in sites:
                new_content = self.get_new_content(site)
                if new_content is None:
                    continue
                # NOTE(review): assumes the reader returns a mapping of
                # url -> content — confirm against SiteReader.
                for url, content in new_content.items():
                    matches.extend(self.search_sites(url, content, keywords))
            print(matches)
            time.sleep(self.POLL_INTERVAL_SECONDS)

    @staticmethod
    def remove_protocol(site):
        """Return the host part of ``site``, i.e. the text up to the first path slash.

        A leading ``scheme://`` is dropped when present; a site written without
        a scheme (``example.com/page``) yields its first segment instead of
        raising ``IndexError``.
        """
        _, separator, remainder = site.partition('//')
        without_scheme = remainder if separator else site
        return without_scheme.split('/', 1)[0]

    def get_new_content(self, url) -> Optional[Dict]:
        """Fetch the content behind links that differ between the two newest snapshots.

        Args:
            url: Site whose host selects the ``./cache/<host>/`` directory.

        Returns:
            ``None`` when fewer than two snapshots exist; otherwise whatever
            ``SiteReader.get_sites_content_static`` returns for the changed
            links (presumably a mapping of url -> content; verify against
            SiteReader).
        """
        cache_dir = f"./cache/{self.remove_protocol(url)}/"
        # NOTE(review): relies on the store listing snapshots oldest-first — confirm.
        list_of_files = self.site_store.get_site_history(cache_dir)
        if len(list_of_files) < 2:
            return None
        prev_version = self.site_store.get_site_links(f"{cache_dir}{list_of_files[-2]}")
        # The newest snapshot is the last entry; comparing [-2] with itself
        # would always produce an empty diff.
        current_version = self.site_store.get_site_links(f"{cache_dir}{list_of_files[-1]}")
        news = dict(set(prev_version.items()) ^ set(current_version.items()))
        # Flatten the changed (key, value) pairs into one de-duplicated list.
        # NOTE(review): assumes both sides of a pair are links the reader can
        # fetch — confirm against SiteStore.get_site_links.
        changed_links = list(dict.fromkeys(link for pair in news.items() for link in pair))
        return self.site_reader.get_sites_content_static(changed_links)

    def search_sites(self, url, content, keywords: List[str]):
        """Return a ``(url, keyword)`` tuple for every keyword occurring in ``content``.

        Args:
            url: Address reported alongside each hit.
            content: Page text, a dict whose string values are searched, or a
                list/tuple/set of strings.
            keywords: Keywords to look for; empty keywords are ignored.

        Returns:
            List of ``(url, keyword)`` tuples in keyword order.
        """
        if isinstance(content, str):
            texts = [content]
        elif isinstance(content, dict):
            texts = [value for value in content.values() if isinstance(value, str)]
        elif isinstance(content, (list, tuple, set)):
            texts = [value for value in content if isinstance(value, str)]
        else:
            texts = []
        return [
            (url, keyword)
            for keyword in keywords
            if keyword and any(keyword in text for text in texts)
        ]
wip 2022-10-15 13:38:58 +00:00			`import time`
			`from datetime import datetime`
			`from typing import List, Dict, Optional`
started optar 2022-10-14 21:04:13 +00:00
wip 2022-10-15 13:38:58 +00:00			`from src.Crawler import Crawler`
started optar 2022-10-14 21:04:13 +00:00			`from src.SiteReader import SiteReader`
			`from src.SiteStore import SiteStore`


			`class Watcher:`
wip 2022-10-15 13:38:58 +00:00			`def __init__(self, sites_source_path, keywords_source_path) -> None:`
started optar 2022-10-14 21:04:13 +00:00			`self.site_store = SiteStore()`
			`self.site_reader = SiteReader()`
wip 2022-10-15 13:38:58 +00:00			`self.keywords_source_path = keywords_source_path`
			`self.sites_source_path = sites_source_path`
started optar 2022-10-14 21:04:13 +00:00
			`def read_txt_file(self, path):`
			`with open(path) as f:`
			`return f.read().splitlines()`

			`def watch(self):`
			`while True:`
			`keywords = self.read_txt_file(self.keywords_source_path)`
			`sites = self.read_txt_file(self.sites_source_path)`

wip 2022-10-15 13:38:58 +00:00			`crawler = Crawler()`
			`crawled_sites = []`
			`for site in sites:`
			`crawler.run(site, 10)`
			`crawled_sites += crawler.get_nodes()`
			`crawler.persist(f"./cache/{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json")`

			`contents = [self.get_new_content(site) for site in crawled_sites]`
			`contents = [x for x in contents if x is not None]`
started optar 2022-10-14 21:04:13 +00:00			`matches = []`
			`for url, content in contents.items():`
			`matches.append(self.search_sites(url, content, keywords))`
			`print(matches)`
wip 2022-10-15 13:38:58 +00:00			`time.sleep(3600)`

			`@staticmethod`
			`def remove_protocol(site):`
			`return site.split('/')[2]`

			`def get_new_content(self, url) -> Optional[List[str]]:`
started optar 2022-10-14 21:04:13 +00:00			`""" get all past iterations of a site by the fully qualified domain name """`
wip 2022-10-15 13:38:58 +00:00			`list_of_files = self.site_store.get_site_history(f"./cache/{self.remove_protocol(url)}/")`
			`if not len(list_of_files) >= 2:`
			`return None`
			`prev_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-2]}")`
			`current_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-2]}")`
started optar 2022-10-14 21:04:13 +00:00			`news = dict(set(prev_version.items()) ^ set(current_version.items()))`
			`sites_contents = self.site_reader.get_sites_content_static(sum(news.items()))`

			`return sites_contents`

			`def search_sites(self, url, content, keywords: List[str]):`
			`results = []`
			`for keyword in keywords:`
			`if keyword in content.values():`
			`results.append((url, keyword))`
wip 2022-10-15 13:38:58 +00:00			`return results`