optar/src/Watcher.py

81 lines
2.9 KiB
Python
Raw Normal View History

2022-10-15 13:38:58 +00:00
import time
from datetime import datetime
from typing import List, Dict, Optional
2022-10-16 12:57:55 +00:00
from deepdiff import DeepDiff
2022-10-14 21:04:13 +00:00
2022-10-15 13:38:58 +00:00
from src.Crawler import Crawler
2022-10-14 21:04:13 +00:00
from src.SiteReader import SiteReader
2024-07-15 14:12:40 +00:00
from src.SiteStoreS3 import SiteStoreS3
2022-10-14 21:04:13 +00:00
class Watcher:
    """Periodically crawls a list of sites and searches newly discovered
    pages for a list of keywords.

    Each run persists a timestamped snapshot of a site's links through the
    site store, so consecutive runs can be diffed (via DeepDiff) to find
    newly added links; the content behind those links is then fetched and
    searched for the configured keywords.
    """

    def __init__(self, sites_source_path, keywords_source_path) -> None:
        """
        :param sites_source_path: path to a text file, one site URL per line
        :param keywords_source_path: path to a text file, one keyword per line
        """
        self.site_store = SiteStoreS3("optar-dev-cache")
        self.site_reader = SiteReader()
        self.keywords_source_path = keywords_source_path
        self.sites_source_path = sites_source_path

    def read_txt_file(self, path):
        """Return the lines of the text file at *path* as a list of strings."""
        with open(path) as f:
            return f.read().splitlines()

    def watch(self, sleep=-1):
        """Start the watcher with the given interval.

        :param sleep: seconds to wait between runs, -1 for a single run
        :type sleep: int
        :return: None
        :rtype: None
        """
        while True:
            keywords = self.read_txt_file(self.keywords_source_path)
            sites = self.read_txt_file(self.sites_source_path)

            # Crawl every site and persist a timestamped snapshot of its links.
            for site in sites:
                crawler = Crawler()
                crawler.run(site, 1)
                self.site_store.persist(
                    f"{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json",
                    crawler.get_nodes(),
                )

            contents = [self.get_new_content(site) for site in sites]
            # BUG FIX: the original filter used `x is not {}`, an identity
            # comparison against a fresh dict literal that is always True,
            # so empty dicts were never dropped. Truthiness filtering drops
            # both None and empty mappings.
            contents = [x for x in contents if x]

            matches = []
            for content in contents:
                for url, c in content.items():
                    # extend (not append) so `matches` is a flat list of
                    # (url, keyword) tuples instead of a list of lists
                    matches.extend(self.search_sites(url, c, keywords))
            print(matches)

            if sleep == -1:
                return
            time.sleep(sleep)

    @staticmethod
    def remove_protocol(site):
        """Return the host part of a URL, e.g. 'https://a.com/x' -> 'a.com'."""
        return site.split('/')[2]

    def get_new_content(self, url) -> Optional[Dict[str, str]]:
        """Fetch the content of pages that are new since the previous crawl.

        Diffs the two most recent stored snapshots of *url*; on the very
        first crawl every stored link counts as new.

        :param url: full site URL (with protocol)
        :return: mapping of page URL -> page content, or None when no
            snapshot exists yet or nothing new was found
        """
        host = self.remove_protocol(url)
        list_of_files = self.site_store.get_site_history(f"{host}/")
        if not list_of_files:
            # BUG FIX: original indexed list_of_files[-1] unconditionally
            # and raised IndexError when no snapshot had been persisted yet.
            return None

        if len(list_of_files) >= 2:
            prev_version = self.site_store.get_site_links(f"{host}/{list_of_files[-2]}")
            current_version = self.site_store.get_site_links(f"{host}/{list_of_files[-1]}")
            diff = DeepDiff(prev_version, current_version, ignore_order=True)
            # DeepDiff reports added keys as strings like "root['<link>']";
            # split on the quote to extract <link>. `.get` guards against
            # diffs with no added items (the original raised KeyError there).
            new_links = [entry.split("'")[1] for entry in diff.get("dictionary_item_added", [])]
        else:
            # First crawl: every stored link is new.
            # NOTE(review): assumes get_site_links returns a mapping keyed by
            # link URL — the original code crashed on this branch with
            # KeyError("dictionary_item_added"); verify against SiteStoreS3.
            links = self.site_store.get_site_links(f"{host}/{list_of_files[-1]}")
            new_links = list(links) if links else []

        if not new_links:
            return None
        return self.site_reader.get_sites_content_static(new_links)

    @staticmethod
    def search_sites(url, content, keywords: List[str]):
        """Return (url, keyword) tuples for every keyword found in *content*.

        :param url: URL the content came from
        :param content: page text to search; None yields no matches
        :param keywords: keywords to look for
        :return: list of (url, keyword) tuples, in keyword order
        """
        if content is None:
            return []
        results = []
        for keyword in keywords:
            if keyword in content:
                results.append((url, keyword))
        return results