From 7202c0fe3eacaf7ddd58fccc5182c70180043c9a Mon Sep 17 00:00:00 2001
From: Askill
Date: Sun, 16 Oct 2022 14:57:55 +0200
Subject: [PATCH] working, but duplicate results

---
 src/SiteReader.py            |  4 +--
 src/Watcher.py               | 30 +++++++++++--------
 .../2022-10-15_15-40-54.json | 25 ----------------
 3 files changed, 20 insertions(+), 39 deletions(-)
 delete mode 100644 src/cache/www.patricematz.de/2022-10-15_15-40-54.json

diff --git a/src/SiteReader.py b/src/SiteReader.py
index 6af81bb..24f03dc 100644
--- a/src/SiteReader.py
+++ b/src/SiteReader.py
@@ -51,10 +51,10 @@ class SiteReader:
         downloaded_url = trafilatura.fetch_url(url)
 
         try:
-            a = trafilatura.extract(downloaded_url, json_output=True, with_metadata=True, include_comments=False,
+            a = trafilatura.extract(downloaded_url, output_format="json", with_metadata=True, include_comments=False,
                                     date_extraction_params={'extensive_search': True, 'original_date': True})
         except AttributeError:
-            a = trafilatura.extract(downloaded_url, json_output=True, with_metadata=True,
+            a = trafilatura.extract(downloaded_url, output_format="json", with_metadata=True,
                                     date_extraction_params={'extensive_search': True, 'original_date': True})
         if a:
             json_output = json.loads(a)
diff --git a/src/Watcher.py b/src/Watcher.py
index 4b0f7fc..69690a9 100644
--- a/src/Watcher.py
+++ b/src/Watcher.py
@@ -1,6 +1,7 @@
 import time
 from datetime import datetime
 from typing import List, Dict, Optional
+from deepdiff import DeepDiff
 
 from src.Crawler import Crawler
 from src.SiteReader import SiteReader
@@ -31,10 +32,11 @@ class Watcher:
                 crawler.persist(f"./cache/{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json")
 
             contents = [self.get_new_content(site) for site in crawled_sites]
-            contents = [x for x in contents if x is not None]
+            contents = [x for x in contents if x is not None and x is not {}]
 
             matches = []
-            for url, content in contents.items():
-                matches.append(self.search_sites(url, content, keywords))
+            for content in contents:
+                for url, c in content.items():
+                    matches.append(self.search_sites(url, c, keywords))
             print(matches)
             time.sleep(sleep)
@@ -42,21 +44,25 @@ def remove_protocol(site):
         return site.split('/')[2]
 
-    def get_new_content(self, url) -> Optional[List[str]]:
+    def get_new_content(self, url) -> Dict[str, str]:
         """
         get all past iterations of a site by the fully qualified domain name
         """
         list_of_files = self.site_store.get_site_history(f"./cache/{self.remove_protocol(url)}/")
-        if not len(list_of_files) >= 2:
-            return None
-        prev_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-2]}")
-        current_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")
-        news = dict(set(prev_version.items()) ^ set(current_version.items()))
-        sites_contents = self.site_reader.get_sites_content_static(sum(news.items()))
+
+        if len(list_of_files) >= 2:
+            prev_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-2]}")
+            current_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")
+            news = DeepDiff(prev_version, current_version, ignore_order=True)
+        else:
+            news = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")
+
+        sites_contents = self.site_reader.get_sites_content_static(list(news.keys()))
         return sites_contents
 
-    def search_sites(self, url, content, keywords: List[str]):
+    @staticmethod
+    def search_sites(url, content, keywords: List[str]):
         results = []
         for keyword in keywords:
-            if keyword in content.values():
+            if keyword in content:
                 results.append((url, keyword))
         return results
diff --git a/src/cache/www.patricematz.de/2022-10-15_15-40-54.json b/src/cache/www.patricematz.de/2022-10-15_15-40-54.json
deleted file mode 100644
index 82c72e4..0000000
--- a/src/cache/www.patricematz.de/2022-10-15_15-40-54.json
+++ /dev/null
@@ -1,25 +0,0 @@
-{
-    "https://www.patricematz.de/": [
-        "https://www.patricematz.de/",
-        "https://www.linkedin.com/in/patrice-matz-b73b6814a/",
-        "https://github.com/Askill",
-        "https://www.patricematz.de/images/praktikum.pdf",
-        "https://www.patricematz.de/images/bachelor.pdf",
-        "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf",
-        "https://irs.projects.patricematz.de",
-        "https://github.com/Askill/Inverse-Rezeptsuche",
-        "https://irs.projects.patricematz.de/",
-        "https://github.com/Askill/Video-Synopsis",
-        "https://github.com/Askill/UI",
-        "https://github.com/Askill/Photo-Wall",
-        "https://www.patricematz.de/photowall/demo/",
-        "https://github.com/Askill/Flask-URL-Checker",
-        "https://patricematz.de/starmapper.htm"
-    ],
-    "https://www.patricematz.de/photowall/demo/": [
-        "javascript:void(0)"
-    ],
-    "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [],
-    "https://www.patricematz.de/images/bachelor.pdf": [],
-    "https://www.patricematz.de/images/praktikum.pdf": []
-}
\ No newline at end of file