working, but duplicate results

Askill 2022-10-16 14:57:55 +02:00
parent 1ec5f320e9
commit 7202c0fe3e
3 changed files with 20 additions and 39 deletions

--- a/src/SiteReader.py
+++ b/src/SiteReader.py
@@ -51,10 +51,10 @@ class SiteReader:
         downloaded_url = trafilatura.fetch_url(url)
         try:
-            a = trafilatura.extract(downloaded_url, json_output=True, with_metadata=True, include_comments=False,
+            a = trafilatura.extract(downloaded_url, output_format="json", with_metadata=True, include_comments=False,
                                     date_extraction_params={'extensive_search': True, 'original_date': True})
         except AttributeError:
-            a = trafilatura.extract(downloaded_url, json_output=True, with_metadata=True,
+            a = trafilatura.extract(downloaded_url, output_format="json", with_metadata=True,
                                     date_extraction_params={'extensive_search': True, 'original_date': True})
         if a:
             json_output = json.loads(a)
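
The only change here swaps trafilatura's older json_output=True flag for output_format="json"; the except AttributeError fallback simply retries without include_comments=False. A self-contained sketch of the updated call, with extract_page as a hypothetical wrapper around the committed kwargs:

    import json
    import trafilatura

    def extract_page(url):
        """Fetch a page and return trafilatura's JSON output as a dict (hypothetical helper)."""
        downloaded = trafilatura.fetch_url(url)  # HTML string, or None on fetch failure
        if downloaded is None:
            return None
        raw = trafilatura.extract(
            downloaded,
            output_format="json",                # replaces the older json_output=True
            with_metadata=True,
            include_comments=False,
            date_extraction_params={"extensive_search": True, "original_date": True},
        )
        return json.loads(raw) if raw else None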

--- a/src/Watcher.py
+++ b/src/Watcher.py
@@ -1,6 +1,7 @@
 import time
 from datetime import datetime
 from typing import List, Dict, Optional
+from deepdiff import DeepDiff
 from src.Crawler import Crawler
 from src.SiteReader import SiteReader
@@ -31,10 +32,11 @@ class Watcher:
                 crawler.persist(f"./cache/{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json")
             contents = [self.get_new_content(site) for site in crawled_sites]
-            contents = [x for x in contents if x is not None]
+            contents = [x for x in contents if x is not None and x is not {}]
             matches = []
-            for url, content in contents.items():
-                matches.append(self.search_sites(url, content, keywords))
+            for content in contents:
+                for url, c in content.items():
+                    matches.append(self.search_sites(url, c, keywords))
             print(matches)
             time.sleep(sleep)
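
get_new_content now returns a url -> text mapping per site, hence the nested loop over dict items. One caveat in the committed filter: `x is not {}` is an identity comparison against a brand-new literal and is therefore always true, so empty dicts still pass through. A minimal sketch (sample data hypothetical):

    # Shape implied by the new loop: one url -> text dict per crawled site.
    contents = [{"https://example.org/a": "page text"}, {}, None]

    # The committed filter: identity against a fresh {} literal never matches,
    # so the empty dict survives.
    kept = [x for x in contents if x is not None and x is not {}]
    print(kept)                         # [{'https://example.org/a': 'page text'}, {}]

    # A truthiness test drops both None and empty dicts.
    print([x for x in contents if x])   # [{'https://example.org/a': 'page text'}]
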
@@ -42,21 +44,25 @@ class Watcher:
     def remove_protocol(site):
         return site.split('/')[2]
 
-    def get_new_content(self, url) -> Optional[List[str]]:
+    def get_new_content(self, url) -> Dict[str, str]:
         """ get all past iterations of a site by the fully qualified domain name """
         list_of_files = self.site_store.get_site_history(f"./cache/{self.remove_protocol(url)}/")
-        if not len(list_of_files) >= 2:
-            return None
-        prev_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-2]}")
-        current_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")
-        news = dict(set(prev_version.items()) ^ set(current_version.items()))
-        sites_contents = self.site_reader.get_sites_content_static(sum(news.items()))
+        if len(list_of_files) >= 2:
+            prev_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-2]}")
+            current_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")
+            news = DeepDiff(prev_version, current_version, ignore_order=True)
+        else:
+            news = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")
+        sites_contents = self.site_reader.get_sites_content_static(list(news.keys()))
         return sites_contents
 
-    def search_sites(self, url, content, keywords: List[str]):
+    @staticmethod
+    def search_sites(url, content, keywords: List[str]):
         results = []
         for keyword in keywords:
-            if keyword in content.values():
+            if keyword in content:
                 results.append((url, keyword))
         return results
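
Note that news has a different shape in each branch: DeepDiff returns a mapping keyed by change category, while the single-snapshot fallback keeps the raw url -> links dict, so list(news.keys()) only yields URLs in the latter case. A small sketch with hypothetical snapshots:

    from deepdiff import DeepDiff

    # Hypothetical snapshots in the url -> links shape that get_site_links returns.
    prev_version = {"https://example.org/": ["https://example.org/a"]}
    current_version = {"https://example.org/": ["https://example.org/a",
                                                "https://example.org/b"]}

    news = DeepDiff(prev_version, current_version, ignore_order=True)
    # Keys are change categories (e.g. 'iterable_item_added'), not page URLs.
    print(list(news.keys()))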

@@ -1,25 +0,0 @@
-{
-    "https://www.patricematz.de/": [
-        "https://www.patricematz.de/",
-        "https://www.linkedin.com/in/patrice-matz-b73b6814a/",
-        "https://github.com/Askill",
-        "https://www.patricematz.de/images/praktikum.pdf",
-        "https://www.patricematz.de/images/bachelor.pdf",
-        "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf",
-        "https://irs.projects.patricematz.de",
-        "https://github.com/Askill/Inverse-Rezeptsuche",
-        "https://irs.projects.patricematz.de/",
-        "https://github.com/Askill/Video-Synopsis",
-        "https://github.com/Askill/UI",
-        "https://github.com/Askill/Photo-Wall",
-        "https://www.patricematz.de/photowall/demo/",
-        "https://github.com/Askill/Flask-URL-Checker",
-        "https://patricematz.de/starmapper.htm"
-    ],
-    "https://www.patricematz.de/photowall/demo/": [
-        "javascript:void(0)"
-    ],
-    "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [],
-    "https://www.patricematz.de/images/bachelor.pdf": [],
-    "https://www.patricematz.de/images/praktikum.pdf": []
-}