From 7202c0fe3eacaf7ddd58fccc5182c70180043c9a Mon Sep 17 00:00:00 2001
From: Askill
Date: Sun, 16 Oct 2022 14:57:55 +0200
Subject: [PATCH] working, but duplicate results

---
 src/SiteReader.py            |  4 +--
 src/Watcher.py               | 30 +++++++++++--------
 .../2022-10-15_15-40-54.json | 25 ----------------
 3 files changed, 20 insertions(+), 39 deletions(-)
 delete mode 100644 src/cache/www.patricematz.de/2022-10-15_15-40-54.json

diff --git a/src/SiteReader.py b/src/SiteReader.py
index 6af81bb..24f03dc 100644
--- a/src/SiteReader.py
+++ b/src/SiteReader.py
@@ -51,10 +51,10 @@ class SiteReader:
         downloaded_url = trafilatura.fetch_url(url)
 
         try:
-            a = trafilatura.extract(downloaded_url, json_output=True, with_metadata=True, include_comments=False,
+            a = trafilatura.extract(downloaded_url, output_format="json", with_metadata=True, include_comments=False,
                                     date_extraction_params={'extensive_search': True, 'original_date': True})
         except AttributeError:
-            a = trafilatura.extract(downloaded_url, json_output=True, with_metadata=True,
+            a = trafilatura.extract(downloaded_url, output_format="json", with_metadata=True,
                                     date_extraction_params={'extensive_search': True, 'original_date': True})
         if a:
             json_output = json.loads(a)
diff --git a/src/Watcher.py b/src/Watcher.py
index 4b0f7fc..69690a9 100644
--- a/src/Watcher.py
+++ b/src/Watcher.py
@@ -1,6 +1,7 @@
 import time
 from datetime import datetime
 from typing import List, Dict, Optional
+from deepdiff import DeepDiff
 
 from src.Crawler import Crawler
 from src.SiteReader import SiteReader
@@ -31,10 +32,11 @@ class Watcher:
                 crawler.persist(f"./cache/{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json")
 
             contents = [self.get_new_content(site) for site in crawled_sites]
-            contents = [x for x in contents if x is not None]
+            contents = [x for x in contents if x is not None and x is not {}]
 
             matches = []
-            for url, content in contents.items():
-                matches.append(self.search_sites(url, content, keywords))
+            for content in contents:
+                for url, c in content.items():
+                    matches.append(self.search_sites(url, c, keywords))
             print(matches)
             time.sleep(sleep)
@@ -42,21 +44,25 @@ def remove_protocol(site):
         return site.split('/')[2]
 
-    def get_new_content(self, url) -> Optional[List[str]]:
+    def get_new_content(self, url) -> Dict[str, str]:
         """
         get all past iterations of a site by the fully qualified domain name
         """
         list_of_files = self.site_store.get_site_history(f"./cache/{self.remove_protocol(url)}/")
-        if not len(list_of_files) >= 2:
-            return None
-        prev_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-2]}")
-        current_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")
-        news = dict(set(prev_version.items()) ^ set(current_version.items()))
-        sites_contents = self.site_reader.get_sites_content_static(sum(news.items()))
+
+        if len(list_of_files) >= 2:
+            prev_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-2]}")
+            current_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")
+            news = DeepDiff(prev_version, current_version, ignore_order=True)
+        else:
+            news = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")
+
+        sites_contents = self.site_reader.get_sites_content_static(list(news.keys()))
         return sites_contents
 
-    def search_sites(self, url, content, keywords: List[str]):
+    @staticmethod
+    def search_sites(url, content, keywords: List[str]):
         results = []
         for keyword in keywords:
-            if keyword in content.values():
+            if keyword in content:
                 results.append((url, keyword))
         return results
diff --git a/src/cache/www.patricematz.de/2022-10-15_15-40-54.json b/src/cache/www.patricematz.de/2022-10-15_15-40-54.json
deleted file mode 100644
index 82c72e4..0000000
--- a/src/cache/www.patricematz.de/2022-10-15_15-40-54.json
+++ /dev/null
@@ -1,25 +0,0 @@
-{
-    "https://www.patricematz.de/": [
-        "https://www.patricematz.de/",
-        "https://www.linkedin.com/in/patrice-matz-b73b6814a/",
-        "https://github.com/Askill",
-        "https://www.patricematz.de/images/praktikum.pdf",
-        "https://www.patricematz.de/images/bachelor.pdf",
-        "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf",
-        "https://irs.projects.patricematz.de",
-        "https://github.com/Askill/Inverse-Rezeptsuche",
-        "https://irs.projects.patricematz.de/",
-        "https://github.com/Askill/Video-Synopsis",
-        "https://github.com/Askill/UI",
-        "https://github.com/Askill/Photo-Wall",
-        "https://www.patricematz.de/photowall/demo/",
-        "https://github.com/Askill/Flask-URL-Checker",
-        "https://patricematz.de/starmapper.htm"
-    ],
-    "https://www.patricematz.de/photowall/demo/": [
-        "javascript:void(0)"
-    ],
-    "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [],
-    "https://www.patricematz.de/images/bachelor.pdf": [],
-    "https://www.patricematz.de/images/praktikum.pdf": []
-}
\ No newline at end of file