working, but duplicate results

Askill 2022-10-16 14:57:55 +02:00
parent 1ec5f320e9
commit 7202c0fe3e
3 changed files with 20 additions and 39 deletions

--- a/src/SiteReader.py
+++ b/src/SiteReader.py
@@ -51,10 +51,10 @@ class SiteReader:
         downloaded_url = trafilatura.fetch_url(url)
         try:
-            a = trafilatura.extract(downloaded_url, json_output=True, with_metadata=True, include_comments=False,
+            a = trafilatura.extract(downloaded_url, output_format="json", with_metadata=True, include_comments=False,
                                     date_extraction_params={'extensive_search': True, 'original_date': True})
         except AttributeError:
-            a = trafilatura.extract(downloaded_url, json_output=True, with_metadata=True,
+            a = trafilatura.extract(downloaded_url, output_format="json", with_metadata=True,
                                     date_extraction_params={'extensive_search': True, 'original_date': True})
         if a:
             json_output = json.loads(a)
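
The only change here swaps trafilatura's older json_output=True flag for output_format="json"; the except AttributeError fallback simply retries without include_comments=False. A self-contained sketch of the updated call, with extract_page as a hypothetical wrapper around the committed kwargs:

    import json
    import trafilatura

    def extract_page(url):
        """Fetch a page and return trafilatura's JSON output as a dict (hypothetical helper)."""
        downloaded = trafilatura.fetch_url(url)  # HTML string, or None on fetch failure
        if downloaded is None:
            return None
        raw = trafilatura.extract(
            downloaded,
            output_format="json",                # replaces the older json_output=True
            with_metadata=True,
            include_comments=False,
            date_extraction_params={"extensive_search": True, "original_date": True},
        )
        return json.loads(raw) if raw else None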

--- a/src/Watcher.py
+++ b/src/Watcher.py
@@ -1,6 +1,7 @@
 import time
 from datetime import datetime
 from typing import List, Dict, Optional
+from deepdiff import DeepDiff
 from src.Crawler import Crawler
 from src.SiteReader import SiteReader
@@ -31,10 +32,11 @@ class Watcher:
                 crawler.persist(f"./cache/{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json")
             contents = [self.get_new_content(site) for site in crawled_sites]
-            contents = [x for x in contents if x is not None]
+            contents = [x for x in contents if x is not None and x is not {}]
             matches = []
-            for url, content in contents.items():
-                matches.append(self.search_sites(url, content, keywords))
+            for content in contents:
+                for url, c in content.items():
+                    matches.append(self.search_sites(url, c, keywords))
             print(matches)
             time.sleep(sleep)
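
get_new_content now returns a url -> text mapping per site, hence the nested loop over dict items. One caveat in the committed filter: `x is not {}` is an identity comparison against a brand-new literal and is therefore always true, so empty dicts still pass through. A minimal sketch (sample data hypothetical):

    # Shape implied by the new loop: one url -> text dict per crawled site.
    contents = [{"https://example.org/a": "page text"}, {}, None]

    # The committed filter: identity against a fresh {} literal never matches,
    # so the empty dict survives.
    kept = [x for x in contents if x is not None and x is not {}]
    print(kept)                         # [{'https://example.org/a': 'page text'}, {}]

    # A truthiness test drops both None and empty dicts.
    print([x for x in contents if x])   # [{'https://example.org/a': 'page text'}]
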
@@ -42,21 +44,25 @@ class Watcher:
     def remove_protocol(site):
         return site.split('/')[2]
 
-    def get_new_content(self, url) -> Optional[List[str]]:
+    def get_new_content(self, url) -> Dict[str, str]:
         """ get all past iterations of a site by the fully qualified domain name """
         list_of_files = self.site_store.get_site_history(f"./cache/{self.remove_protocol(url)}/")
-        if not len(list_of_files) >= 2:
-            return None
-        prev_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-2]}")
-        current_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")
-        news = dict(set(prev_version.items()) ^ set(current_version.items()))
-        sites_contents = self.site_reader.get_sites_content_static(sum(news.items()))
+        if len(list_of_files) >= 2:
+            prev_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-2]}")
+            current_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")
+            news = DeepDiff(prev_version, current_version, ignore_order=True)
+        else:
+            news = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")
+        sites_contents = self.site_reader.get_sites_content_static(list(news.keys()))
         return sites_contents
 
-    def search_sites(self, url, content, keywords: List[str]):
+    @staticmethod
+    def search_sites(url, content, keywords: List[str]):
         results = []
         for keyword in keywords:
-            if keyword in content.values():
+            if keyword in content:
                 results.append((url, keyword))
         return results
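
Note that news has a different shape in each branch: DeepDiff returns a mapping keyed by change category, while the single-snapshot fallback keeps the raw url -> links dict, so list(news.keys()) only yields URLs in the latter case. A small sketch with hypothetical snapshots:

    from deepdiff import DeepDiff

    # Hypothetical snapshots in the url -> links shape that get_site_links returns.
    prev_version = {"https://example.org/": ["https://example.org/a"]}
    current_version = {"https://example.org/": ["https://example.org/a",
                                                "https://example.org/b"]}

    news = DeepDiff(prev_version, current_version, ignore_order=True)
    # Keys are change categories (e.g. 'iterable_item_added'), not page URLs.
    print(list(news.keys()))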

@@ -1,25 +0,0 @@
-{
-    "https://www.patricematz.de/": [
-        "https://www.patricematz.de/",
-        "https://www.linkedin.com/in/patrice-matz-b73b6814a/",
-        "https://github.com/Askill",
-        "https://www.patricematz.de/images/praktikum.pdf",
-        "https://www.patricematz.de/images/bachelor.pdf",
-        "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf",
-        "https://irs.projects.patricematz.de",
-        "https://github.com/Askill/Inverse-Rezeptsuche",
-        "https://irs.projects.patricematz.de/",
-        "https://github.com/Askill/Video-Synopsis",
-        "https://github.com/Askill/UI",
-        "https://github.com/Askill/Photo-Wall",
-        "https://www.patricematz.de/photowall/demo/",
-        "https://github.com/Askill/Flask-URL-Checker",
-        "https://patricematz.de/starmapper.htm"
-    ],
-    "https://www.patricematz.de/photowall/demo/": [
-        "javascript:void(0)"
-    ],
-    "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [],
-    "https://www.patricematz.de/images/bachelor.pdf": [],
-    "https://www.patricematz.de/images/praktikum.pdf": []
-}