mirror of https://github.com/Askill/optar.git
working, but duplicate results
commit 7202c0fe3e
parent 1ec5f320e9
src/SiteReader.py
@@ -51,10 +51,10 @@ class SiteReader:
         downloaded_url = trafilatura.fetch_url(url)
         try:
-            a = trafilatura.extract(downloaded_url, json_output=True, with_metadata=True, include_comments=False,
+            a = trafilatura.extract(downloaded_url, output_format="json", with_metadata=True, include_comments=False,
                                     date_extraction_params={'extensive_search': True, 'original_date': True})
         except AttributeError:
-            a = trafilatura.extract(downloaded_url, json_output=True, with_metadata=True,
+            a = trafilatura.extract(downloaded_url, output_format="json", with_metadata=True,
                                     date_extraction_params={'extensive_search': True, 'original_date': True})
         if a:
             json_output = json.loads(a)
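Note: newer trafilatura releases select JSON output via the output_format parameter instead of the boolean json_output flag removed above. A minimal sketch of the updated call, assuming trafilatura >= 1.0 (the URL is only an example):

import json
import trafilatura

# fetch_url returns the raw HTML of the page (or None on failure)
downloaded = trafilatura.fetch_url("https://example.com/")
# output_format="json" makes extract() return a JSON string with metadata
raw = trafilatura.extract(downloaded, output_format="json",
                          with_metadata=True, include_comments=False)
if raw:
    data = json.loads(raw)  # keys include title, author, date, text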
src/Watcher.py
@@ -1,6 +1,7 @@
 import time
 from datetime import datetime
 from typing import List, Dict, Optional
+from deepdiff import DeepDiff

 from src.Crawler import Crawler
 from src.SiteReader import SiteReader
@@ -31,10 +32,11 @@ class Watcher:
                 crawler.persist(f"./cache/{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json")

             contents = [self.get_new_content(site) for site in crawled_sites]
-            contents = [x for x in contents if x is not None]
+            contents = [x for x in contents if x is not None and x is not {}]
             matches = []
-            for url, content in contents.items():
-                matches.append(self.search_sites(url, content, keywords))
+            for content in contents:
+                for url, c in content.items():
+                    matches.append(self.search_sites(url, c, keywords))
             print(matches)
             time.sleep(sleep)
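Note on the new filter: x is not {} is an identity test against a freshly created dict literal, so it is always true and empty dicts still slip through; this may be one source of the duplicate results the commit message mentions. A sketch of a filter that actually drops them (same names as in the diff, using truthiness instead of identity):

# `is` compares object identity; a literal {} is a brand-new object,
# so `x is not {}` never filters anything. Truthiness drops both
# None and empty dicts:
contents = [x for x in contents if x]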
@@ -42,21 +44,25 @@ class Watcher:
     def remove_protocol(site):
         return site.split('/')[2]

-    def get_new_content(self, url) -> Optional[List[str]]:
+    def get_new_content(self, url) -> Dict[str, str]:
         """ get all past iterations of a site by the fully qualified domain name """
         list_of_files = self.site_store.get_site_history(f"./cache/{self.remove_protocol(url)}/")
-        if not len(list_of_files) >= 2:
-            return None
-        prev_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-2]}")
-        current_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")
-        news = dict(set(prev_version.items()) ^ set(current_version.items()))
-        sites_contents = self.site_reader.get_sites_content_static(sum(news.items()))
+        if len(list_of_files) >= 2:
+            prev_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-2]}")
+            current_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")
+            news = DeepDiff(prev_version, current_version, ignore_order=True)
+        else:
+            news = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")
+
+        sites_contents = self.site_reader.get_sites_content_static(list(news.keys()))
         return sites_contents

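For reference: DeepDiff returns a change report keyed by change type, not a plain dict of links, so in the two-snapshot branch list(news.keys()) yields entries like 'dictionary_item_added' rather than URLs. A minimal sketch of its shape, with illustrative data (deepdiff >= 5 assumed):

from deepdiff import DeepDiff

prev = {"https://a.example/": ["link1"], "https://b.example/": ["link2"]}
curr = {"https://a.example/": ["link1"], "https://c.example/": ["link3"]}

diff = DeepDiff(prev, curr, ignore_order=True)
# diff behaves like a dict keyed by change category, e.g.
# ['dictionary_item_added', 'dictionary_item_removed']
print(list(diff.keys()))
# the affected keys sit nested inside, as path strings such as
# "root['https://c.example/']"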
-    def search_sites(self, url, content, keywords: List[str]):
+    @staticmethod
+    def search_sites(url, content, keywords: List[str]):
         results = []
         for keyword in keywords:
-            if keyword in content.values():
+            if keyword in content:
                 results.append((url, keyword))
         return results
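Worth noting: with content now a single extracted text rather than a dict, keyword in content is a plain substring test, so "cat" also matches "category". If whole-word matching were wanted, a small regex-based variant could look like this (illustrative only, not part of the commit):

import re

def contains_word(keyword: str, content: str) -> bool:
    # \b anchors the keyword at word boundaries to avoid substring hits
    return re.search(rf"\b{re.escape(keyword)}\b", content) is not None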
(deleted: a cached link snapshot for www.patricematz.de; the cache filename is not shown in this view)
@@ -1,25 +0,0 @@
-{
-    "https://www.patricematz.de/": [
-        "https://www.patricematz.de/",
-        "https://www.linkedin.com/in/patrice-matz-b73b6814a/",
-        "https://github.com/Askill",
-        "https://www.patricematz.de/images/praktikum.pdf",
-        "https://www.patricematz.de/images/bachelor.pdf",
-        "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf",
-        "https://irs.projects.patricematz.de",
-        "https://github.com/Askill/Inverse-Rezeptsuche",
-        "https://irs.projects.patricematz.de/",
-        "https://github.com/Askill/Video-Synopsis",
-        "https://github.com/Askill/UI",
-        "https://github.com/Askill/Photo-Wall",
-        "https://www.patricematz.de/photowall/demo/",
-        "https://github.com/Askill/Flask-URL-Checker",
-        "https://patricematz.de/starmapper.htm"
-    ],
-    "https://www.patricematz.de/photowall/demo/": [
-        "javascript:void(0)"
-    ],
-    "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [],
-    "https://www.patricematz.de/images/bachelor.pdf": [],
-    "https://www.patricematz.de/images/praktikum.pdf": []
-}