Askill 2022-10-15 15:38:58 +02:00
parent 379381f0eb
commit 0eb5bde3be
11 changed files with 71 additions and 36 deletions

3
.gitignore vendored

@@ -1,2 +1,3 @@
 venv/**
-.idea/**
+.idea/**
+**__pycache__**

1
keywords.txt Normal file

@@ -0,0 +1 @@
Oktober

4
main.py Normal file

@@ -0,0 +1,4 @@
from src.Watcher import Watcher

if __name__ == "__main__":
    Watcher("../sites.txt", "../keywords.txt").watch()

1
sites.txt Normal file

@@ -0,0 +1 @@
https://www.patricematz.de

src/Crawler.py

@@ -4,11 +4,11 @@ from urllib.parse import urljoin
 from lxml import html
 import requests
 import logging
+from pathlib import Path


 class Crawler:
     url = ""  # the url of the website to be checked
-    links = dict()  # dic. with all sites and urls on those sites
+    _links = dict()  # dic. with all sites and urls on those sites
     header_values = {
         'Connection:': 'Keep-alive',
         'name': 'Michael Foord',
@@ -28,26 +28,30 @@ class Crawler:
         self.logger = logging.Logger(
             name="star_crawler", level=logging.INFO)

+    def get_nodes(self):
+        return self._links
+
     def persist(self, path):
-        with open(path, 'w') as fp:
-            json.dump(self.links, fp)
+        Path("/".join(path.split("/")[:-1])).mkdir(parents=True, exist_ok=True)
+        with open(path, 'w+') as fp:
+            json.dump(self._links, fp)

     def load_site(self, path):
         with open(path, 'r') as fp:
-            self.links = json.load(fp)
+            self._links = json.load(fp)

     def run(self, root, limit, sleep_time=0):
         self.url = root
         unchecked = [root]
-        while unchecked and len(self.links) < limit:
+        while unchecked and len(self._links) < limit:
             root = unchecked.pop()
-            if root in self.links or self.url.rsplit('/')[2] not in root:
+            if root in self._links or self.url.rsplit('/')[2] not in root:
                 continue
             if "https" not in root:
                 continue
-            clean = False
+            clean = True
             for element in self.exclude:
                 if element in root:
                     clean = False
@@ -57,30 +61,30 @@ class Crawler:
             if not clean:
                 continue

-            self.logger.info(f"{len(self.links)} {root}")
+            self.logger.info(f"{len(self._links)} {root}")

             try:
                 site = requests.get(root)
                 tree = html.fromstring(site.content)
-                links = tree.xpath('//a/@href')
+                _links = tree.xpath('//a/@href')
             except:
                 continue

-            nlinks = []
-            for link in links:
-                if link not in nlinks:
+            n_links = []
+            for link in _links:
+                if link not in n_links:
                     if link.startswith("http"):
-                        nlinks.append(link)
+                        n_links.append(link)
                     else:
-                        nlinks.append(urljoin(site.url, link))
+                        n_links.append(urljoin(site.url, link))

-            unchecked += nlinks
-            self.links[root] = nlinks
+            unchecked += n_links
+            self._links[root] = n_links
             sleep(sleep_time)

     def getNodesEdges(self):
         nodes = []
         edges = []
-        for key, value in self.links.items():
+        for key, value in self._links.items():
             nodes.append(key)
             for edge in value:
                 edges.append([key, edge])
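
For orientation, a short usage sketch (not part of the commit) showing how the reworked Crawler is driven, mirroring the calls Watcher.watch() makes further down; the cache path follows the layout this commit introduces:

from datetime import datetime

from src.Crawler import Crawler

crawler = Crawler()
# Crawl at most 10 pages reachable from the root, staying on its host.
crawler.run("https://www.patricematz.de", 10)

# The link graph is now read through get_nodes() instead of the old public
# `links` attribute; keys are crawled pages, values their outgoing links.
for page, outgoing in crawler.get_nodes().items():
    print(page, len(outgoing))

# persist() now creates the cache directory itself before writing.
crawler.persist(f"./cache/www.patricematz.de/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json")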

src/SiteStore.py

@@ -1,5 +1,6 @@
 import json
 import os
-from typing import List
+from typing import List, Optional


 class SiteStore:
@@ -7,9 +8,12 @@ class SiteStore:
         pass

     @staticmethod
-    def get_site_history(fqdn) -> List[str]:
-        cache_path = f"./cached/{fqdn}"
+    def get_site_history(cache_path) -> Optional[list[str]]:
         if not os.path.isdir(cache_path):
-            return [""]
+            return None
         return sorted(os.listdir(cache_path))
+
+    @staticmethod
+    def get_site_links(path):
+        with open(path, 'r') as fp:
+            return json.load(fp)
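
A quick sketch of the changed SiteStore contract: get_site_history() now takes the cache directory itself and returns None when nothing has been persisted yet, and the new get_site_links() loads a single persisted snapshot. The paths below are illustrative:

from src.SiteStore import SiteStore

cache_dir = "./cache/www.patricematz.de/"
history = SiteStore.get_site_history(cache_dir)

if history is None:
    print("no snapshots cached yet")
else:
    # Snapshot filenames are timestamped, so the sorted listing is
    # chronological and the last entry is the newest crawl.
    latest = SiteStore.get_site_links(cache_dir + history[-1])
    print(len(latest), "pages in the newest snapshot")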

src/Watcher.py

@@ -1,16 +1,18 @@
 import time
 from datetime import datetime
-from typing import List, Dict
+from typing import List, Dict, Optional

 from src.Crawler import Crawler
 from src.SiteReader import SiteReader
 from src.SiteStore import SiteStore


 class Watcher:
-    def __init__(self) -> None:
+    def __init__(self, sites_source_path, keywords_source_path) -> None:
         self.site_store = SiteStore()
         self.site_reader = SiteReader()
-        self.keywords_source_path = ""
-        self.sites_source_path = ""
+        self.keywords_source_path = keywords_source_path
+        self.sites_source_path = sites_source_path

     def read_txt_file(self, path):
         with open(path) as f:
@@ -21,18 +23,32 @@ class Watcher:
         keywords = self.read_txt_file(self.keywords_source_path)
         sites = self.read_txt_file(self.sites_source_path)
-        contents = [self.get_new_content(site) for site in sites]
-        keywords = [x for x in self.get_new_content(keyword) for keyword in keywords]
+        crawler = Crawler()
+        crawled_sites = []
+        for site in sites:
+            crawler.run(site, 10)
+            crawled_sites += crawler.get_nodes()
+            crawler.persist(f"./cache/{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json")
+        contents = {site: self.get_new_content(site) for site in crawled_sites}
+        contents = {url: c for url, c in contents.items() if c is not None}
         matches = []
         for url, content in contents.items():
             matches.append(self.search_sites(url, content, keywords))
         print(matches)
         time.sleep(3600)

-    def get_new_content(self, fqdm) -> List[str]:
+    @staticmethod
+    def remove_protocol(site):
+        return site.split('/')[2]
+
+    def get_new_content(self, url) -> Optional[List[str]]:
         """ get all past iterations of a site by the fully qualified domain name """
-        list_of_files = self.site_store.get_site_history(fqdm)
-        prev_version = list_of_files[-2]
-        current_version = list_of_files[-1]
+        list_of_files = self.site_store.get_site_history(f"./cache/{self.remove_protocol(url)}/")
+        if list_of_files is None or len(list_of_files) < 2:
+            return None
+        prev_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-2]}")
+        current_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")
         news = dict(set(prev_version.items()) ^ set(current_version.items()))
         sites_contents = self.site_reader.get_sites_content_static(sum(news.items()))
@@ -43,4 +59,4 @@ class Watcher:
         for keyword in keywords:
             if keyword in content.values():
                 results.append((url, keyword))
-        return results
+        return results
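
The heart of get_new_content() is the comparison of the two newest snapshots, news = dict(set(prev_version.items()) ^ set(current_version.items())). That symmetric difference only works while the dict values are hashable; the persisted values here are lists, so an equivalent comparison over plain dicts looks roughly like this (sample data, not from the repo):

prev_version = {"https://example.com/": ["https://example.com/a"]}
current_version = {
    "https://example.com/": ["https://example.com/a", "https://example.com/b"],
    "https://example.com/b": [],
}

# Pages that appeared, disappeared, or whose outgoing links changed
# between the two crawls -- the "news" the watcher scans for keywords.
news = {
    url: current_version.get(url, prev_version.get(url))
    for url in set(prev_version) | set(current_version)
    if prev_version.get(url) != current_version.get(url)
}
print(news)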

cache/www.patricematz.de/….json Normal file

@@ -0,0 +1 @@
{"https://www.patricematz.de": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"], "https://www.patricematz.de/photowall/demo/": ["javascript:void(0)"], "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [], "https://www.patricematz.de/images/bachelor.pdf": [], "https://www.patricematz.de/images/praktikum.pdf": [], "https://www.patricematz.de/": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"]}

cache/www.patricematz.de/….json Normal file

@@ -0,0 +1 @@
{"https://www.patricematz.de": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"], "https://www.patricematz.de/photowall/demo/": ["javascript:void(0)"], "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [], "https://www.patricematz.de/images/bachelor.pdf": [], "https://www.patricematz.de/images/praktikum.pdf": [], "https://www.patricematz.de/": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"]}

cache/www.patricematz.de/….json Normal file

@@ -0,0 +1 @@
{"https://www.patricematz.de": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"], "https://www.patricematz.de/photowall/demo/": ["javascript:void(0)"], "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [], "https://www.patricematz.de/images/bachelor.pdf": [], "https://www.patricematz.de/images/praktikum.pdf": [], "https://www.patricematz.de/": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"]}

cache/www.patricematz.de/….json Normal file

@@ -0,0 +1 @@
{"https://www.patricematz.de": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"], "https://www.patricematz.de/photowall/demo/": ["javascript:void(0)"], "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [], "https://www.patricematz.de/images/bachelor.pdf": [], "https://www.patricematz.de/images/praktikum.pdf": [], "https://www.patricematz.de/": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"]}