Askill 2022-10-15 15:38:58 +02:00
parent 379381f0eb
commit 0eb5bde3be
11 changed files with 71 additions and 36 deletions

.gitignore vendored

@@ -1,2 +1,3 @@
 venv/**
 .idea/**
+**__pycache__**

keywords.txt Normal file

@@ -0,0 +1 @@
+Oktober

main.py Normal file

@@ -0,0 +1,4 @@
+from src.Watcher import Watcher
+
+if __name__ == "__main__":
+    Watcher("../sites.txt", "../keywords.txt").watch()

sites.txt Normal file

@@ -0,0 +1 @@
+https://www.patricematz.de

src/Crawler.py

@@ -4,11 +4,11 @@ from urllib.parse import urljoin
 from lxml import html
 import requests
 import logging
+from pathlib import Path


 class Crawler:
     url = ""  # the url of the website to be checked
-    links = dict()  # dict with all sites and urls on those sites
+    _links = dict()  # dict with all sites and urls on those sites
     header_values = {
         'Connection:': 'Keep-alive',
         'name': 'Michael Foord',
@@ -28,26 +28,30 @@ class Crawler:
         self.logger = logging.Logger(
             name="star_crawler", level=logging.INFO)

+    def get_nodes(self):
+        return self._links
+
     def persist(self, path):
-        with open(path, 'w') as fp:
-            json.dump(self.links, fp)
+        Path("/".join(path.split("/")[:-1])).mkdir(parents=True, exist_ok=True)
+        with open(path, 'w+') as fp:
+            json.dump(self._links, fp)

     def load_site(self, path):
         with open(path, 'r') as fp:
-            self.links = json.load(fp)
+            self._links = json.load(fp)

     def run(self, root, limit, sleep_time=0):
         self.url = root
         unchecked = [root]
-        while unchecked and len(self.links) < limit:
+        while unchecked and len(self._links) < limit:
             root = unchecked.pop()
-            if root in self.links or self.url.rsplit('/')[2] not in root:
+            if root in self._links or self.url.rsplit('/')[2] not in root:
                 continue
             if "https" not in root:
                 continue
-            clean = False
+            clean = True
             for element in self.exclude:
                 if element in root:
                     clean = False
@@ -57,30 +61,30 @@ class Crawler:
             if not clean:
                 continue
-            self.logger.info(f"{len(self.links)} {root}")
+            self.logger.info(f"{len(self._links)} {root}")
             try:
                 site = requests.get(root)
                 tree = html.fromstring(site.content)
-                links = tree.xpath('//a/@href')
+                _links = tree.xpath('//a/@href')
             except:
                 continue
-            nlinks = []
-            for link in links:
-                if link not in nlinks:
+            n_links = []
+            for link in _links:
+                if link not in n_links:
                     if link.startswith("http"):
-                        nlinks.append(link)
+                        n_links.append(link)
                     else:
-                        nlinks.append(urljoin(site.url, link))
-            unchecked += nlinks
-            self.links[root] = nlinks
+                        n_links.append(urljoin(site.url, link))
+            unchecked += n_links
+            self._links[root] = n_links
             sleep(sleep_time)

     def getNodesEdges(self):
         nodes = []
         edges = []
-        for key, value in self.links.items():
+        for key, value in self._links.items():
             nodes.append(key)
             for edge in value:
                 edges.append([key, edge])
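Note: a minimal sketch of how the reworked Crawler is driven after this commit. The URL mirrors sites.txt; the snapshot file name is a placeholder. run crawls at most limit same-domain pages, get_nodes exposes the renamed _links map, and persist now creates the cache directory before writing:

    # Hypothetical driver for the reworked Crawler (snapshot path is a placeholder).
    from src.Crawler import Crawler

    crawler = Crawler()
    crawler.run("https://www.patricematz.de", limit=10)  # visit at most 10 same-domain pages
    nodes = crawler.get_nodes()                          # {page_url: [outgoing links]}
    # persist() now makes ./cache/www.patricematz.de/ itself via Path(...).mkdir(parents=True, exist_ok=True)
    crawler.persist("./cache/www.patricematz.de/snapshot.json")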

src/SiteStore.py

@@ -1,5 +1,6 @@
+import json
 import os
-from typing import List
+from typing import List, Optional


 class SiteStore:
@@ -7,9 +8,12 @@ class SiteStore:
         pass

     @staticmethod
-    def get_site_history(fqdn) -> List[str]:
-        cache_path = f"./cached/{fqdn}"
+    def get_site_history(cache_path) -> Optional[list[str]]:
         if not os.path.isdir(cache_path):
-            return [""]
+            return None
         return sorted(os.listdir(cache_path))
+
+    @staticmethod
+    def get_site_links(path):
+        with open(path, 'r') as fp:
+            return json.load(fp)
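Note: a short sketch of the changed SiteStore contract (the cache directory below is a placeholder): get_site_history now takes the cache path directly and returns None instead of [""] when nothing has been cached, and the new get_site_links loads one persisted snapshot. Because snapshots are named %Y-%m-%d_%H-%M-%S.json, sorted() file order is also chronological order:

    # Hypothetical caller; the cache directory is a placeholder.
    from src.SiteStore import SiteStore

    cache_dir = "./cache/www.patricematz.de/"
    history = SiteStore.get_site_history(cache_dir)   # sorted snapshot names, or None
    if history and len(history) >= 2:
        prev = SiteStore.get_site_links(cache_dir + history[-2])  # second-newest snapshot
        curr = SiteStore.get_site_links(cache_dir + history[-1])  # newest snapshot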

src/Watcher.py

@@ -1,16 +1,18 @@
+import time
+from datetime import datetime
-from typing import List, Dict
+from typing import List, Dict, Optional
+from src.Crawler import Crawler
 from src.SiteReader import SiteReader
 from src.SiteStore import SiteStore


 class Watcher:
-    def __init__(self) -> None:
+    def __init__(self, sites_source_path, keywords_source_path) -> None:
         self.site_store = SiteStore()
         self.site_reader = SiteReader()
-        self.keywords_source_path = ""
-        self.sites_source_path = ""
+        self.keywords_source_path = keywords_source_path
+        self.sites_source_path = sites_source_path

     def read_txt_file(self, path):
         with open(path) as f:
@@ -21,18 +23,32 @@ class Watcher:
         keywords = self.read_txt_file(self.keywords_source_path)
         sites = self.read_txt_file(self.sites_source_path)

-        contents = [self.get_new_content(site) for site in sites]
-        keywords = [x for x in self.get_new_content(keyword) for keyword in keywords]
+        crawler = Crawler()
+        crawled_sites = []
+        for site in sites:
+            crawler.run(site, 10)
+            crawled_sites += crawler.get_nodes()
+            crawler.persist(f"./cache/{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json")
+
+        # built as a dict so contents.items() below has URLs as keys
+        contents = {site: self.get_new_content(site) for site in crawled_sites}
+        contents = {url: c for url, c in contents.items() if c is not None}

         matches = []
         for url, content in contents.items():
             matches.append(self.search_sites(url, content, keywords))
         print(matches)
+        time.sleep(3600)

+    @staticmethod
+    def remove_protocol(site):
+        return site.split('/')[2]
+
-    def get_new_content(self, fqdm) -> List[str]:
+    def get_new_content(self, url) -> Optional[List[str]]:
         """ get all past iterations of a site by the fully qualified domain name """
-        list_of_files = self.site_store.get_site_history(fqdm)
-        prev_version = list_of_files[-2]
-        current_version = list_of_files[-1]
+        list_of_files = self.site_store.get_site_history(f"./cache/{self.remove_protocol(url)}/")
+        if not list_of_files or len(list_of_files) < 2:  # no history yet, or fewer than two snapshots
+            return None
+        prev_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-2]}")
+        current_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")
         news = dict(set(prev_version.items()) ^ set(current_version.items()))
         sites_contents = self.site_reader.get_sites_content_static(sum(news.items()))
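Note: get_new_content compares the two newest snapshots with a set symmetric difference over dict items, keeping only entries that changed between versions. One caveat worth flagging: set(d.items()) requires hashable values, so the technique only works once the link lists are cast to tuples. A minimal, self-contained illustration under that assumption (the example domains are hypothetical):

    # Change detection via symmetric difference; link lists cast to tuples so they hash.
    prev = {"https://a.example": ("x", "y"), "https://b.example": ("z",)}
    curr = {"https://a.example": ("x", "y"), "https://b.example": ("z", "w")}

    changed = dict(set(prev.items()) ^ set(curr.items()))
    print(list(changed))  # ['https://b.example'] -- only the URL whose links changed survives
    # Which version's value wins for a changed key depends on set iteration order,
    # so only the keys of `changed` are reliable.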

cache/www.patricematz.de/….json Normal file

@@ -0,0 +1 @@
+{"https://www.patricematz.de": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"], "https://www.patricematz.de/photowall/demo/": ["javascript:void(0)"], "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [], "https://www.patricematz.de/images/bachelor.pdf": [], "https://www.patricematz.de/images/praktikum.pdf": [], "https://www.patricematz.de/": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"]}

cache/www.patricematz.de/….json Normal file

@@ -0,0 +1 @@
+{"https://www.patricematz.de": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"], "https://www.patricematz.de/photowall/demo/": ["javascript:void(0)"], "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [], "https://www.patricematz.de/images/bachelor.pdf": [], "https://www.patricematz.de/images/praktikum.pdf": [], "https://www.patricematz.de/": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"]}

cache/www.patricematz.de/….json Normal file

@@ -0,0 +1 @@
+{"https://www.patricematz.de": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"], "https://www.patricematz.de/photowall/demo/": ["javascript:void(0)"], "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [], "https://www.patricematz.de/images/bachelor.pdf": [], "https://www.patricematz.de/images/praktikum.pdf": [], "https://www.patricematz.de/": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"]}

cache/www.patricematz.de/….json Normal file

@@ -0,0 +1 @@
+{"https://www.patricematz.de": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"], "https://www.patricematz.de/photowall/demo/": ["javascript:void(0)"], "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [], "https://www.patricematz.de/images/bachelor.pdf": [], "https://www.patricematz.de/images/praktikum.pdf": [], "https://www.patricematz.de/": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"]}