diff --git a/.gitignore b/.gitignore index 83a3a05..12fd4e3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ venv/** -.idea/** \ No newline at end of file +.idea/** +**__pycache__** \ No newline at end of file diff --git a/keywords.txt b/keywords.txt new file mode 100644 index 0000000..104fafc --- /dev/null +++ b/keywords.txt @@ -0,0 +1 @@ +Oktober \ No newline at end of file diff --git a/main.py b/main.py new file mode 100644 index 0000000..8c1f728 --- /dev/null +++ b/main.py @@ -0,0 +1,4 @@ +from src.Watcher import Watcher + +if __name__ == "__main__": + Watcher("../sites.txt", "../keywords.txt").watch() \ No newline at end of file diff --git a/sites.txt b/sites.txt new file mode 100644 index 0000000..bfa2eeb --- /dev/null +++ b/sites.txt @@ -0,0 +1 @@ +https://www.patricematz.de \ No newline at end of file diff --git a/src/Crawler.py b/src/Crawler.py index 67001bf..eab1800 100644 --- a/src/Crawler.py +++ b/src/Crawler.py @@ -4,11 +4,11 @@ from urllib.parse import urljoin from lxml import html import requests import logging - +from pathlib import Path class Crawler: url = "" # the url of the website to be checked - links = dict() # dic. with all sites and urls on those sites + _links = dict() # dic. 
with all sites and urls on those sites header_values = { 'Connection:': 'Keep-alive', 'name': 'Michael Foord', @@ -28,26 +28,30 @@ class Crawler: self.logger = logging.Logger( name="star_crawler", level=logging.INFO) + def get_nodes(self): + return self._links + def persist(self, path): - with open(path, 'w') as fp: - json.dump(self.links, fp) + Path("/".join(path.split("/")[:-1])).mkdir(parents=True, exist_ok=True) + with open(path, 'w+') as fp: + json.dump(self._links, fp) def load_site(self, path): with open(path, 'r') as fp: - self.links = json.load(fp) + self._links = json.load(fp) def run(self, root, limit, sleep_time=0): self.url = root unchecked = [root] - while unchecked and len(self.links) < limit: + while unchecked and len(self._links) < limit: root = unchecked.pop() - if root in self.links or self.url.rsplit('/')[2] not in root: + if root in self._links or self.url.rsplit('/')[2] not in root: continue if "https" not in root: continue - clean = False + clean = True for element in self.exclude: if element in root: clean = False @@ -57,30 +61,30 @@ class Crawler: if not clean: continue - self.logger.info(f"{len(self.links)} {root}") + self.logger.info(f"{len(self._links)} {root}") try: site = requests.get(root) tree = html.fromstring(site.content) - links = tree.xpath('//a/@href') + _links = tree.xpath('//a/@href') except: continue - nlinks = [] - for link in links: - if link not in nlinks: + n_links = [] + for link in _links: + if link not in n_links: if link.startswith("http"): - nlinks.append(link) + n_links.append(link) else: - nlinks.append(urljoin(site.url, link)) + n_links.append(urljoin(site.url, link)) - unchecked += nlinks - self.links[root] = nlinks + unchecked += n_links + self._links[root] = n_links sleep(sleep_time) def getNodesEdges(self): nodes = [] edges = [] - for key, value in self.links.items(): + for key, value in self._links.items(): nodes.append(key) for edge in value: edges.append([key, edge]) diff --git a/src/SiteStore.py 
b/src/SiteStore.py index f94aa84..322d6cf 100644 --- a/src/SiteStore.py +++ b/src/SiteStore.py @@ -1,5 +1,6 @@ +import json import os -from typing import List +from typing import List, Optional class SiteStore: @@ -7,9 +8,12 @@ class SiteStore: pass @staticmethod - def get_site_history(fqdn) -> List[str]: - cache_path = f"./cached/{fqdn}" + def get_site_history(cache_path) -> Optional[list[str]]: if not os.path.isdir(cache_path): - return [""] + return None return sorted(os.listdir(cache_path)) + @staticmethod + def get_site_links(path): + with open(path, 'r') as fp: + return json.load(fp) diff --git a/src/Watcher.py b/src/Watcher.py index afe7356..e4c7182 100644 --- a/src/Watcher.py +++ b/src/Watcher.py @@ -1,16 +1,18 @@ +import time +from datetime import datetime +from typing import List, Dict, Optional -from typing import List, Dict - +from src.Crawler import Crawler from src.SiteReader import SiteReader from src.SiteStore import SiteStore class Watcher: - def __init__(self) -> None: + def __init__(self, sites_source_path, keywords_source_path) -> None: self.site_store = SiteStore() self.site_reader = SiteReader() - self.keywords_source_path = "" - self.sites_source_path = "" + self.keywords_source_path = keywords_source_path + self.sites_source_path = sites_source_path def read_txt_file(self, path): with open(path) as f: @@ -21,18 +23,32 @@ class Watcher: keywords = self.read_txt_file(self.keywords_source_path) sites = self.read_txt_file(self.sites_source_path) - contents = [self.get_new_content(site) for site in sites] - keywords = [x for x in self.get_new_content(keyword) for keyword in keywords] + crawler = Crawler() + crawled_sites = [] + for site in sites: + crawler.run(site, 10) + crawled_sites += crawler.get_nodes() + crawler.persist(f"./cache/{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json") + + contents = [self.get_new_content(site) for site in crawled_sites] + contents = [x for x in contents if x is not None] matches = 
[] for url, content in contents.items(): matches.append(self.search_sites(url, content, keywords)) print(matches) - - def get_new_content(self, fqdm) -> List[str]: + time.sleep(3600) + + @staticmethod + def remove_protocol(site): + return site.split('/')[2] + + def get_new_content(self, url) -> Optional[List[str]]: """ get all past iterations of a site by the fully qualified domain name """ - list_of_files = self.site_store.get_site_history(fqdm) - prev_version = list_of_files[-2] - current_version = list_of_files[-1] + list_of_files = self.site_store.get_site_history(f"./cache/{self.remove_protocol(url)}/") + if not list_of_files or len(list_of_files) < 2: + return None + prev_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-2]}") + current_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}") news = dict(set(prev_version.items()) ^ set(current_version.items())) sites_contents = self.site_reader.get_sites_content_static(sum(news.items())) @@ -43,4 +59,4 @@ class Watcher: for keyword in keywords: if keyword in content.values(): results.append((url, keyword)) - return results \ No newline at end of file + return results diff --git a/src/cache/www.patricematz.de/2022-10-15_15-35-49.json b/src/cache/www.patricematz.de/2022-10-15_15-35-49.json new file mode 100644 index 0000000..bcdee74 --- /dev/null +++ b/src/cache/www.patricematz.de/2022-10-15_15-35-49.json @@ -0,0 +1 @@ +{"https://www.patricematz.de": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", 
"https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"], "https://www.patricematz.de/photowall/demo/": ["javascript:void(0)"], "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [], "https://www.patricematz.de/images/bachelor.pdf": [], "https://www.patricematz.de/images/praktikum.pdf": [], "https://www.patricematz.de/": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"]} \ No newline at end of file diff --git a/src/cache/www.patricematz.de/2022-10-15_15-36-32.json b/src/cache/www.patricematz.de/2022-10-15_15-36-32.json new file mode 100644 index 0000000..bcdee74 --- /dev/null +++ b/src/cache/www.patricematz.de/2022-10-15_15-36-32.json @@ -0,0 +1 @@ +{"https://www.patricematz.de": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", 
"https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"], "https://www.patricematz.de/photowall/demo/": ["javascript:void(0)"], "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [], "https://www.patricematz.de/images/bachelor.pdf": [], "https://www.patricematz.de/images/praktikum.pdf": [], "https://www.patricematz.de/": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"]} \ No newline at end of file diff --git a/src/cache/www.patricematz.de/2022-10-15_15-36-40.json b/src/cache/www.patricematz.de/2022-10-15_15-36-40.json new file mode 100644 index 0000000..bcdee74 --- /dev/null +++ b/src/cache/www.patricematz.de/2022-10-15_15-36-40.json @@ -0,0 +1 @@ +{"https://www.patricematz.de": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"], 
"https://www.patricematz.de/photowall/demo/": ["javascript:void(0)"], "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [], "https://www.patricematz.de/images/bachelor.pdf": [], "https://www.patricematz.de/images/praktikum.pdf": [], "https://www.patricematz.de/": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"]} \ No newline at end of file diff --git a/src/cache/www.patricematz.de/2022-10-15_15-36-59.json b/src/cache/www.patricematz.de/2022-10-15_15-36-59.json new file mode 100644 index 0000000..bcdee74 --- /dev/null +++ b/src/cache/www.patricematz.de/2022-10-15_15-36-59.json @@ -0,0 +1 @@ +{"https://www.patricematz.de": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"], "https://www.patricematz.de/photowall/demo/": ["javascript:void(0)"], 
"https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [], "https://www.patricematz.de/images/bachelor.pdf": [], "https://www.patricematz.de/images/praktikum.pdf": [], "https://www.patricematz.de/": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"]} \ No newline at end of file