Askill 2022-10-15 15:38:58 +02:00
parent 379381f0eb
commit 0eb5bde3be
11 changed files with 71 additions and 36 deletions

3
.gitignore vendored

@@ -1,2 +1,3 @@
 venv/**
-.idea/**
+.idea/**
+**__pycache__**

1
keywords.txt Normal file

@@ -0,0 +1 @@
Oktober

4
main.py Normal file

@@ -0,0 +1,4 @@
from src.Watcher import Watcher

if __name__ == "__main__":
    Watcher("../sites.txt", "../keywords.txt").watch()

1
sites.txt Normal file

@@ -0,0 +1 @@
https://www.patricematz.de

src/Crawler.py

@@ -4,11 +4,11 @@ from urllib.parse import urljoin
 from lxml import html
 import requests
 import logging
+from pathlib import Path


 class Crawler:
     url = ""  # the url of the website to be checked
-    links = dict()  # dic. with all sites and urls on those sites
+    _links = dict()  # dic. with all sites and urls on those sites
     header_values = {
         'Connection:': 'Keep-alive',
         'name': 'Michael Foord',
@@ -28,26 +28,30 @@ class Crawler:
         self.logger = logging.Logger(
             name="star_crawler", level=logging.INFO)

+    def get_nodes(self):
+        return self._links
+
     def persist(self, path):
-        with open(path, 'w') as fp:
-            json.dump(self.links, fp)
+        Path("/".join(path.split("/")[:-1])).mkdir(parents=True, exist_ok=True)
+        with open(path, 'w+') as fp:
+            json.dump(self._links, fp)

     def load_site(self, path):
         with open(path, 'r') as fp:
-            self.links = json.load(fp)
+            self._links = json.load(fp)

     def run(self, root, limit, sleep_time=0):
         self.url = root
         unchecked = [root]
-        while unchecked and len(self.links) < limit:
+        while unchecked and len(self._links) < limit:
             root = unchecked.pop()
-            if root in self.links or self.url.rsplit('/')[2] not in root:
+            if root in self._links or self.url.rsplit('/')[2] not in root:
                 continue
             if "https" not in root:
                 continue
-            clean = False
+            clean = True
             for element in self.exclude:
                 if element in root:
                     clean = False
@@ -57,30 +61,30 @@ class Crawler:
             if not clean:
                 continue

-            self.logger.info(f"{len(self.links)} {root}")
+            self.logger.info(f"{len(self._links)} {root}")

             try:
                 site = requests.get(root)
                 tree = html.fromstring(site.content)
-                links = tree.xpath('//a/@href')
+                _links = tree.xpath('//a/@href')
             except:
                 continue

-            nlinks = []
-            for link in links:
-                if link not in nlinks:
+            n_links = []
+            for link in _links:
+                if link not in n_links:
                     if link.startswith("http"):
-                        nlinks.append(link)
+                        n_links.append(link)
                     else:
-                        nlinks.append(urljoin(site.url, link))
+                        n_links.append(urljoin(site.url, link))

-            unchecked += nlinks
-            self.links[root] = nlinks
+            unchecked += n_links
+            self._links[root] = n_links
             sleep(sleep_time)

     def getNodesEdges(self):
         nodes = []
         edges = []
-        for key, value in self.links.items():
+        for key, value in self._links.items():
             nodes.append(key)
             for edge in value:
                 edges.append([key, edge])
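
For orientation, a short usage sketch (not part of the commit) showing how the reworked Crawler is driven, mirroring the calls Watcher.watch() makes further down; the cache path follows the layout this commit introduces:

from datetime import datetime

from src.Crawler import Crawler

crawler = Crawler()
# Crawl at most 10 pages reachable from the root, staying on its host.
crawler.run("https://www.patricematz.de", 10)

# The link graph is now read through get_nodes() instead of the old public
# `links` attribute; keys are crawled pages, values their outgoing links.
for page, outgoing in crawler.get_nodes().items():
    print(page, len(outgoing))

# persist() now creates the cache directory itself before writing.
crawler.persist(f"./cache/www.patricematz.de/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json")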

src/SiteStore.py

@@ -1,5 +1,6 @@
 import json
 import os
-from typing import List
+from typing import List, Optional


 class SiteStore:
@@ -7,9 +8,12 @@ class SiteStore:
         pass

     @staticmethod
-    def get_site_history(fqdn) -> List[str]:
-        cache_path = f"./cached/{fqdn}"
+    def get_site_history(cache_path) -> Optional[list[str]]:
         if not os.path.isdir(cache_path):
-            return [""]
+            return None
         return sorted(os.listdir(cache_path))
+
+    @staticmethod
+    def get_site_links(path):
+        with open(path, 'r') as fp:
+            return json.load(fp)
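
A quick sketch of the changed SiteStore contract: get_site_history() now takes the cache directory itself and returns None when nothing has been persisted yet, and the new get_site_links() loads a single persisted snapshot. The paths below are illustrative:

from src.SiteStore import SiteStore

cache_dir = "./cache/www.patricematz.de/"
history = SiteStore.get_site_history(cache_dir)

if history is None:
    print("no snapshots cached yet")
else:
    # Snapshot filenames are timestamped, so the sorted listing is
    # chronological and the last entry is the newest crawl.
    latest = SiteStore.get_site_links(cache_dir + history[-1])
    print(len(latest), "pages in the newest snapshot")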

src/Watcher.py

@@ -1,16 +1,18 @@
 import time
 from datetime import datetime
-from typing import List, Dict
+from typing import List, Dict, Optional

 from src.Crawler import Crawler
 from src.SiteReader import SiteReader
 from src.SiteStore import SiteStore


 class Watcher:
-    def __init__(self) -> None:
+    def __init__(self, sites_source_path, keywords_source_path) -> None:
         self.site_store = SiteStore()
         self.site_reader = SiteReader()
-        self.keywords_source_path = ""
-        self.sites_source_path = ""
+        self.keywords_source_path = keywords_source_path
+        self.sites_source_path = sites_source_path

     def read_txt_file(self, path):
         with open(path) as f:
@@ -21,18 +23,32 @@ class Watcher:
         keywords = self.read_txt_file(self.keywords_source_path)
         sites = self.read_txt_file(self.sites_source_path)
-        contents = [self.get_new_content(site) for site in sites]
-        keywords = [x for x in self.get_new_content(keyword) for keyword in keywords]
+        crawler = Crawler()
+        crawled_sites = []
+        for site in sites:
+            crawler.run(site, 10)
+            crawled_sites += crawler.get_nodes()
+            crawler.persist(f"./cache/{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json")
+        contents = {site: self.get_new_content(site) for site in crawled_sites}
+        contents = {url: c for url, c in contents.items() if c is not None}
         matches = []
         for url, content in contents.items():
             matches.append(self.search_sites(url, content, keywords))
         print(matches)
         time.sleep(3600)

-    def get_new_content(self, fqdm) -> List[str]:
+    @staticmethod
+    def remove_protocol(site):
+        return site.split('/')[2]
+
+    def get_new_content(self, url) -> Optional[List[str]]:
         """ get all past iterations of a site by the fully qualified domain name """
-        list_of_files = self.site_store.get_site_history(fqdm)
-        prev_version = list_of_files[-2]
-        current_version = list_of_files[-1]
+        list_of_files = self.site_store.get_site_history(f"./cache/{self.remove_protocol(url)}/")
+        if list_of_files is None or len(list_of_files) < 2:
+            return None
+        prev_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-2]}")
+        current_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")
         news = dict(set(prev_version.items()) ^ set(current_version.items()))
         sites_contents = self.site_reader.get_sites_content_static(sum(news.items()))
@@ -43,4 +59,4 @@ class Watcher:
         for keyword in keywords:
             if keyword in content.values():
                 results.append((url, keyword))
-        return results
+        return results
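
The heart of get_new_content() is the comparison of the two newest snapshots, news = dict(set(prev_version.items()) ^ set(current_version.items())). That symmetric difference only works while the dict values are hashable; the persisted values here are lists, so an equivalent comparison over plain dicts looks roughly like this (sample data, not from the repo):

prev_version = {"https://example.com/": ["https://example.com/a"]}
current_version = {
    "https://example.com/": ["https://example.com/a", "https://example.com/b"],
    "https://example.com/b": [],
}

# Pages that appeared, disappeared, or whose outgoing links changed
# between the two crawls -- the "news" the watcher scans for keywords.
news = {
    url: current_version.get(url, prev_version.get(url))
    for url in set(prev_version) | set(current_version)
    if prev_version.get(url) != current_version.get(url)
}
print(news)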

cache/www.patricematz.de/….json Normal file

@@ -0,0 +1 @@
{"https://www.patricematz.de": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"], "https://www.patricematz.de/photowall/demo/": ["javascript:void(0)"], "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [], "https://www.patricematz.de/images/bachelor.pdf": [], "https://www.patricematz.de/images/praktikum.pdf": [], "https://www.patricematz.de/": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"]}

cache/www.patricematz.de/….json Normal file

@@ -0,0 +1 @@
{"https://www.patricematz.de": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"], "https://www.patricematz.de/photowall/demo/": ["javascript:void(0)"], "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [], "https://www.patricematz.de/images/bachelor.pdf": [], "https://www.patricematz.de/images/praktikum.pdf": [], "https://www.patricematz.de/": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"]}

cache/www.patricematz.de/….json Normal file

@@ -0,0 +1 @@
{"https://www.patricematz.de": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"], "https://www.patricematz.de/photowall/demo/": ["javascript:void(0)"], "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [], "https://www.patricematz.de/images/bachelor.pdf": [], "https://www.patricematz.de/images/praktikum.pdf": [], "https://www.patricematz.de/": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"]}

cache/www.patricematz.de/….json Normal file

@@ -0,0 +1 @@
{"https://www.patricematz.de": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"], "https://www.patricematz.de/photowall/demo/": ["javascript:void(0)"], "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [], "https://www.patricematz.de/images/bachelor.pdf": [], "https://www.patricematz.de/images/praktikum.pdf": [], "https://www.patricematz.de/": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"]}