mirror of https://github.com/Askill/optar.git
wip
This commit is contained in:
parent
379381f0eb
commit
0eb5bde3be
|
|
@ -1,2 +1,3 @@
|
||||||
venv/**
|
venv/**
|
||||||
.idea/**
|
.idea/**
|
||||||
|
**__pycache__**
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
Oktober
|
||||||
|
|
@ -0,0 +1,4 @@
|
||||||
|
from src.Watcher import Watcher


def main() -> None:
    """Entry point: start watching the configured site and keyword lists."""
    # Paths are relative to the working directory the script is launched from.
    Watcher("../sites.txt", "../keywords.txt").watch()


if __name__ == "__main__":
    main()
|
||||||
|
|
@ -4,11 +4,11 @@ from urllib.parse import urljoin
|
||||||
from lxml import html
|
from lxml import html
|
||||||
import requests
|
import requests
|
||||||
import logging
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
class Crawler:
|
class Crawler:
|
||||||
url = "" # the url of the website to be checked
|
url = "" # the url of the website to be checked
|
||||||
links = dict() # dic. with all sites and urls on those sites
|
_links = dict() # dic. with all sites and urls on those sites
|
||||||
header_values = {
|
header_values = {
|
||||||
'Connection:': 'Keep-alive',
|
'Connection:': 'Keep-alive',
|
||||||
'name': 'Michael Foord',
|
'name': 'Michael Foord',
|
||||||
|
|
@ -28,26 +28,30 @@ class Crawler:
|
||||||
self.logger = logging.Logger(
|
self.logger = logging.Logger(
|
||||||
name="star_crawler", level=logging.INFO)
|
name="star_crawler", level=logging.INFO)
|
||||||
|
|
||||||
|
def get_nodes(self):
    """Return the crawled link graph: a dict mapping each visited URL to its outgoing links."""
    return self._links
||||||
|
|
||||||
def persist(self, path):
    """Write the crawled link graph to *path* as JSON.

    Missing parent directories are created first.

    Args:
        path: Destination file path (forward-slash separated).
    """
    # pathlib replaces the hand-rolled "/".join(path.split("/")[:-1]) parent computation
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    # plain 'w' suffices: the file is only written, never read back here ('w+' was unnecessary)
    with open(path, 'w') as fp:
        json.dump(self._links, fp)
|
||||||
|
|
||||||
def load_site(self, path):
    """Load a previously persisted link graph from the JSON file at *path*."""
    with open(path, 'r') as fp:
        raw = fp.read()
    self._links = json.loads(raw)
|
||||||
|
|
||||||
def run(self, root, limit, sleep_time=0):
    """Crawl outward from *root*, recording each page's outgoing links in self._links.

    Stops once *limit* pages have been stored or no unvisited candidates remain.
    Only https URLs on the start domain are followed.

    Args:
        root: Start URL; also stored as self.url to derive the allowed domain.
        limit: Maximum number of pages to store.
        sleep_time: Seconds to wait between page fetches (politeness delay).
    """
    self.url = root
    unchecked = [root]
    while unchecked and len(self._links) < limit:
        root = unchecked.pop()
        # skip already-visited pages and pages whose URL doesn't contain the start host
        if root in self._links or self.url.rsplit('/')[2] not in root:
            continue
        if "https" not in root:
            continue

        # drop URLs containing any excluded fragment
        # (self.exclude presumably holds substrings — defined outside this view)
        clean = True
        for element in self.exclude:
            if element in root:
                clean = False
                break  # one match is enough; no need to scan the rest

        if not clean:
            continue

        self.logger.info(f"{len(self._links)} {root}")
        try:
            site = requests.get(root)
            tree = html.fromstring(site.content)
            hrefs = tree.xpath('//a/@href')
        except Exception:
            # network or parse failure: skip this page.
            # Was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit.
            continue

        n_links = []
        for link in hrefs:
            if link not in n_links:
                if link.startswith("http"):
                    n_links.append(link)
                else:
                    # resolve relative links against the fetched page's URL
                    n_links.append(urljoin(site.url, link))

        unchecked += n_links
        self._links[root] = n_links
        sleep(sleep_time)
|
||||||
|
|
||||||
def getNodesEdges(self):
|
def getNodesEdges(self):
|
||||||
nodes = []
|
nodes = []
|
||||||
edges = []
|
edges = []
|
||||||
for key, value in self.links.items():
|
for key, value in self._links.items():
|
||||||
nodes.append(key)
|
nodes.append(key)
|
||||||
for edge in value:
|
for edge in value:
|
||||||
edges.append([key, edge])
|
edges.append([key, edge])
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
|
import json
|
||||||
import os
|
import os
|
||||||
from typing import List
|
from typing import List, Optional
|
||||||
|
|
||||||
|
|
||||||
class SiteStore:
|
class SiteStore:
|
||||||
|
|
@ -7,9 +8,12 @@ class SiteStore:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_site_history(fqdn) -> List[str]:
|
def get_site_history(cache_path) -> Optional[list[str]]:
|
||||||
cache_path = f"./cached/{fqdn}"
|
|
||||||
if not os.path.isdir(cache_path):
|
if not os.path.isdir(cache_path):
|
||||||
return [""]
|
return None
|
||||||
return sorted(os.listdir(cache_path))
|
return sorted(os.listdir(cache_path))
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_site_links(path):
|
||||||
|
with open(path, 'r') as fp:
|
||||||
|
return json.load(fp)
|
||||||
|
|
|
||||||
|
|
@ -1,16 +1,18 @@
|
||||||
|
import time
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import List, Dict, Optional
|
||||||
|
|
||||||
from typing import List, Dict
|
from src.Crawler import Crawler
|
||||||
|
|
||||||
from src.SiteReader import SiteReader
|
from src.SiteReader import SiteReader
|
||||||
from src.SiteStore import SiteStore
|
from src.SiteStore import SiteStore
|
||||||
|
|
||||||
|
|
||||||
class Watcher:
|
class Watcher:
|
||||||
def __init__(self, sites_source_path, keywords_source_path) -> None:
    """Set up the watcher.

    Args:
        sites_source_path: Path to the text file listing sites to watch.
        keywords_source_path: Path to the text file listing keywords to match.
    """
    self.sites_source_path = sites_source_path
    self.keywords_source_path = keywords_source_path
    self.site_store = SiteStore()
    self.site_reader = SiteReader()
|
||||||
|
|
||||||
def read_txt_file(self, path):
|
def read_txt_file(self, path):
|
||||||
with open(path) as f:
|
with open(path) as f:
|
||||||
|
|
@ -21,18 +23,32 @@ class Watcher:
|
||||||
keywords = self.read_txt_file(self.keywords_source_path)
|
keywords = self.read_txt_file(self.keywords_source_path)
|
||||||
sites = self.read_txt_file(self.sites_source_path)
|
sites = self.read_txt_file(self.sites_source_path)
|
||||||
|
|
||||||
contents = [self.get_new_content(site) for site in sites]
|
crawler = Crawler()
|
||||||
keywords = [x for x in self.get_new_content(keyword) for keyword in keywords]
|
crawled_sites = []
|
||||||
|
for site in sites:
|
||||||
|
crawler.run(site, 10)
|
||||||
|
crawled_sites += crawler.get_nodes()
|
||||||
|
crawler.persist(f"./cache/{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json")
|
||||||
|
|
||||||
|
contents = [self.get_new_content(site) for site in crawled_sites]
|
||||||
|
contents = [x for x in contents if x is not None]
|
||||||
matches = []
|
matches = []
|
||||||
for url, content in contents.items():
|
for url, content in contents.items():
|
||||||
matches.append(self.search_sites(url, content, keywords))
|
matches.append(self.search_sites(url, content, keywords))
|
||||||
print(matches)
|
print(matches)
|
||||||
|
time.sleep(3600)
|
||||||
|
|
||||||
def get_new_content(self, fqdm) -> List[str]:
|
@staticmethod
|
||||||
|
def remove_protocol(site):
|
||||||
|
return site.split('/')[2]
|
||||||
|
|
||||||
|
def get_new_content(self, url) -> Optional[List[str]]:
|
||||||
""" get all past iterations of a site by the fully qualified domain name """
|
""" get all past iterations of a site by the fully qualified domain name """
|
||||||
list_of_files = self.site_store.get_site_history(fqdm)
|
list_of_files = self.site_store.get_site_history(f"./cache/{self.remove_protocol(url)}/")
|
||||||
prev_version = list_of_files[-2]
|
if not len(list_of_files) >= 2:
|
||||||
current_version = list_of_files[-1]
|
return None
|
||||||
|
prev_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-2]}")
|
||||||
|
current_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-2]}")
|
||||||
news = dict(set(prev_version.items()) ^ set(current_version.items()))
|
news = dict(set(prev_version.items()) ^ set(current_version.items()))
|
||||||
sites_contents = self.site_reader.get_sites_content_static(sum(news.items()))
|
sites_contents = self.site_reader.get_sites_content_static(sum(news.items()))
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
{"https://www.patricematz.de": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"], "https://www.patricematz.de/photowall/demo/": ["javascript:void(0)"], "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [], "https://www.patricematz.de/images/bachelor.pdf": [], "https://www.patricematz.de/images/praktikum.pdf": [], "https://www.patricematz.de/": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"]}
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
{"https://www.patricematz.de": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"], "https://www.patricematz.de/photowall/demo/": ["javascript:void(0)"], "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [], "https://www.patricematz.de/images/bachelor.pdf": [], "https://www.patricematz.de/images/praktikum.pdf": [], "https://www.patricematz.de/": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"]}
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
{"https://www.patricematz.de": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"], "https://www.patricematz.de/photowall/demo/": ["javascript:void(0)"], "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [], "https://www.patricematz.de/images/bachelor.pdf": [], "https://www.patricematz.de/images/praktikum.pdf": [], "https://www.patricematz.de/": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"]}
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
{"https://www.patricematz.de": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"], "https://www.patricematz.de/photowall/demo/": ["javascript:void(0)"], "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [], "https://www.patricematz.de/images/bachelor.pdf": [], "https://www.patricematz.de/images/praktikum.pdf": [], "https://www.patricematz.de/": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"]}
|
||||||
Loading…
Reference in New Issue