mirror of https://github.com/Askill/optar.git
import json
import logging
from time import sleep
from urllib.parse import urljoin

import requests
from lxml import html


class Crawler:
    url = ""  # root URL of the website being crawled

    # default headers sent with every request
    header_values = {
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/4.0'}

    def __init__(self, logger=None, exclude=None):
        # per-instance state so separate crawlers don't share results:
        # maps each crawled page URL to the list of URLs found on it
        self.links = dict()
        # substrings that disqualify a URL from being crawled
        self.exclude = list(exclude) if exclude else []
        if logger:
            self.logger = logger
        else:
            # build the logger through the logging module so it gets a
            # handler and INFO messages are actually emitted
            self.logger = logging.getLogger("star_crawler")
            self.logger.setLevel(logging.INFO)
            if not self.logger.handlers:
                self.logger.addHandler(logging.StreamHandler())

    def persist(self, path):
        # save the crawl results as JSON
        with open(path, 'w') as fp:
            json.dump(self.links, fp)

    def load_site(self, path):
        # restore crawl results from a previous persist() call
        with open(path, 'r') as fp:
            self.links = json.load(fp)

    def run(self, root, limit, sleep_time=0):
        self.url = root
        unchecked = [root]

        while unchecked and len(self.links) < limit:
            root = unchecked.pop()
            # skip pages already crawled and pages on another host
            # (self.url.rsplit('/')[2] is the host part of the root URL)
            if root in self.links or self.url.rsplit('/')[2] not in root:
                continue
            # only follow https links
            if "https" not in root:
                continue

            # skip URLs that contain any excluded substring
            if any(element in root for element in self.exclude):
                continue

            self.logger.info(f"{len(self.links)} {root}")
            try:
                site = requests.get(root, headers=self.header_values)
                tree = html.fromstring(site.content)
                links = tree.xpath('//a/@href')
            except Exception:
                continue

            # resolve relative URLs before deduplicating, so the same
            # target is not queued twice
            nlinks = []
            for link in links:
                if not link.startswith("http"):
                    link = urljoin(site.url, link)
                if link not in nlinks:
                    nlinks.append(link)

            unchecked += nlinks
            self.links[root] = nlinks
            sleep(sleep_time)

    def getNodesEdges(self):
        # flatten the link map into node and edge lists for graph building
        nodes = []
        edges = []
        for key, value in self.links.items():
            nodes.append(key)
            for edge in value:
                edges.append([key, edge])

        return nodes, edges

    def makeGraph(self, g):
        # g can be any graph object exposing add_node/add_edge,
        # e.g. a networkx.Graph
        nodes, edges = self.getNodesEdges()
        for node in nodes:
            g.add_node(node)
        for f, t in edges:
            g.add_edge(f, t)
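

# Minimal usage sketch, not part of the original module: the root URL,
# the "logout" exclude pattern, and the output path are placeholders;
# networkx is assumed only because its Graph exposes the add_node/add_edge
# methods that makeGraph expects.
if __name__ == "__main__":
    import networkx as nx

    crawler = Crawler(exclude=["logout"])
    crawler.run("https://example.com", limit=50, sleep_time=1)
    crawler.persist("links.json")

    graph = nx.Graph()
    crawler.makeGraph(graph)
    print(f"{graph.number_of_nodes()} nodes, {graph.number_of_edges()} edges")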