from time import sleep
from urllib.parse import urljoin, urlparse

import logging
import os

import requests
from lxml import html

import networkx as nx
from pyvis.network import Network


class Crawler:
    """Crawls a website starting from a root URL, records the outgoing links
    of every visited page, and can render the result as an interactive graph."""

    url = ""  # the url of the website to be checked
    links = dict()  # dict mapping each visited page to the urls found on it

    # headers sent with every page request
    header_values = {
        'Connection': 'keep-alive',
        'name': 'Michael Foord',
        'location': 'Northampton',
        'language': 'English',
        'User-Agent': 'Mozilla/4.0'}

    # substrings that disqualify a URL from being crawled
    exclude = [
        "login",
        "#",
        "share",
        "wp-content",
        "wprm_print",
        "reddit",
        "facebook",
        "twitter",
        "instagram",
        "mailto",
        '"',
        "'"
    ]

    def __init__(self, logger=None, exclude=None):
        self.links = {}  # per-instance link map so instances do not share state
        if exclude:
            # copy before extending so the class-level default is not mutated
            self.exclude = self.exclude + exclude
        if logger:
            self.logger = logger
        else:
            self.logger = logging.getLogger("star_crawler")
            self.logger.setLevel(logging.INFO)

    def run(self, root, limit, sleep_time=0):
        """Crawl from the root URL until the queue is empty or `limit` pages
        have been visited, waiting `sleep_time` seconds between requests."""
        self.url = root
        unchecked = [root]
        while unchecked and len(self.links) < limit:
            root = unchecked.pop()
            # skip pages already visited and links that leave the start domain
            if root in self.links or urlparse(self.url).netloc not in root:
                continue
            # only follow https links
            if "https" not in root:
                continue

            # drop the link if it contains any excluded substring
            if any(element in root for element in self.exclude):
                continue

            self.logger.warning(f"{len(self.links)} {root}")
            try:
                site = requests.get(root, headers=self.header_values)
                tree = html.fromstring(site.content)
                links = tree.xpath('//a/@href')  # all anchor targets on the page
            except Exception:
                continue

            # normalise the links: deduplicate and resolve relative URLs
            nlinks = []
            for link in links:
                if link not in nlinks:
                    if link.startswith("http"):
                        nlinks.append(link)
                    else:
                        nlinks.append(urljoin(site.url, link))

            unchecked += nlinks
            self.links[root] = nlinks
            sleep(sleep_time)

    def getNodesEdges(self):
        nodes = []
        edges = []
        for key, value in self.links.items():
            nodes.append(key)
            for edge in value:
                edges.append([key, edge])

        return nodes, edges

    def makeGraph(self, g):
        nodes, edges = self.getNodesEdges()
        for node in nodes:
            g.add_node(node)
        for f, t in edges:
            g.add_edge(f, t)

    def draw(self):
        # render the crawl graph as an interactive HTML page with pyvis
        net = Network(directed=True, layout=False, bgcolor="black", font_color="white")
        G = nx.DiGraph()
        self.makeGraph(G)
        net.from_nx(G)
        net.height = "100%"
        net.width = "100%"
        net.margin = "0"
        net.padding = "0"

        net.show(os.path.join(os.path.dirname(__file__), 'mygraph.html'))
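

# Illustrative usage sketch (not part of the original module): the start URL,
# page limit, and sleep interval below are placeholder values chosen only to
# show how the Crawler is intended to be driven.
if __name__ == "__main__":
    crawler = Crawler()
    crawler.run("https://example.com", limit=50, sleep_time=1)
    crawler.draw()  # writes mygraph.html next to this script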