renamed and rewrote to make iterative instead of recursive
parent 7d5f06fc4a
commit 56b2046896
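
The rewrite swaps URL.run_check's one-call-per-page recursion for an explicit worklist in Crawler.run. A minimal sketch of that pattern (illustrative only; fetch_links is a hypothetical helper, not code from this commit):

    # recursive form (old URL.run_check): one call-stack frame per page visited
    def visit(page, seen):
        if page in seen:
            return
        seen.add(page)
        for link in fetch_links(page):  # fetch_links is a hypothetical helper
            visit(link, seen)

    # iterative form (new Crawler.run): an explicit pending list replaces the call stack
    def crawl(root):
        pending, seen = [root], set()
        while pending:
            page = pending.pop()
            if page in seen:
                continue
            seen.add(page)
            pending.extend(fetch_links(page))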
.gitignore
@@ -1,3 +1,6 @@
__pycache__/
.vscode/
cached/beauty.json
cached/www.budgetbytes.com.json
templates/data.js
Star.py
@@ -0,0 +1,111 @@
from time import sleep
from urllib.parse import urljoin

import logging
import os

import requests
import networkx as nx
from lxml import html
from pyvis.network import Network


class Crawler:
    url = ""        # the url of the website to be checked
    links = dict()  # dict mapping each crawled page to the urls found on it
    header_values = {
        'Connection': 'Keep-alive',
        'name': 'Michael Foord',
        'location': 'Northampton',
        'language': 'English',
        'User-Agent': 'Mozilla 4/0'}

    # substrings that disqualify a link from being crawled
    exclude = [
        "login",
        "#",
        "share",
        "wp-content",
        "wprm_print",
        "reddit",
        "facebook",
        "twitter",
        "instagram",
        "mailto",
        '"',
        "'"
    ]

    def __init__(self, logger=None, exclude=None):
        if exclude:
            self.exclude += exclude
        if logger:
            self.logger = logger
        else:
            self.logger = logging.Logger(
                name="star_crawler", level=logging.INFO)

    def run(self, root, limit, sleep_time=0):
        self.url = root
        nlinks = [root]

        # iterative crawl: pop a pending url, validate it, fetch it,
        # record its links and queue them for later visits
        while nlinks and len(self.links) < limit:
            root = nlinks.pop()
            # skip pages already crawled or outside the root domain
            if root in self.links or self.url.rsplit('/')[2] not in root:
                continue
            if "https" not in root:
                continue
            if any(element in root for element in self.exclude):
                continue
            self.logger.info(root)
            try:
                site = requests.get(root)
                tree = html.fromstring(site.content)
                links = tree.xpath('//a/@href')
            except Exception:
                continue

            found = []
            for link in links:
                if link not in found:
                    if link.startswith("http"):
                        found.append(link)
                    else:
                        found.append(urljoin(site.url, link))

            self.links[root] = found
            nlinks.extend(found)  # extend the queue instead of replacing it
            sleep(sleep_time)

    def getNodesEdges(self):
        nodes = []
        edges = []
        for key, value in self.links.items():
            nodes.append(key)
            for edge in value:
                edges.append([key, edge])

        return nodes, edges

    def makeGraph(self, g):
        nodes, edges = self.getNodesEdges()
        for node in nodes:
            g.add_node(node)
        for f, t in edges:
            g.add_edge(f, t)

    def draw(self):
        net = Network(directed=True, layout=False, bgcolor="black", font_color="white")
        G = nx.DiGraph()
        self.makeGraph(G)
        net.from_nx(G)
        net.height = "100%"
        net.width = "100%"
        net.margin = "0"
        net.padding = "0"

        net.show(os.path.join(os.path.dirname(__file__), './mygraph.html'))
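
A minimal usage sketch of the new class (not part of the commit; the URL, limit, and sleep_time are placeholder values):

    from Star import Crawler

    crawler = Crawler()
    crawler.run("https://example.com/", limit=50, sleep_time=1)  # visit at most 50 pages
    crawler.draw()  # writes mygraph.html next to Star.py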
URL.py
@@ -1,61 +0,0 @@
from urllib.parse import urljoin
from lxml import html
import requests


class URL:

    url = ""  # the url of the website to be checked
    sites = dict()  # dic. with all sites and urls on those sites
    header_values = {
        'Connection:': 'Keep-alive',
        'name': 'Michael Foord',
        'location': 'Northampton',
        'language': 'English',
        'User-Agent': 'Mozilla 4/0'}

    exclude = [
        "title=Spezial",
        "doodles",
        "#",
        "&"
    ]

    def __init__(self, url):
        self.url = url

    def run_check(self, root=None):  # root is the url of the current site
        if root in self.sites or self.url.rsplit('/')[2] not in root:
            # print(self.url.rsplit('/')[2])
            return
        if "https" not in root:
            return
        for element in self.exclude:
            if element in root:
                return
        print(root)
        try:
            site = requests.get(root)
            tree = html.fromstring(site.content)
            links = tree.xpath('//a/@href')
            # print(links)
        except:
            return

        nlinks = []
        for link in links:
            if link not in nlinks:
                if link.startswith("http"):
                    nlinks.append(link)
                else:
                    nlinks.append(urljoin(site.url, link))

        self.sites[root] = nlinks

        for each_link in nlinks:
            self.run_check(each_link)
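
The removed class recursed once per crawled page, so any crawl deeper than Python's recursion limit (1000 frames by default) would raise RecursionError; the worklist loop in Crawler.run avoids that. Illustrative check:

    import sys
    print(sys.getrecursionlimit())  # 1000 by default; a deep run_check crawl could exceed it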
app.py
@@ -1,6 +1,6 @@
 from flask import Flask, request, render_template
 import os
-from URL import URL
+from Star import Crawler
 import json
 import sys
 #----------------------------------------------------------------------------#
@@ -11,7 +11,7 @@ app = Flask(__name__)
 def graph(url):
-    obj = URL(url)
-    obj.run_check(url)
+    obj = Crawler()
+    obj.run(url, 100)  # 100 is an assumed page limit; Crawler.run(root, limit) requires one

     current = os.path.dirname(__file__)
@@ -57,10 +57,10 @@ def load(url)
 @app.route('/')
 def index():
-    url = request.args.get("url")
+    url = "beauty"

     cached = os.listdir(os.path.join(os.path.dirname(__file__), "./cached"))
-    withoutProtocol = url.rsplit('/')[2]
+    withoutProtocol = url
     if withoutProtocol + '.json' not in cached:
         nodes, edges = graph(url)
     else:
@@ -69,8 +69,9 @@
     str1 = ","
     nodes = str1.join(nodes)
     edges = str1.join(edges)

-    return render_template('graph.html', nodes = nodes, edges = edges)
+    with open("./templates/data.js", "w") as f:
+        f.write(f"var nodes={nodes}\nvar edges={edges}")
+    return render_template('graph.html')


 if __name__ == '__main__':
@@ -0,0 +1,108 @@
<html>
<head>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/vis/4.16.1/vis.css" type="text/css" />
<script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/vis/4.16.1/vis-network.min.js"></script>
<center>
<h1></h1>
</center>

<!-- <link rel="stylesheet" href="../node_modules/vis/dist/vis.min.css" type="text/css" />
<script type="text/javascript" src="../node_modules/vis/dist/vis.js"></script> -->

<style type="text/css">
#mynetwork {
    width: 100%;
    height: 100%;
    background-color: black;
    border: 1px solid lightgray;
    position: relative;
    float: left;
}
</style>
</head>

<body>
<div id="mynetwork"></div>

<script type="text/javascript">
// initialize global variables.
var edges;
var nodes;
var network;
var container;
var options, data;

// This method is responsible for drawing the graph, returns the drawn network
function drawGraph() {
var container = document.getElementById('mynetwork');

// parsing and collecting nodes and edges from the python
nodes = new vis.DataSet([{"font": {"color": "white"}, "id": "https://www.patricematz.de/", "label": "https://www.patricematz.de/", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "mailto:mail@patricematz.de", "label": "mailto:mail@patricematz.de", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "label": "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://github.com/Askill", "label": "https://github.com/Askill", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://www.patricematz.de/images/praktikum.pdf", "label": "https://www.patricematz.de/images/praktikum.pdf", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://www.patricematz.de/images/bachelor.pdf", "label": "https://www.patricematz.de/images/bachelor.pdf", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "label": "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://fs.jpmatz.de", "label": "https://fs.jpmatz.de", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://github.com/Askill/Inverse-Rezeptsuche", "label": "https://github.com/Askill/Inverse-Rezeptsuche", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://irs.projects.patricematz.de/", "label": "https://irs.projects.patricematz.de/", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://github.com/Askill/Video-Synopsis", "label": "https://github.com/Askill/Video-Synopsis", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://github.com/Askill/UI", "label": "https://github.com/Askill/UI", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://github.com/Askill/Photo-Wall", "label": "https://github.com/Askill/Photo-Wall", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://www.jpmatz.de/blog.html", "label": "https://www.jpmatz.de/blog.html", "shape": "dot", "size": 10}]);
edges = new vis.DataSet([{"arrows": "to", "from": "https://www.patricematz.de/", "to": "mailto:mail@patricematz.de", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://github.com/Askill", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://www.patricematz.de/images/praktikum.pdf", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://www.patricematz.de/images/bachelor.pdf", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://fs.jpmatz.de", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://github.com/Askill/Inverse-Rezeptsuche", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://irs.projects.patricematz.de/", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://github.com/Askill/Video-Synopsis", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://github.com/Askill/UI", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://github.com/Askill/Photo-Wall", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://www.jpmatz.de/blog.html", "weight": 1}]);

// adding nodes and edges to the graph
data = {nodes: nodes, edges: edges};

var options = {
    "configure": {
        "enabled": false
    },
    "edges": {
        "color": {
            "inherit": true
        },
        "smooth": {
            "enabled": false,
            "type": "continuous"
        }
    },
    "interaction": {
        "dragNodes": true,
        "hideEdgesOnDrag": false,
        "hideNodesOnDrag": false
    },
    "physics": {
        "enabled": true,
        "stabilization": {
            "enabled": true,
            "fit": true,
            "iterations": 1000,
            "onlyDynamicEdges": false,
            "updateInterval": 50
        }
    }
};

network = new vis.Network(container, data, options);

return network;
}

drawGraph();
</script>
</body>
</html>