renamed and rewrote the crawler to be iterative instead of recursive
This commit is contained in:
parent
7d5f06fc4a
commit
56b2046896
.gitignore
@@ -1,3 +1,6 @@
 __pycache__/
 .vscode/
+cached/beauty.json
+cached/www.budgetbytes.com.json
+templates/data.js
111 Star.py
@@ -0,0 +1,111 @@
from time import sleep, time
from urllib.parse import urljoin
from lxml import html
from networkx.readwrite.json_graph import tree
import requests
import logging
import networkx as nx
from pyvis.network import Network
import matplotlib.pyplot as plt
import os


class Crawler:
    url = "" # the url of the website to be checked
    links = dict() # dic. with all sites and urls on those sites
    header_values = {
        'Connection:': 'Keep-alive',
        'name': 'Michael Foord',
        'location': 'Northampton',
        'language': 'English',
        'User-Agent': 'Mozilla 4/0'}

    exclude = [
        "login",
        "#",
        "share",
        "wp-content",
        "wprm_print",
        "reddit",
        "facebook",
        "twitter",
        "instagram",
        "mailto",
        '"',
        "'"
    ]

    def __init__(self, logger=None, exclude=None):
        if exclude:
            self.exclude += exclude
        if logger:
            self.logger = logger
        else:
            self.logger = logging.Logger(
                name="star_crawler", level=logging.INFO)

    def run(self, root, limit, sleep_time=0):
        self.url = root
        nlinks = [root]

        while nlinks and len(self.links) < limit:
            root = nlinks.pop()
            if root in self.links or self.url.rsplit('/')[2] not in root:
                return
            if "https" not in root:
                return
            for element in self.exclude:
                if element in root:
                    return
            self.logger.info(root)
            try:
                site = requests.get(root)
                tree = html.fromstring(site.content)
                links = tree.xpath('//a/@href')
            except:
                return

            nlinks = []
            for link in links:
                if link not in nlinks:
                    if link.startswith("http"):
                        nlinks.append(link)
                    else:
                        nlinks.append(urljoin(site.url, link))

            self.links[root] = nlinks
            sleep(sleep_time)

    def getNodesEdges(self):
        nodes = []
        edges = []
        for key, value in self.links.items():
            nodes.append(key)
            for edge in value:
                edges.append([key, edge])

        return nodes, edges

    def makeGraph(self, g):
        nodes, edges = self.getNodesEdges()
        for node in nodes:
            g.add_node(node)
        for f, t in edges:
            g.add_edge(f,t)

    def draw(self):
        net = Network(directed=True, layout=False, bgcolor="black", font_color="white")
        G = nx.DiGraph()
        self.makeGraph(G)
        net.from_nx(G)
        net.height = "100%"
        net.width = "100%"
        net.margin = "0"
        net.padding = "0"

        net.show(os.path.join(os.path.dirname(__file__), './mygraph.html'))
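Note on the new run() loop: the three return statements inside the while body end the whole crawl as soon as one popped URL is already known, off-domain, non-https, or excluded, and nlinks = [] replaces the pending frontier with only the current page's links. A minimal sketch of the iterative pattern the commit message describes, assuming the intent is to skip such URLs and keep a growing frontier (the helper name crawl and the timeout are illustrative, not part of the commit):

# Hypothetical sketch, not the committed code: same filtering as Crawler.run(),
# but `continue` skips a URL instead of ending the crawl, and the frontier is
# extended rather than replaced.
from collections import deque
from urllib.parse import urljoin

import requests
from lxml import html


def crawl(start, limit, exclude=()):
    links = {}                 # url -> outgoing links, like Crawler.links
    frontier = deque([start])  # explicit worklist instead of recursion
    domain = start.rsplit('/')[2]

    while frontier and len(links) < limit:
        url = frontier.popleft()
        if url in links or domain not in url or not url.startswith("https"):
            continue
        if any(pattern in url for pattern in exclude):
            continue
        try:
            site = requests.get(url, timeout=10)
            hrefs = html.fromstring(site.content).xpath('//a/@href')
        except Exception:
            continue
        found = [h if h.startswith("http") else urljoin(site.url, h) for h in hrefs]
        links[url] = found
        frontier.extend(found)  # grow the frontier instead of replacing it

    return links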
61 URL.py
@@ -1,61 +0,0 @@
from urllib.parse import urljoin
from lxml import html
import requests


class URL:

    url = "" # the url of the website to be checked
    sites = dict() # dic. with all sites and urls on those sites
    header_values = {
        'Connection:' : 'Keep-alive',
        'name' : 'Michael Foord',
        'location' : 'Northampton',
        'language' : 'English',
        'User-Agent': 'Mozilla 4/0'}

    exclude = [
        "title=Spezial",
        "doodles",
        "#",
        "&"
    ]

    def __init__(self, url):
        self.url = url

    def run_check(self, root=None): # root is the url of the current Site

        if root in self.sites or self.url.rsplit('/')[2] not in root:
            #print(self.url.rsplit('/')[2])
            return
        if "https" not in root:
            return
        for element in self.exclude:
            if element in root:
                return
        print(root)
        try:
            site = requests.get(root)
            tree = html.fromstring(site.content)
            links = tree.xpath('//a/@href')
            #print(links)
        except:
            return

        nlinks = []
        for link in links:
            if link not in nlinks:
                if link.startswith("http"):
                    nlinks.append(link)
                else:
                    nlinks.append(urljoin(site.url, link))

        self.sites[root] = nlinks

        for each_link in nlinks:
            self.run_check(each_link)
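For context on why the recursion was dropped: the removed run_check() called itself once per discovered link, so crawl depth was bounded by Python's recursion limit and there was no way to cap the number of visited pages. A stripped-down illustration of the recursion-to-worklist transformation the rewrite applies (neighbours is a hypothetical stand-in for "links found on a page"):

# Both traversals visit the same nodes; only the second can enforce a page
# limit and cannot exhaust the interpreter's call stack.
def visit_recursive(node, neighbours, seen):
    if node in seen:
        return
    seen.add(node)
    for nxt in neighbours(node):                # one stack frame per link, so deep
        visit_recursive(nxt, neighbours, seen)  # sites can hit the recursion limit


def visit_iterative(start, neighbours, limit):
    seen = set()
    todo = [start]
    while todo and len(seen) < limit:
        node = todo.pop()
        if node in seen:
            continue
        seen.add(node)
        todo.extend(neighbours(node))
    return seen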
13 app.py
@@ -1,6 +1,6 @@
 from flask import Flask, request, render_template
 import os
-from URL import URL
+from Star import Crawler
 import json
 import sys
 #----------------------------------------------------------------------------#

@@ -11,7 +11,7 @@ app = Flask(__name__)

 def graph(url):
-    obj = URL(url)
+    obj = Crawler(url)
     obj.run_check(url)

     current = os.path.dirname(__file__)

@@ -57,10 +57,10 @@ def load(url):
 @app.route('/')
 def index():
-    url = request.args.get("url")
+    url = "beauty"

     cached = os.listdir(os.path.join(os.path.dirname(__file__), "./cached"))
-    withoutProtocol = url.rsplit('/')[2]
+    withoutProtocol = url
     if withoutProtocol + '.json' not in cached:
         nodes, edges = graph(url)
     else:

@@ -69,8 +69,9 @@ def index():
     str1 = ","
     nodes = str1.join(nodes)
     edges = str1.join(edges)
-    return render_template('graph.html', nodes = nodes, edges = edges)
+    with open("./templates/data.js", "w") as f:
+        f.write(f"var nodes={nodes}\nvar=edges={edges}")
+    return render_template('graph.html')


 if __name__ == '__main__':
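Two things worth flagging in this app.py hunk, hedged because only these lines are visible: graph() still constructs the object as Crawler(url) and calls obj.run_check(url), while the new class takes a logger in __init__ and exposes run(root, limit); and the f-string written to templates/data.js produces var=edges=..., which looks like a typo for var edges=..., with the comma-joined lists not forming JavaScript array literals. A sketch of what the data.js hand-off presumably intends (the helper name and the use of json.dumps are illustrative):

# Hypothetical helper: serialise the crawl result as valid JS assignments that
# templates/graph.html could load with a plain <script src="..."> tag.
import json


def write_data_js(nodes, edges, path="./templates/data.js"):
    with open(path, "w") as f:
        f.write(f"var nodes = {json.dumps(nodes)};\n")
        f.write(f"var edges = {json.dumps(edges)};\n")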
@@ -0,0 +1,108 @@
<html>
<head>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/vis/4.16.1/vis.css" type="text/css" />
<script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/vis/4.16.1/vis-network.min.js"> </script>
<center>
<h1></h1>
</center>

<!-- <link rel="stylesheet" href="../node_modules/vis/dist/vis.min.css" type="text/css" />
<script type="text/javascript" src="../node_modules/vis/dist/vis.js"> </script>-->

<style type="text/css">
#mynetwork {
    width: 100%;
    height: 100%;
    background-color: black;
    border: 1px solid lightgray;
    position: relative;
    float: left;
}
</style>
</head>

<body>
<div id = "mynetwork"></div>

<script type="text/javascript">

// initialize global variables.
var edges;
var nodes;
var network;
var container;
var options, data;

// This method is responsible for drawing the graph, returns the drawn network
function drawGraph() {
var container = document.getElementById('mynetwork');

// parsing and collecting nodes and edges from the python
nodes = new vis.DataSet([{"font": {"color": "white"}, "id": "https://www.patricematz.de/", "label": "https://www.patricematz.de/", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "mailto:mail@patricematz.de", "label": "mailto:mail@patricematz.de", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "label": "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://github.com/Askill", "label": "https://github.com/Askill", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://www.patricematz.de/images/praktikum.pdf", "label": "https://www.patricematz.de/images/praktikum.pdf", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://www.patricematz.de/images/bachelor.pdf", "label": "https://www.patricematz.de/images/bachelor.pdf", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "label": "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://fs.jpmatz.de", "label": "https://fs.jpmatz.de", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://github.com/Askill/Inverse-Rezeptsuche", "label": "https://github.com/Askill/Inverse-Rezeptsuche", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://irs.projects.patricematz.de/", "label": "https://irs.projects.patricematz.de/", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://github.com/Askill/Video-Synopsis", "label": "https://github.com/Askill/Video-Synopsis", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://github.com/Askill/UI", "label": "https://github.com/Askill/UI", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://github.com/Askill/Photo-Wall", "label": "https://github.com/Askill/Photo-Wall", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://www.jpmatz.de/blog.html", "label": "https://www.jpmatz.de/blog.html", "shape": "dot", "size": 10}]);
edges = new vis.DataSet([{"arrows": "to", "from": "https://www.patricematz.de/", "to": "mailto:mail@patricematz.de", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://github.com/Askill", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://www.patricematz.de/images/praktikum.pdf", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://www.patricematz.de/images/bachelor.pdf", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://fs.jpmatz.de", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://github.com/Askill/Inverse-Rezeptsuche", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://irs.projects.patricematz.de/", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://github.com/Askill/Video-Synopsis", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://github.com/Askill/UI", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://github.com/Askill/Photo-Wall", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://www.jpmatz.de/blog.html", "weight": 1}]);

// adding nodes and edges to the graph
data = {nodes: nodes, edges: edges};

var options = {
    "configure": {
        "enabled": false
    },
    "edges": {
        "color": {
            "inherit": true
        },
        "smooth": {
            "enabled": false,
            "type": "continuous"
        }
    },
    "interaction": {
        "dragNodes": true,
        "hideEdgesOnDrag": false,
        "hideNodesOnDrag": false
    },
    "physics": {
        "enabled": true,
        "stabilization": {
            "enabled": true,
            "fit": true,
            "iterations": 1000,
            "onlyDynamicEdges": false,
            "updateInterval": 50
        }
    }
};

network = new vis.Network(container, data, options);

return network;
}

drawGraph();

</script>
</body>
</html>
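The hard-coded vis.DataSet(...) calls above appear to be what pyvis emits when Crawler.draw() calls net.show(); the page is a rendered snapshot rather than a template that reads templates/data.js. End to end, the new class would be driven roughly like this (URL, limit, and sleep time are placeholders, not values from the commit):

# Illustrative usage of the new Crawler; the logger setup is an assumption,
# since logging.Logger(...) in __init__ has no handler attached by default.
import logging

from Star import Crawler

logging.basicConfig(level=logging.INFO)
crawler = Crawler(logger=logging.getLogger("star_crawler"))
crawler.run("https://www.example.com/", limit=50, sleep_time=1)
crawler.draw()  # builds a networkx DiGraph and writes mygraph.html via pyvis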