renamed and rewrote to make the crawler iterative instead of recursive

Askill 2021-12-28 22:32:36 +01:00
parent 7d5f06fc4a
commit 56b2046896
6 changed files with 235 additions and 67 deletions
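The rewrite replaces the recursive descent of URL.run_check with an explicit work list in Crawler.run, which avoids Python's recursion limit on deep link graphs and makes it easy to cap the crawl at a page limit. A minimal sketch of that pattern, assuming a hypothetical fetch_links helper standing in for the requests/lxml fetch (it is not part of this commit):

    import requests
    from lxml import html
    from urllib.parse import urljoin

    def fetch_links(url):
        # Hypothetical helper: fetch a page and return its hrefs as absolute urls.
        page = requests.get(url)
        tree = html.fromstring(page.content)
        return [urljoin(page.url, href) for href in tree.xpath('//a/@href')]

    def crawl_iterative(root, limit):
        # The frontier list replaces the call stack of the recursive version.
        seen, frontier = {}, [root]
        while frontier and len(seen) < limit:
            url = frontier.pop()
            if url in seen:
                continue          # where the recursive version returned early
            links = fetch_links(url)
            seen[url] = links
            frontier.extend(links)  # enqueue instead of recursing
        return seen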

3
.gitignore vendored

@@ -1,3 +1,6 @@
__pycache__/
.vscode/
cached/beauty.json
cached/www.budgetbytes.com.json
templates/data.js

111
Star.py Normal file

@@ -0,0 +1,111 @@
from time import sleep
from urllib.parse import urljoin
from lxml import html
import requests
import logging
import networkx as nx
from pyvis.network import Network
import os


class Crawler:
    url = ""        # the url of the website to be checked
    links = dict()  # dict with all visited sites and the urls found on them
    header_values = {
        'Connection': 'Keep-alive',
        'name': 'Michael Foord',
        'location': 'Northampton',
        'language': 'English',
        'User-Agent': 'Mozilla 4/0'}
    exclude = [
        "login",
        "#",
        "share",
        "wp-content",
        "wprm_print",
        "reddit",
        "facebook",
        "twitter",
        "instagram",
        "mailto",
        '"',
        "'"
    ]

    def __init__(self, logger=None, exclude=None):
        if exclude:
            self.exclude += exclude
        if logger:
            self.logger = logger
        else:
            self.logger = logging.Logger(
                name="star_crawler", level=logging.INFO)

    def run(self, root, limit, sleep_time=0):
        self.url = root
        nlinks = [root]  # frontier of urls still to be visited
        while nlinks and len(self.links) < limit:
            root = nlinks.pop()
            # skip urls already crawled or outside the start domain
            if root in self.links or self.url.rsplit('/')[2] not in root:
                continue
            if "https" not in root:
                continue
            if any(element in root for element in self.exclude):
                continue
            self.logger.info(root)
            try:
                site = requests.get(root)
                tree = html.fromstring(site.content)
                links = tree.xpath('//a/@href')
            except Exception:
                continue
            # collect this page's outgoing links as absolute urls
            page_links = []
            for link in links:
                if link not in page_links:
                    if link.startswith("http"):
                        page_links.append(link)
                    else:
                        page_links.append(urljoin(site.url, link))
            self.links[root] = page_links
            nlinks.extend(page_links)  # grow the frontier instead of replacing it
            sleep(sleep_time)

    def getNodesEdges(self):
        nodes = []
        edges = []
        for key, value in self.links.items():
            nodes.append(key)
            for edge in value:
                edges.append([key, edge])
        return nodes, edges

    def makeGraph(self, g):
        nodes, edges = self.getNodesEdges()
        for node in nodes:
            g.add_node(node)
        for f, t in edges:
            g.add_edge(f, t)

    def draw(self):
        net = Network(directed=True, layout=False, bgcolor="black", font_color="white")
        G = nx.DiGraph()
        self.makeGraph(G)
        net.from_nx(G)
        net.height = "100%"
        net.width = "100%"
        net.margin = "0"
        net.padding = "0"
        net.show(os.path.join(os.path.dirname(__file__), './mygraph.html'))

61
URL.py

@@ -1,61 +0,0 @@
from urllib.parse import urljoin
from lxml import html
import requests


class URL:
    url = ""        # the url of the website to be checked
    sites = dict()  # dic. with all sites and urls on those sites
    header_values = {
        'Connection:': 'Keep-alive',
        'name': 'Michael Foord',
        'location': 'Northampton',
        'language': 'English',
        'User-Agent': 'Mozilla 4/0'}
    exclude = [
        "title=Spezial",
        "doodles",
        "#",
        "&"
    ]

    def __init__(self, url):
        self.url = url

    def run_check(self, root=None):  # root is the url of the current Site
        if root in self.sites or self.url.rsplit('/')[2] not in root:
            #print(self.url.rsplit('/')[2])
            return
        if "https" not in root:
            return
        for element in self.exclude:
            if element in root:
                return
        print(root)
        try:
            site = requests.get(root)
            tree = html.fromstring(site.content)
            links = tree.xpath('//a/@href')
            #print(links)
        except:
            return
        nlinks = []
        for link in links:
            if link not in nlinks:
                if link.startswith("http"):
                    nlinks.append(link)
                else:
                    nlinks.append(urljoin(site.url, link))
        self.sites[root] = nlinks
        for each_link in nlinks:
            self.run_check(each_link)

13
app.py

@@ -1,6 +1,6 @@
 from flask import Flask, request, render_template
 import os
-from URL import URL
+from Star import Crawler
 import json
 import sys
 #----------------------------------------------------------------------------#
@@ -11,7 +11,7 @@ app = Flask(__name__)
 def graph(url):
-    obj = URL(url)
-    obj.run_check(url)
+    obj = Crawler()
+    obj.run(url, 5000)
     current = os.path.dirname(__file__)
@@ -57,10 +57,10 @@ def load(url):
 @app.route('/')
 def index():
-    url = request.args.get("url")
+    url = "beauty"
     cached = os.listdir(os.path.join(os.path.dirname(__file__), "./cached"))
-    withoutProtocol = url.rsplit('/')[2]
+    withoutProtocol = url
     if withoutProtocol + '.json' not in cached:
         nodes, edges = graph(url)
     else:
@@ -69,8 +69,9 @@ def index():
     str1 = ","
     nodes = str1.join(nodes)
     edges = str1.join(edges)
-    return render_template('graph.html', nodes = nodes, edges = edges)
+    with open("./templates/data.js", "w") as f:
+        f.write(f"var nodes={nodes}\nvar edges={edges}")
+    return render_template('graph.html')
 if __name__ == '__main__':

108
mygraph.html Normal file

@@ -0,0 +1,108 @@
<html>
<head>
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/vis/4.16.1/vis.css" type="text/css" />
    <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/vis/4.16.1/vis-network.min.js"> </script>
    <center>
        <h1></h1>
    </center>
    <!-- <link rel="stylesheet" href="../node_modules/vis/dist/vis.min.css" type="text/css" />
    <script type="text/javascript" src="../node_modules/vis/dist/vis.js"> </script>-->
    <style type="text/css">
        #mynetwork {
            width: 100%;
            height: 100%;
            background-color: black;
            border: 1px solid lightgray;
            position: relative;
            float: left;
        }
    </style>
</head>
<body>
    <div id="mynetwork"></div>
    <script type="text/javascript">
        // initialize global variables.
        var edges;
        var nodes;
        var network;
        var container;
        var options, data;

        // This method is responsible for drawing the graph, returns the drawn network
        function drawGraph() {
            var container = document.getElementById('mynetwork');
            // parsing and collecting nodes and edges from the python
nodes = new vis.DataSet([{"font": {"color": "white"}, "id": "https://www.patricematz.de/", "label": "https://www.patricematz.de/", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "mailto:mail@patricematz.de", "label": "mailto:mail@patricematz.de", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "label": "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://github.com/Askill", "label": "https://github.com/Askill", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://www.patricematz.de/images/praktikum.pdf", "label": "https://www.patricematz.de/images/praktikum.pdf", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://www.patricematz.de/images/bachelor.pdf", "label": "https://www.patricematz.de/images/bachelor.pdf", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "label": "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://fs.jpmatz.de", "label": "https://fs.jpmatz.de", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://github.com/Askill/Inverse-Rezeptsuche", "label": "https://github.com/Askill/Inverse-Rezeptsuche", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://irs.projects.patricematz.de/", "label": "https://irs.projects.patricematz.de/", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://github.com/Askill/Video-Synopsis", "label": "https://github.com/Askill/Video-Synopsis", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://github.com/Askill/UI", "label": "https://github.com/Askill/UI", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://github.com/Askill/Photo-Wall", "label": "https://github.com/Askill/Photo-Wall", "shape": "dot", "size": 10}, {"font": {"color": "white"}, "id": "https://www.jpmatz.de/blog.html", "label": "https://www.jpmatz.de/blog.html", "shape": "dot", "size": 10}]);
edges = new vis.DataSet([{"arrows": "to", "from": "https://www.patricematz.de/", "to": "mailto:mail@patricematz.de", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://github.com/Askill", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://www.patricematz.de/images/praktikum.pdf", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://www.patricematz.de/images/bachelor.pdf", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://fs.jpmatz.de", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://github.com/Askill/Inverse-Rezeptsuche", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://irs.projects.patricematz.de/", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://github.com/Askill/Video-Synopsis", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://github.com/Askill/UI", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://github.com/Askill/Photo-Wall", "weight": 1}, {"arrows": "to", "from": "https://www.patricematz.de/", "to": "https://www.jpmatz.de/blog.html", "weight": 1}]);
            // adding nodes and edges to the graph
            data = {nodes: nodes, edges: edges};

            var options = {
                "configure": {
                    "enabled": false
                },
                "edges": {
                    "color": {
                        "inherit": true
                    },
                    "smooth": {
                        "enabled": false,
                        "type": "continuous"
                    }
                },
                "interaction": {
                    "dragNodes": true,
                    "hideEdgesOnDrag": false,
                    "hideNodesOnDrag": false
                },
                "physics": {
                    "enabled": true,
                    "stabilization": {
                        "enabled": true,
                        "fit": true,
                        "iterations": 1000,
                        "onlyDynamicEdges": false,
                        "updateInterval": 50
                    }
                }
            };

            network = new vis.Network(container, data, options);
            return network;
        }

        drawGraph();
    </script>
</body>
</html>

6
x.py Normal file

@@ -0,0 +1,6 @@
import Star
crawler = Star.Crawler()
crawler.run("https://www.patricematz.de/", 5000)
print(crawler.getNodesEdges())
crawler.draw()