fixed crawling

This commit is contained in:
Askill 2021-12-29 00:10:10 +01:00
parent 56b2046896
commit 2ac3dd62e9
6 changed files with 36 additions and 36 deletions

18
Star.py
View File

@ -42,32 +42,31 @@ class Crawler:
if logger:
self.logger = logger
else:
self.logger = logging.Logger(
name="star_crawler", level=logging.INFO)
def run(self, root, limit, sleep_time=0):
self.url = root
nlinks = [root]
unchecked = [root]
while nlinks and len(self.links) < limit:
root = nlinks.pop()
while unchecked and len(self.links) < limit:
root = unchecked.pop()
if root in self.links or self.url.rsplit('/')[2] not in root:
return
continue
if "https" not in root:
return
continue
for element in self.exclude:
if element in root:
return
continue
self.logger.info(root)
try:
site = requests.get(root)
tree = html.fromstring(site.content)
links = tree.xpath('//a/@href')
except:
return
continue
nlinks = []
nlinks=[]
for link in links:
if link not in nlinks:
if link.startswith("http"):
@ -75,6 +74,7 @@ class Crawler:
else:
nlinks.append(urljoin(site.url, link))
unchecked += nlinks
self.links[root] = nlinks
sleep(sleep_time)

49
app.py
View File

@ -9,34 +9,36 @@ import sys
app = Flask(__name__)
def graph(url):
obj = Crawler(url)
obj.run_check(url)
current = os.path.dirname(__file__)
def transformForDrawing(n, e):
nodes = []
drawn = []
edges = []
for key, values in obj.sites.items():
label = key.rsplit('/')[-1]
for nn in n:
label = nn.rsplit('/')[-1]
if label == "":
label = key.rsplit('/')[-2]
nodes.append('{' + "id: '{}', label: '{}', group: {}".format(key, label, 0) + '}')
drawn.append(key)
label = nn.rsplit('/')[-2]
nodes.append('{' + "id: '{}', label: '{}', group: {}".format(nn, label, 0) + '}\n')
drawn.append(nn)
for value in values:
if value not in drawn and value not in obj.sites:
nodes.append('{' + "id: '{}', label: '{}', group: {}".format(value, value, 1) + '}')
drawn.append(value)
for ee in e:
if ee[1] not in drawn and ee[1] not in n:
nodes.append('{' + "id: '{}', label: '{}', group: {}".format(ee[1], ee[1], 1) + '}\n')
drawn.append(ee[1])
for value in values:
edges.append('{' + "from: '{}', to: '{}'".format(key, value) + '}')
edges.append('{' + "from: '{}', to: '{}'".format(ee[0], ee[1]) + '}\n')
return nodes, edges
def graph(url):
obj = Crawler()
obj.run(url, 5000)
current = os.path.dirname(__file__)
n, e = obj.getNodesEdges()
with open(os.path.join(current, './cached/' + url.rsplit('/')[2] + '.json'), 'w', encoding='utf-8') as f:
f.write(json.dumps({"nodes": nodes,"edges": edges}))
f.write(json.dumps({"nodes": n,"edges": e}))
nodes, edges = transformForDrawing(n, e)
return nodes, edges
@ -48,6 +50,7 @@ def load(url):
jsonContent = json.loads(content)
nodes = jsonContent["nodes"]
edges = jsonContent["edges"]
nodes, edges = transformForDrawing(nodes, edges)
return nodes, edges
#----------------------------------------------------------------------------#
@ -57,8 +60,7 @@ def load(url):
@app.route('/')
def index():
url = "beauty"
url = request.args.get("url")
cached = os.listdir(os.path.join(os.path.dirname(__file__), "./cached"))
withoutProtocol = url
if withoutProtocol + '.json' not in cached:
@ -69,9 +71,8 @@ def index():
str1 = ","
nodes = str1.join(nodes)
edges = str1.join(edges)
with open("./templates/data.js", "w") as f:
f.write(f"var nodes={nodes}\nvar=edges={edges}")
return render_template('graph.html')
return render_template('graph.html', nodes = nodes, edges = edges)
if __name__ == '__main__':

View File

@ -1 +0,0 @@
{"nodes": ["{id: 'https://budgetbytes.lpages.co/budget-bytes-app/', label: 'budget-bytes-app', group: 0}", "{id: 'https://itunes.apple.com/app/apple-store/id1240079167?pt=95954917&ct=Landing%20page&mt=8', label: 'https://itunes.apple.com/app/apple-store/id1240079167?pt=95954917&ct=Landing%20page&mt=8', group: 1}", "{id: 'https://goo.gl/wiZ4FQ', label: 'https://goo.gl/wiZ4FQ', group: 1}", "{id: 'mailto:budgetbytesapp@sidechef.com', label: 'mailto:budgetbytesapp@sidechef.com', group: 1}", "{id: 'https://www.budgetbytes.com/', label: 'https://www.budgetbytes.com/', group: 1}", "{id: 'https://www.sidechef.com/', label: 'https://www.sidechef.com/', group: 1}"], "edges": ["{from: 'https://budgetbytes.lpages.co/budget-bytes-app/', to: 'https://itunes.apple.com/app/apple-store/id1240079167?pt=95954917&ct=Landing%20page&mt=8'}", "{from: 'https://budgetbytes.lpages.co/budget-bytes-app/', to: 'https://goo.gl/wiZ4FQ'}", "{from: 'https://budgetbytes.lpages.co/budget-bytes-app/', to: 'mailto:budgetbytesapp@sidechef.com'}", "{from: 'https://budgetbytes.lpages.co/budget-bytes-app/', to: 'https://www.budgetbytes.com/'}", "{from: 'https://budgetbytes.lpages.co/budget-bytes-app/', to: 'https://www.sidechef.com/'}"]}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,7 +1,7 @@
<html>
<script src="https://cdnjs.cloudflare.com/ajax/libs/vis/4.21.0/vis.min.js"></script>
<link rel="stylesheet" type="text/css" href="https://cdnjs.cloudflare.com/ajax/libs/vis/4.21.0/vis.min.css">
<div id="mynetwork" style = "background-color: grey;"></div>
<div id="mynetwork" style = "background-color: rgb(39, 39, 39);"></div>
<script type="text/javascript">
var color = 'gray';