diff --git a/Star.py b/Star.py index 52b87fe..eb01a1c 100644 --- a/Star.py +++ b/Star.py @@ -1,13 +1,8 @@ -from time import sleep, time +from time import sleep from urllib.parse import urljoin from lxml import html -from networkx.readwrite.json_graph import tree import requests import logging -import networkx as nx -from pyvis.network import Network -import matplotlib.pyplot as plt -import os class Crawler: @@ -74,7 +69,7 @@ class Crawler: except: continue - nlinks=[] + nlinks = [] for link in links: if link not in nlinks: if link.startswith("http"): @@ -101,19 +96,4 @@ class Crawler: for node in nodes: g.add_node(node) for f, t in edges: - g.add_edge(f,t) - - - def draw(self): - net = Network(directed=True, layout=False, bgcolor="black", font_color="white") - G = nx.DiGraph() - self.makeGraph(G) - net.from_nx(G) - net.height = "100%" - net.width = "100%" - net.margin = "0" - net.padding = "0" - - net.show(os.path.join(os.path.dirname(__file__), './mygraph.html')) - - + g.add_edge(f, t) diff --git a/app.py b/app.py index 9cf3335..dd967ca 100644 --- a/app.py +++ b/app.py @@ -1,13 +1,8 @@ -from flask import Flask, request, render_template import os from Star import Crawler import json -import sys -#----------------------------------------------------------------------------# -# App Config. 
-#----------------------------------------------------------------------------# +import argparse -app = Flask(__name__) def transformForDrawing(n, e): nodes = [] @@ -33,14 +28,15 @@ def transformForDrawing(n, e): return nodes, edges -def graph(url): + +def graph(url, limit): obj = Crawler() - obj.run(url, 5000) - + obj.run(url, limit) + current = os.path.dirname(__file__) n, e = obj.getNodesEdges() with open(os.path.join(current, './cached/' + url.rsplit('/')[2] + '.json'), 'w', encoding='utf-8') as f: - f.write(json.dumps({"nodes": n,"edges": e})) + f.write(json.dumps({"nodes": n, "edges": e})) nodes, edges = transformForDrawing(n, e) return nodes, edges @@ -49,37 +45,40 @@ def graph(url): def load(url): print("Loaded from cache: " + url) current = os.path.dirname(__file__) - with open(os.path.join(current,'./cached/{}.json'.format(url)), 'r', encoding='utf-8') as f: + with open(os.path.join(current, './cached/{}.json'.format(url)), 'r', encoding='utf-8') as f: content = f.read() jsonContent = json.loads(content) return transformForDrawing(jsonContent["nodes"], jsonContent["edges"]) -#----------------------------------------------------------------------------# -# Controllers. 
-#----------------------------------------------------------------------------# -# input for urls over url -@app.route('/') -def index(): - url = request.args.get("url") - cached = os.listdir(os.path.join(os.path.dirname(__file__), "./cached")) +def main(url, pathToCached, limit): withoutProtocol = url.split("/")[2] - if withoutProtocol + '.json' not in cached: - nodes, edges = graph(url) + + if pathToCached is None: + nodes, edges = graph(url, limit) else: nodes, edges = load(withoutProtocol) - - print(url) - return render_template('graph.html', nodes = json.dumps(nodes), edges = json.dumps(edges)) + pathToTemplate = os.path.join(os.path.dirname( + __file__), "templates", "graph.html") + with open(pathToTemplate, "rt") as fin: + with open(withoutProtocol + ".html", "wt") as fout: + fout.write(fin.read().replace('{{nodes}}', json.dumps( + nodes)).replace('{{edges}}', json.dumps(edges))) if __name__ == '__main__': - port = int(os.environ.get('PORT', 80)) - app.run(host='0.0.0.0', port=port) - - - - + parser = argparse.ArgumentParser( + description='Map any website. 
Only map websites you own, as this tool will open any link on a given website, which can potentially incur high costs for the owner and be interpreted as a small scale DOS attack.') + parser.add_argument('-url', type=str, help='url to map', required=True) + parser.add_argument('--plot-cached', type=str, + help='path to cached file', required=False) + parser.add_argument( + '-limit', type=int, help='maximum number of nodes on original site', required=False, default=5000) + args = parser.parse_args() + url = args.url + pathToCached = args.plot_cached + limit = args.limit + main(url, pathToCached, limit) diff --git a/requirements.txt b/requirements.txt index abed0d5..2c1bc47 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -Flask-WTF requests - +lxml +urllib3 diff --git a/templates/graph.html b/templates/graph.html index 546f74a..69db186 100644 --- a/templates/graph.html +++ b/templates/graph.html @@ -101,9 +101,9 @@ function draw() { var color = 'gray'; - var nodes = {{nodes}} ; + var nodes = {{nodes}} ; - var edges = {{ edges | safe }} ; + var edges = {{edges}} ; // create a network var container = document.getElementById('mynetwork'); var data = { diff --git a/x.py b/x.py deleted file mode 100644 index 7dca8d8..0000000 --- a/x.py +++ /dev/null @@ -1,6 +0,0 @@ -import Star - -crawler = Star.Crawler() -crawler.run("https://www.google.de/", 5000) -print(crawler.getNodesEdges()) -crawler.draw() \ No newline at end of file