"""Map a website: crawl its pages with the Crawler from Star, cache the raw
nodes/edges as JSON under ./cached/, and render an interactive HTML graph by
filling the {{nodes}}/{{edges}} placeholders in templates/graph.html."""

import os
import json
import argparse

from Star import Crawler


def transformForDrawing(n, e):
    """Convert raw crawl nodes/edges into dicts for the HTML graph template."""
    nodes = []
    drawn = []
    edges = []
    for nn in n:
        # Skip Wayback Machine URLs so archived snapshots don't clutter the graph.
        if "web.archive.org" in nn:
            continue
        # Label each node with its last path segment, falling back to the
        # segment before a trailing slash.
        label = nn.rsplit('/')[-1]
        if label == "":
            label = nn.rsplit('/')[-2]
        nodes.append({"id": nn, "label": label, "group": 0})
        drawn.append(nn)
    for e0, e1 in e:
        if "web.archive.org" in e1:
            continue
        # Link targets that were never crawled themselves (e.g. external
        # sites) become group-1 nodes so they render distinctly.
        if e1 not in drawn and e1 not in n:
            nodes.append({"id": e1, "label": e1, "group": 1})
            drawn.append(e1)

        edges.append({"from": e0, "to": e1})

    return nodes, edges
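
# Example with hypothetical inputs:
#   transformForDrawing(["https://example.com/"],
#                       [("https://example.com/", "https://external.org/page")])
# returns nodes = [{"id": "https://example.com/", "label": "example.com", "group": 0},
#                  {"id": "https://external.org/page", "label": "https://external.org/page", "group": 1}]
# and edges = [{"from": "https://example.com/", "to": "https://external.org/page"}].
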
def graph(url, limit):
    """Crawl url (visiting at most `limit` pages) and cache the raw result."""
    obj = Crawler()
    obj.run(url, limit)
    current = os.path.dirname(__file__)
    n, e = obj.getNodesEdges()
    # Cache the raw crawl as ./cached/<hostname>.json (the directory must exist).
    with open(os.path.join(current, './cached/' + url.rsplit('/')[2] + '.json'),
              'w', encoding='utf-8') as f:
        f.write(json.dumps({"nodes": n, "edges": e}))

    nodes, edges = transformForDrawing(n, e)
    return nodes, edges


def load(url):
    """Load a previously cached crawl for `url` (a bare hostname) and
    return it in drawable form."""
    print("Loaded from cache: " + url)
    current = os.path.dirname(__file__)
    with open(os.path.join(current, './cached/{}.json'.format(url)),
              'r', encoding='utf-8') as f:
        content = f.read()
    jsonContent = json.loads(content)
    return transformForDrawing(jsonContent["nodes"], jsonContent["edges"])
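
# Example (hypothetical hostname): load("example.com") reads the file
# ./cached/example.com.json written earlier by graph().
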
def main(url, pathToCached, limit):
    withoutProtocol = url.split("/")[2]
    # Plot from the cached file when --plot-cached is given; otherwise crawl.
    if pathToCached is None:
        nodes, edges = graph(url, limit)
    else:
        nodes, edges = load(withoutProtocol)

    pathToTemplate = os.path.join(
        os.path.dirname(__file__), "templates", "graph.html")
    # Substitute the node/edge JSON into the template and write <hostname>.html.
    with open(pathToTemplate, "rt") as fin:
        with open(withoutProtocol + ".html", "wt") as fout:
            fout.write(fin.read().replace('{{nodes}}', json.dumps(nodes))
                       .replace('{{edges}}', json.dumps(edges)))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Map any website. Only map websites you own, as this tool '
                    'will open any link on a given website, which can '
                    'potentially incur high costs for the owner and be '
                    'interpreted as a small-scale DoS attack.')
    parser.add_argument('-url', type=str, help='url to map', required=True)
    parser.add_argument('--plot-cached', type=str,
                        help='path to cached file', required=False)
    parser.add_argument('-limit', type=int, required=False, default=5000,
                        help='maximum number of nodes on the original site')
    args = parser.parse_args()
    url = args.url
    pathToCached = args.plot_cached
    limit = args.limit
    main(url, pathToCached, limit)
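
# Example invocations (the filename mapper.py is an assumption; the ./cached/
# directory must already exist before crawling):
#   python mapper.py -url https://example.com -limit 100
#   python mapper.py -url https://example.com --plot-cached cached/example.com.json
# Note that load() derives the cache path from the URL's hostname, so the value
# passed to --plot-cached only switches plotting to the cached data; the output
# graph is written to example.com.html in the current directory.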