"""Map a website: crawl its pages with the Crawler from Star, cache the raw
nodes/edges as JSON under ./cached/, and render an interactive HTML graph by
filling the {{nodes}}/{{edges}} placeholders in templates/graph.html."""

import os
import json
import argparse

from Star import Crawler


def transformForDrawing(n, e):
    """Convert raw crawl nodes/edges into dicts for the HTML graph template."""
    nodes = []
    drawn = []
    edges = []
    for nn in n:
        # Skip Wayback Machine URLs so archived snapshots don't clutter the graph.
        if "web.archive.org" in nn:
            continue
        # Label each node with its last path segment, falling back to the
        # segment before a trailing slash.
        label = nn.rsplit('/')[-1]
        if label == "":
            label = nn.rsplit('/')[-2]
        nodes.append({"id": nn, "label": label, "group": 0})
        drawn.append(nn)
    for e0, e1 in e:
        if "web.archive.org" in e1:
            continue
        # Link targets that were never crawled themselves (e.g. external
        # sites) become group-1 nodes so they render distinctly.
        if e1 not in drawn and e1 not in n:
            nodes.append({"id": e1, "label": e1, "group": 1})
            drawn.append(e1)

        edges.append({"from": e0, "to": e1})

    return nodes, edges
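
# Example with hypothetical inputs:
#   transformForDrawing(["https://example.com/"],
#                       [("https://example.com/", "https://external.org/page")])
# returns nodes = [{"id": "https://example.com/", "label": "example.com", "group": 0},
#                  {"id": "https://external.org/page", "label": "https://external.org/page", "group": 1}]
# and edges = [{"from": "https://example.com/", "to": "https://external.org/page"}].
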
def graph(url, limit):
    """Crawl url (visiting at most `limit` pages) and cache the raw result."""
    obj = Crawler()
    obj.run(url, limit)
    current = os.path.dirname(__file__)
    n, e = obj.getNodesEdges()
    # Cache the raw crawl as ./cached/<hostname>.json (the directory must exist).
    with open(os.path.join(current, './cached/' + url.rsplit('/')[2] + '.json'),
              'w', encoding='utf-8') as f:
        f.write(json.dumps({"nodes": n, "edges": e}))

    nodes, edges = transformForDrawing(n, e)
    return nodes, edges


def load(url):
    """Load a previously cached crawl for `url` (a bare hostname) and
    return it in drawable form."""
    print("Loaded from cache: " + url)
    current = os.path.dirname(__file__)
    with open(os.path.join(current, './cached/{}.json'.format(url)),
              'r', encoding='utf-8') as f:
        content = f.read()
    jsonContent = json.loads(content)
    return transformForDrawing(jsonContent["nodes"], jsonContent["edges"])
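
# Example (hypothetical hostname): load("example.com") reads the file
# ./cached/example.com.json written earlier by graph().
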
def main(url, pathToCached, limit):
    withoutProtocol = url.split("/")[2]
    # Plot from the cached file when --plot-cached is given; otherwise crawl.
    if pathToCached is None:
        nodes, edges = graph(url, limit)
    else:
        nodes, edges = load(withoutProtocol)

    pathToTemplate = os.path.join(
        os.path.dirname(__file__), "templates", "graph.html")
    # Substitute the node/edge JSON into the template and write <hostname>.html.
    with open(pathToTemplate, "rt") as fin:
        with open(withoutProtocol + ".html", "wt") as fout:
            fout.write(fin.read().replace('{{nodes}}', json.dumps(nodes))
                       .replace('{{edges}}', json.dumps(edges)))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Map any website. Only map websites you own, as this tool '
                    'will open any link on a given website, which can '
                    'potentially incur high costs for the owner and be '
                    'interpreted as a small-scale DoS attack.')
    parser.add_argument('-url', type=str, help='url to map', required=True)
    parser.add_argument('--plot-cached', type=str,
                        help='path to cached file', required=False)
    parser.add_argument('-limit', type=int, required=False, default=5000,
                        help='maximum number of nodes on the original site')
    args = parser.parse_args()
    url = args.url
    pathToCached = args.plot_cached
    limit = args.limit
    main(url, pathToCached, limit)
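
# Example invocations (the filename mapper.py is an assumption; the ./cached/
# directory must already exist before crawling):
#   python mapper.py -url https://example.com -limit 100
#   python mapper.py -url https://example.com --plot-cached cached/example.com.json
# Note that load() derives the cache path from the URL's hostname, so the value
# passed to --plot-cached only switches plotting to the cached data; the output
# graph is written to example.com.html in the current directory.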