added package stuff

This commit is contained in:
Askill 2022-01-01 23:43:11 +01:00
parent 3c33441380
commit 6049622207
9 changed files with 93 additions and 15 deletions

7
.gitignore vendored
View File

@ -4,3 +4,10 @@ __pycache__/
cached/beauty.json cached/beauty.json
cached/www.budgetbytes.com.json cached/www.budgetbytes.com.json
templates/data.js templates/data.js
dist/
*.egg-info
VERSION
**/__meta__.py
cached/visjs.github.io.json
cached/www.dinneratthezoo.com.json
cached/www.patricematz.de.json

View File

@ -4,6 +4,7 @@ Calls every link on a given website and produces an explorable graph visualizati
Please note that the graph layout can take a long time since it is JS based. Loading a graph with 3000 Nodes may take 5 minutes or more. Please note that the graph layout can take a long time since it is JS based. Loading a graph with 3000 Nodes may take 5 minutes or more.
```
Map any website. Only map websites you own, as this tool will open any link on a given Map any website. Only map websites you own, as this tool will open any link on a given
website, which can potentially incur high costs for the owner and be interpreted website, which can potentially incur high costs for the owner and be interpreted
as a small scale DOS attack. as a small scale DOS attack.
@ -13,7 +14,8 @@ Please note that the graph layout can take a long time since it is JS based. Loa
-url url to map -url url to map
--plot-cached path to cached file --plot-cached path to cached file
-limit maximum number of nodes on original site -limit maximum number of nodes on original site
```
## Examples: ## Examples:
### Google.de: ### Google.de:
![google.de](./google.png) ![google.de](./docs/google.png)

8
docs/LICENSE.txt Normal file
View File

@ -0,0 +1,8 @@
Copyright (c) 2021 Patrice Matz
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

Before

Width:  |  Height:  |  Size: 2.3 MiB

After

Width:  |  Height:  |  Size: 2.3 MiB

63
setup.py Normal file
View File

@ -0,0 +1,63 @@
# -*- coding: utf8 -*-
#
# Packaging script for Star-Mapper. This file was created by Python
# Boilerplate; learn more about it at:
# http://github.com/fabiommendes/python-boilerplate/
#
import os
from setuptools import setup, find_packages

# Meta information: the VERSION file at the repository root is the single
# source of truth for the package version.
with open('VERSION') as version_file:
    version = version_file.read().strip()
dirname = os.path.dirname(__file__)

# Save version and author to __meta__.py so the installed package can report
# them at runtime without needing the VERSION file.
path = os.path.join(dirname, 'src', 'Star-Mapper', '__meta__.py')
data = '''# Automatically created. Please do not edit.
__version__ = u'%s'
__author__ = u'Patrice Matz'
''' % version
with open(path, 'wb') as meta_file:
    meta_file.write(data.encode())

# Long description for PyPI is taken from the project README.
with open('./docs/README.md') as readme_file:
    long_description = readme_file.read()

setup(
    # Basic info
    name='Star-Mapper',
    version=version,
    author='Patrice Matz',
    author_email='mail@patricematz.de',
    url='https://github.com/Askill/Star-Mapper',
    description='Calls every link on a given website and produces an explorable graph visualization.',
    long_description=long_description,
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: GNU General Public License (GPL)',
        'Operating System :: POSIX',
        'Programming Language :: Python',
        'Topic :: Software Development :: Libraries',
    ],
    keywords=['graph', "web crawler", 'network', 'visualization'],

    # Packages and dependencies: sources live under src/ (src-layout)
    package_dir={'': 'src'},
    packages=find_packages('src'),
    install_requires=[
        "requests",
        "lxml",
        "urllib3"
    ],

    # Data files
    package_data={
        "docs": ["*"]
    },

    # Scripts
    entry_points={
    },

    # Other configurations
    zip_safe=False,
    platforms='any',
)

View File

@ -61,7 +61,7 @@ class Crawler:
if not clean: if not clean:
continue continue
self.logger.warning(f"{len(self.links)} {root}") self.logger.info(f"{len(self.links)} {root}")
try: try:
site = requests.get(root) site = requests.get(root)
tree = html.fromstring(site.content) tree = html.fromstring(site.content)

View File

@ -0,0 +1 @@
from main import mapSite

View File

@ -4,7 +4,7 @@ import json
import argparse import argparse
def transformForDrawing(n, e): def transformForPlotting(n, e):
nodes = [] nodes = []
drawn = [] drawn = []
edges = [] edges = []
@ -33,31 +33,28 @@ def graph(url, limit):
obj = Crawler() obj = Crawler()
obj.run(url, limit) obj.run(url, limit)
current = os.path.dirname(__file__) current = os.getcwd()
n, e = obj.getNodesEdges() n, e = obj.getNodesEdges()
with open(os.path.join(current, './cached/' + url.rsplit('/')[2] + '.json'), 'w', encoding='utf-8') as f: with open(os.path.join(current, './cached/' + url.rsplit('/')[2] + '.json'), 'w', encoding='utf-8') as f:
f.write(json.dumps({"nodes": n, "edges": e})) f.write(json.dumps({"nodes": n, "edges": e}))
nodes, edges = transformForDrawing(n, e) return transformForPlotting(n, e)
return nodes, edges
def load(url): def load(pathToCached):
print("Loaded from cache: " + url) with open(pathToCached, 'r', encoding='utf-8') as f:
current = os.path.dirname(__file__)
with open(os.path.join(current, './cached/{}.json'.format(url)), 'r', encoding='utf-8') as f:
content = f.read() content = f.read()
jsonContent = json.loads(content) jsonContent = json.loads(content)
return transformForDrawing(jsonContent["nodes"], jsonContent["edges"]) return transformForPlotting(jsonContent["nodes"], jsonContent["edges"])
def main(url, pathToCached): def mapSite(url, pathToCached, limit):
withoutProtocol = url.split("/")[2] withoutProtocol = url.split("/")[2]
if pathToCached is not None: if pathToCached is None:
nodes, edges = graph(url, limit) nodes, edges = graph(url, limit)
else: else:
nodes, edges = load(withoutProtocol) nodes, edges = load(pathToCached)
pathToTemplate = os.path.join(os.path.dirname( pathToTemplate = os.path.join(os.path.dirname(
__file__), "templates", "graph.html") __file__), "templates", "graph.html")
@ -81,4 +78,4 @@ if __name__ == '__main__':
pathToCached = args.plot_cached pathToCached = args.plot_cached
limit = args.limit limit = args.limit
main(url, pathToCached, limit) mapSite(url, pathToCached, limit)