This commit is contained in:
Patrice 2019-04-22 17:04:52 +02:00
parent bc2949296c
commit 6f34547432
12 changed files with 384 additions and 0 deletions

86
.vscode/launch.json vendored Normal file
View File

@ -0,0 +1,86 @@
{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            // Debug whichever file is open in the editor, in the integrated terminal.
            "name": "Python: Current File (Integrated Terminal)",
            "type": "python",
            "request": "launch",
            "program": "${file}",
            "console": "integratedTerminal"
        },
        {
            // Run app.py through the flask CLI with Flask's own debugger and
            // reloader switched off. This entry used to be listed twice with
            // identical content; only one copy is kept.
            "name": "Python: Flask",
            "type": "python",
            "request": "launch",
            "module": "flask",
            "env": {
                "FLASK_APP": "app.py"
            },
            "args": [
                "run",
                "--no-debugger",
                "--no-reload"
            ],
            "jinja": true
        },
        {
            // Attach to a debug server already listening on localhost:5678.
            "name": "Python: Remote Attach",
            "type": "python",
            "request": "attach",
            "port": 5678,
            "host": "localhost",
            "pathMappings": [
                {
                    "localRoot": "${workspaceFolder}",
                    "remoteRoot": "."
                }
            ]
        },
        {
            // Template entry: replace the placeholder module name before use.
            "name": "Python: Module",
            "type": "python",
            "request": "launch",
            "module": "enter-your-module-name-here",
            "console": "integratedTerminal"
        },
        {
            "name": "Python: Django",
            "type": "python",
            "request": "launch",
            "program": "${workspaceFolder}/manage.py",
            "console": "integratedTerminal",
            "args": [
                "runserver",
                "--noreload",
                "--nothreading"
            ],
            "django": true
        },
        {
            // Debug whichever file is open in the editor, in an external terminal.
            "name": "Python: Current File (External Terminal)",
            "type": "python",
            "request": "launch",
            "program": "${file}",
            "console": "externalTerminal"
        }
    ]
}

5
Dockerfile Normal file
View File

@ -0,0 +1,5 @@
# Build on the tiangolo/uwsgi-nginx-flask image, Python 3.7 tag.
FROM tiangolo/uwsgi-nginx-flask:python3.7
# Dependencies are installed before the application sources are copied in.
COPY requirements.txt /requirements.txt
RUN pip install -r /requirements.txt
# NOTE(review): only main.py and urlchecker.py are copied into /app. The Flask
# entry point added alongside this file is app.py, and it imports sitemapper,
# which is not copied here -- TODO confirm main.py exists and whether app.py
# and sitemapper.py need to be added to the image.
COPY ./main.py /app
COPY ./urlchecker.py /app

Binary file not shown.

Binary file not shown.

Binary file not shown.

40
app.py Normal file
View File

@ -0,0 +1,40 @@
from flask import Flask, request
import os
import urlchecker
import sitemapper
import _pickle as cPickle
import json
#----------------------------------------------------------------------------#
# App Config.
#----------------------------------------------------------------------------#
# Flask application object; the route handlers below register themselves on it.
app = Flask(__name__)
#----------------------------------------------------------------------------#
# Controllers.
#----------------------------------------------------------------------------#
@app.route('/test/')
def index():
    """Crawl the site named by the ``url`` query parameter and return its link map.

    The crawl result (``{page_url: [links found on that page]}``) is returned
    as JSON. As a side effect the crawled page URLs are written, one per line,
    to ``your_file.txt`` in the current working directory.

    Returns:
        A JSON response with the link map, or a 400 JSON error when the
        ``url`` query parameter is missing or empty.
    """
    # Imported locally so this handler does not depend on the module's
    # top-level import line being extended.
    from flask import jsonify

    url = request.args.get("url")
    if not url:
        # Without this guard ``None`` is handed to the crawler and the request
        # dies with an unhandled exception (HTTP 500).
        return jsonify(error="missing required query parameter 'url'"), 400

    # SECURITY NOTE: the server fetches whatever URL the caller supplies
    # (server-side request forgery risk). Restrict the allowed targets before
    # exposing this endpoint to untrusted clients.
    print(url)
    obj = sitemapper.url(url)
    obj.run_check()
    print(obj.sites)
    with open('your_file.txt', 'w') as f:
        # Iterating the dict yields its keys, i.e. the crawled page URLs.
        f.writelines("%s\n" % item for item in obj.sites)
    # Flask only converts a bare dict return value to JSON from version 1.1
    # on; jsonify works on every version.
    return jsonify(obj.sites)
if __name__ == '__main__':
    # Development entry point: listen on every interface, on the port given by
    # the PORT environment variable (80 when it is not set).
    app.run(host='0.0.0.0', port=int(os.environ.get('PORT', 80)))

0
data.json Normal file
View File

0
pickledump.txt Normal file
View File

3
requirements.txt Normal file
View File

@ -0,0 +1,3 @@
# Imported directly by app.py; previously only pulled in through Flask-WTF.
Flask
Flask-WTF
requests
# Imported by sitemapper.py (from requests_html import HTMLSession).
requests-html

55
sitemapper.py Normal file
View File

@ -0,0 +1,55 @@
from urllib.parse import urljoin
import requests
import re
from requests_html import HTMLSession
class url:
    """Crawl a website and record, for every page reached, the links found on it.

    Starting from the URL given to the constructor, :meth:`run_check` follows
    every link whose host matches the start URL's host and fills ``sites``
    with ``{page_url: [redirect-resolved link, ...]}``.
    """

    url = ""  # the url of the website to be checked
    # The three containers below are class-level defaults only. __init__ gives
    # every instance its own copies, so separate crawls never share results.
    sites = dict()  # dict of all crawled pages and the urls found on them
    does_work = []  # list of all previously positively tested urls
    does_not_work = dict()  # dict of non-working urls -> the page that linked there
    header_values = {
        'Connection:' : 'Keep-alive',
        'name' : 'Michael Foord',
        'location' : 'Northampton',
        'language' : 'English',
        'User-Agent': 'Mozilla 4/0'}
    request_timeout = 10  # seconds to wait for any single HTTP request

    def __init__(self, url):
        """Remember the start *url* and set up empty per-instance result containers."""
        self.url = url
        self.sites = {}
        self.does_work = []
        self.does_not_work = {}

    def _in_scope(self, candidate):
        """Return True when *candidate* is served by the start URL's host."""
        from urllib.parse import urlsplit

        host = urlsplit(self.url).netloc.lower()
        # An empty host would match everything, so treat it as "nothing is in
        # scope". Substring matching keeps "example.com" -> "www.example.com"
        # redirects inside the crawl.
        return bool(host) and host in urlsplit(candidate).netloc.lower()

    def run_check(self, root=None):  # root is the url of the current Site
        """Crawl from *root* (default: the start URL) and fill ``self.sites``.

        Pages are visited depth-first using an explicit stack, so very large
        sites cannot exhaust the interpreter's recursion limit. Pages whose
        URL contains ``"Spezial"``, pages outside the start host and pages
        already present in ``self.sites`` are skipped. Links that cannot be
        resolved are recorded in ``self.does_not_work`` and skipped; they no
        longer cause the page that contains them to be discarded.
        """
        pending = [self.url if root is None else root]
        visited = set()  # addresses already requested during this call
        # One session for the whole crawl; closed when the crawl finishes.
        with HTMLSession() as session:
            while pending:
                current = pending.pop()
                if current in visited or current in self.sites:
                    continue
                visited.add(current)
                try:
                    response = session.get(current, timeout=self.request_timeout)
                except requests.RequestException:
                    # Keep an existing entry: it names the page that linked here.
                    self.does_not_work.setdefault(current, current)
                    continue
                page = response.url  # final address after redirects
                if "Spezial" in page:
                    continue
                if page in self.sites or not self._in_scope(page):
                    continue
                nlinks = []
                for link in response.html.absolute_links:
                    try:
                        target = session.get(link, timeout=self.request_timeout).url
                    except requests.RequestException:
                        # e.g. mailto:/javascript: links or dead hosts
                        self.does_not_work[link] = page
                        continue
                    nlinks.append(target.replace("/./", "/").replace("/../", "/"))
                self.sites[page] = nlinks
                print(page)
                # Reversed so the first link is popped (and crawled) first.
                # Links are already redirect-resolved, so they can be filtered
                # here instead of being fetched just to be rejected.
                pending.extend(
                    link for link in reversed(nlinks)
                    if link not in self.sites
                    and "Spezial" not in link
                    and self._in_scope(link)
                )

109
urlchecker.py Normal file
View File

@ -0,0 +1,109 @@
import urllib.request,urllib.parse,urllib.error
from urllib.error import HTTPError
from urllib.error import URLError
from urllib.parse import urljoin
import requests
import re
class url:
    """Recursive link checker for a single website.

    Starting at the (redirect-resolved) start URL, :meth:`run_check`
    downloads every page located under the start URL's directory, extracts
    the ``href="..."`` targets, probes each target once and records:

    * ``sites``         -- ``{page_url: [absolute link, ...]}`` for crawled pages
    * ``does_work``     -- links that could be opened
    * ``does_not_work`` -- ``{broken link: page that linked to it}``
    """

    url = ""  # the url of the website to be checked
    # The three containers below are class-level defaults only. __init__ gives
    # every instance its own copies, so separate checks never share results.
    sites = dict()  # dict of all crawled pages and the urls found on them
    does_work = []  # list of all previously positively tested urls
    does_not_work = dict()  # dict of non-working urls -> the page that linked there
    # Sent as HTTP request headers. The first key used to be 'Connection:',
    # and the trailing colon is not legal in a header name.
    header_values = {
        'Connection': 'Keep-alive',
        'name': 'Michael Foord',
        'location': 'Northampton',
        'language': 'English',
        'User-Agent': 'Mozilla 4/0'}
    request_timeout = 10  # seconds to wait for any single HTTP request

    def __init__(self, url):
        """Resolve *url* (following redirects) and store it as the start URL.

        Raises whatever ``urllib.request.urlopen`` raises when the start URL
        cannot be opened.
        """
        self.sites = {}
        self.does_work = []
        self.does_not_work = {}
        with urllib.request.urlopen(url, timeout=self.request_timeout) as response:
            self.url = response.geturl()

    def _build_request(self, target):
        """Return a GET request for *target* carrying ``header_values`` as headers.

        Raises ``ValueError`` when *target* has no usable URL scheme.
        """
        # Passing the url-encoded values as the second positional argument,
        # as was done before, sets the request *body* and silently turns
        # every probe into a POST.
        return urllib.request.Request(target, headers=self.header_values)

    def _in_scope(self, candidate):
        """Return True when *candidate* lies under the start URL's directory."""
        parts = urllib.parse.urlsplit(self.url)
        # Only the path is trimmed to its directory. Trimming the whole URL
        # reduced a path-less start URL ("https://host") to "https:/", which
        # matched every https address.
        prefix = "%s://%s%s" % (parts.scheme, parts.netloc,
                                parts.path.rsplit('/', 1)[0])
        return candidate == prefix or candidate.startswith(prefix + "/")

    def make_url(self, link, start):
        """Return *link* as an absolute URL, resolved against the page *start*."""
        return urljoin(start, link)

    def test_url(self, link, root):
        """Return True when *link* can be opened; results are cached.

        A link is probed at most once. Failures are stored in
        ``does_not_work`` together with *root*, the page that linked to it.
        """
        if link in self.sites or link in self.does_work:
            return True
        if link in self.does_not_work:
            return False
        try:
            # Opening is the whole test; the response is closed immediately.
            with urllib.request.urlopen(self._build_request(link),
                                        timeout=self.request_timeout):
                pass
        except (urllib.error.HTTPError, urllib.error.URLError, ValueError):
            self.does_not_work[link] = root
            return False
        self.does_work.append(link)
        return True

    def get_actual_urls(self, links, root):
        """Make every entry of *links* absolute relative to *root* and probe it.

        Returns the absolute links in their original order, broken ones
        included.
        """
        # urljoin leaves absolute links alone and gives protocol-relative
        # ("//host/path") links the scheme of the page they were found on.
        temp_links = [self.make_url(each_link, root) for each_link in links]
        for each_temp_link in temp_links:
            self.test_url(each_temp_link, root)
        return temp_links

    def run_check(self, root=None):  # root is the url of the current Site
        """Check the site starting at *root* (default: the start URL).

        Pages are visited depth-first using an explicit stack, so large sites
        cannot exhaust the interpreter's recursion limit. Pages outside the
        start URL's directory, pages already in ``self.sites`` and pages that
        cannot be opened are skipped.
        """
        pending = [self.url if root is None else root]
        # Addresses already handled in this call. ``sites`` is keyed by the
        # post-redirect address, so a redirecting link would otherwise be
        # crawled again every time it is encountered.
        visited = set()
        while pending:
            current = pending.pop()
            if (current in visited or current in self.sites
                    or not self._in_scope(current)
                    or not self.test_url(current, current)):
                continue
            visited.add(current)
            try:
                with urllib.request.urlopen(self._build_request(current),
                                            timeout=self.request_timeout) as http_response:
                    page = http_response.geturl()
                    charset = http_response.headers.get_content_charset() or "utf-8"
                    raw = http_response.read()
            except (urllib.error.URLError, OSError, ValueError):
                # The page answered the probe but failed now; note it and move on.
                self.does_not_work[current] = current
                continue
            if page in self.sites:
                continue  # redirect landed on a page that is already crawled
            try:
                response_data = raw.decode(charset, errors="replace")
            except LookupError:  # server announced an unknown charset
                response_data = raw.decode("utf-8", errors="replace")
            links = re.findall(r'href="(.*?)"', response_data)
            links = self.get_actual_urls(links, page)
            print(page, links)
            self.sites[page] = links
            # Reversed so the first link is popped (and crawled) first.
            pending.extend(reversed(links))
#

86
your_file.txt Normal file
View File

@ -0,0 +1,86 @@
http://www.andreasgehrke.com/prof/moti/moti.html
http://www.andreasgehrke.com/idee/idee_18/idee_18_01.html
https://www.andreasgehrke.com/proj_21/proj_21_01.html
https://www.andreasgehrke.com/real_proj/real_proj_12/real_proj_12_01.html
https://www.andreasgehrke.com/idee/idee_01.html#i01
http://www.andreasgehrke.com/real_proj/real_proj_10/real_proj_10_01.html
https://www.andreasgehrke.com/real_proj/real_proj_09/real_proj_09_01.html
https://www.andreasgehrke.com/proj/proj.html
http://www.andreasgehrke.com/../proj/proj.html
http://www.andreasgehrke.com/leis/leis.html
https://www.andreasgehrke.com/idee/idee_03/idee_03_01.html
http://www.andreasgehrke.com/idee/idee_16/idee_16_01.html
http://www.andreasgehrke.com/real_proj/real_proj_07/real_proj_07_01.html
http://www.andreasgehrke.com/idee/idee_02/idee_02_01.html
https://www.andreasgehrke.com/real_proj/real_proj_11.html#r11
http://www.andreasgehrke.com/real_proj/real_proj_09/real_proj_09_01.html
https://www.andreasgehrke.com/idee/idee_20/idee_20.html
http://www.andreasgehrke.com/proj_21/proj_21_01.html
https://www.andreasgehrke.com/idee/idee_08.html#i08
https://www.andreasgehrke.com/idee/idee_11/idee_11_01.html
http://www.andreasgehrke.com/real_proj/real_proj_06/real_proj_06_01.html
http://www.andreasgehrke.com/proj.html
http://www.andreasgehrke.com/moti/moti.html
https://www.andreasgehrke.com/idee/idee_18/idee_18_01.html
http://www.andreasgehrke.com/ver/ver.html
https://www.andreasgehrke.com/idee/idee_01/idee_02_01.html
https://www.andreasgehrke.com/real_proj/real_proj_13.html
https://www.andreasgehrke.com/real_proj/real_proj_06/real_proj_06_01.html
http://www.andreasgehrke.com/idee/idee_17/idee_17_01.html
http://www.andreasgehrke.com/proj/proj.html
https://www.andreasgehrke.com/../proj/proj.html
http://www.andreasgehrke.com/real_proj/real_proj_01/real_proj_01_01.html
https://www.andreasgehrke.com/idee/idee_16/idee_16_01.html
http://www.andreasgehrke.com/idee/idee_01/idee_02_01.html
https://www.andreasgehrke.com/real_proj/real_proj_04/real_proj_04_01.html
https://www.andreasgehrke.com/leis/leis.html
https://www.andreasgehrke.com/ver/ver.html
https://www.andreasgehrke.com/idee/idee_04/idee_04_01.html
http://www.andreasgehrke.com/idee/idee_20/idee_20.html
http://www.andreasgehrke.com/real_proj/real_proj_13.html
https://www.andreasgehrke.com/real_proj/real_proj_08/real_proj_08_01.html
https://www.andreasgehrke.com/
https://www.andreasgehrke.com/idee/idee_06.html#i06
http://www.andreasgehrke.com/../index.html
http://www.andreasgehrke.com/index.html
http://www.andreasgehrke.com/proj_23/proj_23_01.html
http://www.andreasgehrke.com/idee/idee_04/idee_04_01.html
https://www.andreasgehrke.com/idee/idee_19/idee_19_01.html
https://www.andreasgehrke.com/real_proj/real_proj_01/real_proj_01_01.html
https://www.andreasgehrke.com/real_proj/real_proj_10/real_proj_10_01.html
https://www.andreasgehrke.com/pers/pers.html
http://www.andreasgehrke.com/idee/idee_07.html#i07
https://www.andreasgehrke.com/proj_23/proj_23_01.html
https://www.andreasgehrke.com/idee/idee_17/idee_17_01.html
http://www.andreasgehrke.com/real_proj/real_proj_04/real_proj_04_01.html
http://www.andreasgehrke.com/real_proj/real_proj_08/real_proj_08_01.html
https://www.andreasgehrke.com/proj_22/proj_22_01.html
https://www.andreasgehrke.com/moti/moti.html
http://www.andreasgehrke.com/idee/idee_01.html#i01
https://www.andreasgehrke.com/idee/idee_07.html#i07
http://www.andreasgehrke.com/idee/idee_03/idee_03_01.html
http://www.andreasgehrke.com/idee/idee_08.html#i08
http://www.andreasgehrke.com/idee/idee_19/idee_19_01.html
http://www.andreasgehrke.com/ref/ref.html
https://www.andreasgehrke.com/prof/moti/moti.html
http://www.andreasgehrke.com/idee/idee_06.html#i06
http://www.andreasgehrke.com/idee/idee_11/idee_11_01.html
https://www.andreasgehrke.com/idee/idee_05.html#i05
https://www.andreasgehrke.com/../kont/buer/buer.html
https://www.andreasgehrke.com/idee/idee_09/idee_09_01.html
https://www.andreasgehrke.com/index.html
http://www.andreasgehrke.com/idee/idee_09/idee_09_01.html
https://www.andreasgehrke.com/ref/ref.html
http://www.andreasgehrke.com/real_proj/real_proj_11.html#r11
https://www.andreasgehrke.com/idee/idee_02/idee_02_01.html
https://www.andreasgehrke.com/real_proj/real_proj_05/real_proj_05_01.html
https://www.andreasgehrke.com/real_proj/real_proj_07/real_proj_07_01.html
http://www.andreasgehrke.com/real_proj/real_proj_12/real_proj_12_01.html
http://www.andreasgehrke.com/proj_22/proj_22_01.html
https://www.andreasgehrke.com/proj.html
http://www.andreasgehrke.com/real_proj/real_proj_05/real_proj_05_01.html
http://www.andreasgehrke.com/pers/pers.html
http://www.andreasgehrke.com/
https://www.andreasgehrke.com/../index.html
http://www.andreasgehrke.com/../kont/buer/buer.html
http://www.andreasgehrke.com/idee/idee_05.html#i05