55 lines
1.5 KiB
Python
55 lines
1.5 KiB
Python
|
|
from urllib.parse import urljoin
|
||
|
|
import requests
|
||
|
|
import re
|
||
|
|
from requests_html import HTMLSession
|
||
|
|
|
||
|
|
|
||
|
|
class url:
    """Recursive link crawler for a single website.

    Starting at ``url``, :meth:`run_check` fetches a page, resolves every link
    found on it, stores the resolved targets in ``sites`` and then visits each
    target in turn.

    Attributes:
        url: Address of the website to be checked (the crawl start page).
        sites: Maps each visited page address to the list of resolved link
            targets found on that page.
        does_work: Intended for previously positive-tested urls; no method of
            this class writes to it.
        does_not_work: Maps a link that could not be fetched to the page that
            linked to it.
        header_values: HTTP header values; no method of this class reads them.
    """

    url = ""  # the url of the website to be checked

    # Class-level defaults are kept so the attribute names still exist on the
    # class itself. __init__ gives every instance its own containers, so these
    # shared objects are not mutated by the crawl.
    sites = dict()  # dict with all sites and the urls on those sites
    does_work = []  # list with all previously positive-tested urls
    does_not_work = dict()  # dict with all non-working urls and the site that linked there

    # NOTE(review): the 'Connection:' key carries a trailing colon, which looks
    # like a typo for 'Connection' -- confirm before passing this dict to a
    # request. It is reproduced unchanged here.
    header_values = {
        'Connection:': 'Keep-alive',
        'name': 'Michael Foord',
        'location': 'Northampton',
        'language': 'English',
        'User-Agent': 'Mozilla 4/0',
    }

    def __init__(self, url):
        """Create a crawler for the website at *url*.

        Args:
            url: Address of the start page.
        """
        self.url = url
        # Per-instance state: mutating the class-level containers through
        # ``self`` would leak one crawler's results into every other instance.
        self.sites = {}
        self.does_work = []
        self.does_not_work = {}

    def run_check(self, root=None):  # root is the url of the current site
        """Crawl the page at *root* and, recursively, every page it links to.

        The page address is first resolved through ``requests.get`` so that it
        is keyed by the final address after redirects. A page is skipped when
        its address contains ``"Spezial"``, when it was already visited, or
        when it does not contain the text after the last ``/`` of ``self.url``.
        Each visited page address is printed once.

        Links that cannot be fetched are recorded in ``does_not_work`` and
        skipped; they no longer abort processing of the whole page.

        Args:
            root: Address of the page to process. Defaults to ``self.url``.

        Returns:
            None. Results are stored in ``self.sites`` and
            ``self.does_not_work``.
        """
        if root is None:
            root = self.url

        try:
            root = requests.get(root).url
        except requests.RequestException:
            # An unreachable page must not take down the whole crawl.
            return

        # presumably filters MediaWiki "Spezial:" pages of a German wiki -- TODO confirm
        if "Spezial" in root:
            return

        # [-1] instead of [1]: a start url without any '/' no longer raises
        # IndexError; with a '/' present both indexes select the same part.
        site_marker = self.url.rsplit('/', 1)[-1]
        if root in self.sites or site_marker not in root:
            return

        try:
            # The context manager closes the session (and its connections)
            # once the links have been extracted.
            with HTMLSession() as session:
                links = session.get(root).html.absolute_links
        except requests.RequestException:
            return

        nlinks = []
        for link in links:
            try:
                resolved = requests.get(link).url
            except requests.RequestException:
                # Remember the failing link together with the page that
                # referenced it, then carry on with the remaining links.
                self.does_not_work[link] = root
                continue
            nlinks.append(resolved.replace("/./", "/").replace("/../", "/"))
        self.sites[root] = nlinks

        print(root)

        # One recursive call per link; pages already in ``sites`` return early.
        for each_link in nlinks:
            self.run_check(each_link)
|
||
|
|
|