from urllib.parse import urlparse
import requests
from requests_html import HTMLSession


class UrlChecker:
    """Recursively crawls a website and records which internal links resolve."""

    # Only real HTTP headers are kept here; the original name/location/language
    # entries were form-field values, not request headers, and have been dropped.
    header_values = {
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/4.0',
    }

    def __init__(self, url):
        self.url = url                 # the URL of the website to be checked
        self.sites = dict()            # maps each crawled page to the links found on it
        self.does_work = []            # all previously positively tested URLs
        self.does_not_work = dict()    # broken URLs mapped to the page that linked to them

    def run_check(self, root=None):
        # root is the URL of the page currently being checked
        if root is None:
            root = self.url

        # Follow redirects so the canonical URL becomes the dictionary key.
        try:
            root = requests.get(root, headers=self.header_values, timeout=10).url
        except requests.RequestException:
            return

        # Skip MediaWiki "Spezial" pages, pages already crawled, and anything
        # that is not on the same host as the start URL.
        if "Spezial" in root:
            return
        if root in self.sites or urlparse(self.url).netloc not in root:
            return

        session = HTMLSession()
        try:
            response = session.get(root, headers=self.header_values, timeout=10)
        except requests.RequestException:
            return

        links = response.html.absolute_links
        nlinks = []
        for link in links:
            try:
                # Resolve each link and normalise relative path segments.
                resolved = requests.get(link, headers=self.header_values, timeout=10).url
                nlinks.append(resolved.replace("/./", "/").replace("/../", "/"))
                self.does_work.append(resolved)
            except requests.RequestException:
                # Record the broken link and the page that referenced it, then
                # continue with the remaining links instead of aborting the crawl.
                self.does_not_work[link] = root
                continue

        self.sites[root] = nlinks
        print(root)
        for each_link in nlinks:
            self.run_check(each_link)
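

# Minimal usage sketch. The start URL below is a placeholder (not taken from the
# original script): run_check() crawls every reachable page on the same host,
# prints each page URL as it is visited, and collects broken links in does_not_work.
if __name__ == "__main__":
    checker = UrlChecker("https://example.org/wiki/Start")
    checker.run_check()
    print("working links found:", len(checker.does_work))
    print("broken links:", checker.does_not_work)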