Star-Mapper/urlchecker.py

import urllib.request,urllib.parse,urllib.error
from urllib.error import HTTPError
from urllib.error import URLError
from urllib.parse import urljoin
import requests
import re


class url:

    url = ""                # the url of the website to be checked
    sites = dict()          # dic. with all sites and urls on those sites
    does_work = []          # array with all prev. positiv tested urls
    does_not_work = dict()  # dic. with all not working urls and the site that linked there
    header_values = {
                'Connection:' : 'Keep-alive',
                'name' : 'Michael Foord',
                'location' : 'Northampton',
                'language' : 'English',
                'User-Agent': 'Mozilla 4/0'}

    def __init__(self, url):
        self.url = urllib.request.urlopen(url).geturl()


    def make_url(self, link, start):
        ret_link = urljoin(start, link)

        return ret_link

    def test_url(self, link, root):

        if link in self.sites or link in self.does_work:
            return True
        elif link in self.does_not_work:
            return False
        else:
            try:

                header = urllib.parse.urlencode(self.header_values)
                header=header.encode('ascii')
                request = urllib.request.Request(link, header)
                response = urllib.request.urlopen(request)
                self.does_work.append(link)
                #print(" works " + link)
                return True

            except (urllib.error.HTTPError, urllib.error.URLError, ValueError):
                self.does_not_work[link]=root
                #print(" doesn't work " + link)
                return False

    def get_actual_urls(self, links, root):
        temp_links = []
        for each_link in links:

            if each_link.startswith("http") | each_link.startswith("//"):
                temp_links.append(each_link)
            else:
                temp_links.append(urljoin(root, each_link))

        for each_temp_link in temp_links:
            self.test_url(each_temp_link, root)

        return temp_links

    def run_check(self, root=None):      # root is the url of the current Site

        if root == None:
            root = self.url
        else:
            pass

        if root in self.sites or self.url.rsplit('/', 1)[0] not in root or not self.test_url(root, root):
            return

        header = urllib.parse.urlencode(self.header_values)
        header=header.encode('ascii')

        request = urllib.request.Request(root, header)
        http_response = urllib.request.urlopen(request)
        root = http_response.geturl()
        response_data= http_response.read()


        links = re.findall(r'href="(.*?)"' , str(response_data))

        links = self.get_actual_urls(links, root)
        print(root, links)

        self.sites[root]=links
        for each_link in links:
            self.run_check(each_link)

#