# Star-Mapper/sitemapper.py

from urllib.parse import urljoin, urlsplit

import requests
from lxml import html


class url:
    """Crawl one website over https and record the links found on each page.

    After ``run_check`` finishes, ``sites`` maps every successfully fetched
    page URL to the list of absolute link URLs found on that page.
    """

    url = ""                # the url of the website to be checked
    # Class-level fallback kept for backward compatibility; ``__init__`` gives
    # every instance its own dict so separate crawls do not share results.
    sites = dict()          # dic. with all sites and urls on those sites
    # NOTE(review): not referenced by any method of this class, so these
    # values are never sent with a request. The 'Connection:' key carries a
    # stray colon -- confirm the intended headers before wiring them in.
    header_values = {
                'Connection:' : 'Keep-alive',
                'name' : 'Michael Foord',
                'location' : 'Northampton',
                'language' : 'English',
                'User-Agent': 'Mozilla 4/0'}

    # A page is skipped when its URL contains any of these substrings.
    exclude = {
        "title=Spezial",
        "doodles/",
        "#",
        "&"
    }

    # Seconds to wait for a response; without a timeout a stalled server
    # would block the whole crawl indefinitely.
    timeout = 30

    def __init__(self, url):
        """Store the start URL and create an empty, per-instance result map."""
        self.url = url
        self.sites = {}

    def _host(self):
        """Return the host part of ``self.url`` used to keep the crawl on-site.

        Raises:
            ValueError: if no host can be derived from ``self.url``.
        """
        # Fall back to the first path segment for scheme-less URLs such as
        # "example.com/wiki", where urlsplit reports an empty netloc.
        host = urlsplit(self.url).netloc or self.url.split('/')[0]
        if not host:
            # An empty host is a substring of every URL and would disable
            # the same-site filter entirely.
            raise ValueError("cannot determine host from url %r" % (self.url,))
        return host

    def _should_visit(self, page, host):
        """Return True if ``page`` is unvisited, on-site, https and not excluded."""
        # Substring match on the host: sub-domains of the site also pass.
        if page in self.sites or host not in page:
            return False
        if not page.startswith("https://"):
            return False
        return not any(marker in page for marker in self.exclude)

    def _fetch_links(self, page):
        """Download ``page`` and return its links as absolute URLs.

        Duplicates are removed while keeping first-seen order. Returns
        ``None`` when the page cannot be downloaded or parsed.
        """
        try:
            response = requests.get(page, timeout=self.timeout)
            tree = html.fromstring(response.content)
            hrefs = tree.xpath('//a/@href')
        except Exception:
            # Best-effort crawl: one broken page must not abort the run, but
            # KeyboardInterrupt / SystemExit are allowed to propagate.
            return None

        links = []
        seen = set()
        for href in hrefs:
            # response.url is the final URL after redirects, which is the
            # correct base for resolving relative links.
            if href.startswith("http"):
                target = href
            else:
                target = urljoin(response.url, href)
            # De-duplicate on the resolved URL so repeated relative links
            # are stored only once.
            if target not in seen:
                seen.add(target)
                links.append(target)
        return links

    def run_check(self, root=None):      # root is the url of the current Site
        """Crawl the site depth-first starting at ``root``.

        Args:
            root: URL of the page to start from; defaults to ``self.url``.

        Every page that passes the filters is printed, fetched, and stored in
        ``self.sites`` together with the links found on it. Pages that fail
        to download or parse are printed but not stored.

        Raises:
            ValueError: if no host can be derived from ``self.url``.
        """
        if root is None:
            root = self.url
        host = self._host()

        # An explicit stack replaces recursion so that long link chains
        # cannot exhaust the interpreter's recursion limit.
        pending = [root]
        while pending:
            page = pending.pop()
            if not self._should_visit(page, host):
                continue
            print(page)
            links = self._fetch_links(page)
            if links is None:
                continue
            self.sites[page] = links
            # Push in reverse so pages are visited in document order,
            # matching a recursive depth-first traversal.
            pending.extend(reversed(links))