# Star-Mapper/sitemapper.py
from urllib.parse import urljoin
from lxml import html
import requests
class url:
    """Recursive site mapper.

    Starting from a root URL, ``run_check`` fetches each page, extracts
    every anchor ``href``, records them in ``sites`` (page URL -> list of
    links found on that page), and recurses into each link that belongs
    to the same domain as the start URL.
    """

    # Class-level defaults are kept for backward compatibility;
    # __init__ shadows the mutable ones per instance so two crawlers
    # do not share crawl state.
    url = ""            # start URL of the website to be checked
    sites = dict()      # dict with all visited pages and the links on them
    header_values = {
        'Connection:' : 'Keep-alive',
        'name' : 'Michael Foord',
        'location' : 'Northampton',
        'language' : 'English',
        'User-Agent': 'Mozilla 4/0'}
    # Substrings that mark URLs we never crawl (special pages,
    # pagination offsets, fragments, extra query parameters).
    exclude = {
        "title=Spezial",
        "offset=",
        "#",
        "&"
    }

    def __init__(self, url):
        """Create a crawler rooted at *url* (e.g. ``"https://example.com/"``)."""
        self.url = url
        # BUG FIX: ``sites`` was a shared class-level dict, so every
        # instance polluted every other's results. Give each instance
        # its own result dict.
        self.sites = dict()

    def run_check(self, root=None):
        """Crawl *root* and recurse into its same-domain links.

        root -- URL of the page to visit; defaults to the start URL.
        Returns None; results accumulate in ``self.sites``.
        """
        if root is None:
            # BUG FIX: the old code crashed with TypeError on the
            # natural first call ``run_check()`` because ``None`` is
            # not a container for the ``not in`` test below.
            root = self.url
        # Skip pages already visited and URLs outside the start domain.
        # rsplit('/')[2] extracts the host part of "scheme://host/path".
        if root in self.sites or self.url.rsplit('/')[2] not in root:
            return
        # Only follow secure (https) links.
        if "https" not in root:
            return
        # Skip any URL containing an excluded marker substring.
        for marker in self.exclude:
            if marker in root:
                return
        print(root)
        try:
            site = requests.get(root)
            tree = html.fromstring(site.content)
            links = tree.xpath('//a/@href')
        except Exception:
            # Narrowed from a bare ``except:`` (which also swallowed
            # KeyboardInterrupt/SystemExit); unreachable or unparsable
            # pages are still skipped best-effort.
            return

        # Normalise: make every link absolute and drop duplicates,
        # preserving first-seen order.
        nlinks = []
        for link in links:
            if link not in nlinks:
                if link.startswith("http"):
                    nlinks.append(link)
                else:
                    # Relative link: resolve against the final page URL
                    # (site.url accounts for redirects).
                    nlinks.append(urljoin(site.url, link))

        self.sites[root] = nlinks
        # NOTE(review): depth-first recursion; very large sites could
        # hit the recursion limit — an explicit stack would avoid that.
        for each_link in nlinks:
            self.run_check(each_link)