# Star-Mapper/sitemapper.py
from urllib.parse import urljoin

from lxml import html
import requests
class url:
    """Recursive same-domain link crawler.

    Starting from a base URL, fetches each page, extracts every ``<a href>``
    link, records them per page in ``self.sites``, and recurses into each
    discovered link. Crawling never leaves the base URL's domain and skips
    URLs matching any pattern in ``exclude``.

    NOTE(review): the lowercase class name shadows a common variable name;
    kept as-is to preserve the public interface.
    """

    # Candidate request headers; currently NOT passed to requests.get.
    # NOTE(review): the 'Connection:' key carries a stray colon — confirm
    # intent before wiring these into the request.
    header_values = {
        'Connection:': 'Keep-alive',
        'name': 'Michael Foord',
        'location': 'Northampton',
        'language': 'English',
        'User-Agent': 'Mozilla 4/0',
    }

    # URL substrings that must never be crawled
    # (e.g. German MediaWiki special pages and paginated listings).
    exclude = {
        "title=Spezial",
        "offset=",
    }

    def __init__(self, url):
        # Base URL of the website to be checked.
        self.url = url
        # Mutable crawl state is per-instance now. It used to live at class
        # level, which silently shared one crawl history across ALL instances.
        self.sites = dict()          # page url -> list of links found on it
        self.does_work = []          # previously verified-good urls
        self.does_not_work = dict()  # broken url -> page that linked to it

    def run_check(self, root=None):
        """Crawl *root* (defaults to the base URL) and recurse into its links.

        Records the (absolute, de-duplicated) links of each visited page in
        ``self.sites``. Returns None; results accumulate on the instance.
        Pages already visited, outside the base domain, matching an excluded
        pattern, or failing to download/parse are skipped silently.
        """
        if root is None:
            # Bug fix: run_check() with no argument used to raise TypeError
            # on the substring test below.
            root = self.url

        # Domain component of the base URL ('http://host/...' -> 'host').
        parts = self.url.rsplit('/')
        domain = parts[2] if len(parts) > 2 else self.url

        # Skip pages already visited or outside the starting domain.
        if root in self.sites or domain not in root:
            return

        # Skip urls matching any excluded pattern.
        if any(pattern in root for pattern in self.exclude):
            return

        try:
            site = requests.get(root)
            tree = html.fromstring(site.content)
            links = tree.xpath('//a/@href')
        except Exception:
            # Narrowed from a bare `except:` so Ctrl-C / SystemExit still
            # propagate; any download or parse failure skips the page.
            return

        # Normalise every href to an absolute URL, de-duplicated in order.
        # De-dup happens AFTER normalisation (the old code compared raw
        # hrefs, letting '/a' and 'http://host/a' both through).
        nlinks = []
        for link in links:
            absolute = link if link.startswith("http") else urljoin(site.url, link)
            if absolute not in nlinks:
                nlinks.append(absolute)

        self.sites[root] = nlinks
        for each_link in nlinks:
            self.run_check(each_link)