# Star-Mapper/sitemapper.py
from urllib.parse import urljoin
from lxml import html
import requests
class url:
    """Recursive site mapper.

    Starting from a root URL, ``run_check`` fetches each page, extracts
    every anchor ``href``, records them in ``sites`` (page URL -> list of
    links found on that page), and recurses into each link that belongs
    to the same domain as the start URL.
    """

    # Class-level defaults are kept for backward compatibility;
    # __init__ shadows the mutable ones per instance so two crawlers
    # do not share crawl state.
    url = ""            # start URL of the website to be checked
    sites = dict()      # dict with all visited pages and the links on them
    header_values = {
        'Connection:' : 'Keep-alive',
        'name' : 'Michael Foord',
        'location' : 'Northampton',
        'language' : 'English',
        'User-Agent': 'Mozilla 4/0'}
    # Substrings that mark URLs we never crawl (special pages,
    # pagination offsets, fragments, extra query parameters).
    exclude = {
        "title=Spezial",
        "offset=",
        "#",
        "&"
    }

    def __init__(self, url):
        """Create a crawler rooted at *url* (e.g. ``"https://example.com/"``)."""
        self.url = url
        # BUG FIX: ``sites`` was a shared class-level dict, so every
        # instance polluted every other's results. Give each instance
        # its own result dict.
        self.sites = dict()

    def run_check(self, root=None):
        """Crawl *root* and recurse into its same-domain links.

        root -- URL of the page to visit; defaults to the start URL.
        Returns None; results accumulate in ``self.sites``.
        """
        if root is None:
            # BUG FIX: the old code crashed with TypeError on the
            # natural first call ``run_check()`` because ``None`` is
            # not a container for the ``not in`` test below.
            root = self.url
        # Skip pages already visited and URLs outside the start domain.
        # rsplit('/')[2] extracts the host part of "scheme://host/path".
        if root in self.sites or self.url.rsplit('/')[2] not in root:
            return
        # Only follow secure (https) links.
        if "https" not in root:
            return
        # Skip any URL containing an excluded marker substring.
        for marker in self.exclude:
            if marker in root:
                return
        print(root)
        try:
            site = requests.get(root)
            tree = html.fromstring(site.content)
            links = tree.xpath('//a/@href')
        except Exception:
            # Narrowed from a bare ``except:`` (which also swallowed
            # KeyboardInterrupt/SystemExit); unreachable or unparsable
            # pages are still skipped best-effort.
            return

        # Normalise: make every link absolute and drop duplicates,
        # preserving first-seen order.
        nlinks = []
        for link in links:
            if link not in nlinks:
                if link.startswith("http"):
                    nlinks.append(link)
                else:
                    # Relative link: resolve against the final page URL
                    # (site.url accounts for redirects).
                    nlinks.append(urljoin(site.url, link))

        self.sites[root] = nlinks
        # NOTE(review): depth-first recursion; very large sites could
        # hit the recursion limit — an explicit stack would avoid that.
        for each_link in nlinks:
            self.run_check(each_link)