2019-04-22 15:04:52 +00:00
|
|
|
from urllib.parse import urljoin, urlparse

import requests
from lxml import html
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class url:
    """Crawl one website and record which links every visited page contains.

    Starting from a page, ``run_check`` downloads it, extracts the ``href`` of
    every ``<a>`` element, stores the resolved links in ``sites`` and then
    follows those links depth-first as long as they stay on the start URL's
    host and contain none of the ``exclude`` markers.
    """

    url = ""  # the url of the website to be checked

    # Class-level defaults are kept so the names still exist on the class
    # itself; __init__ gives every instance its own containers so two
    # crawlers never share state.
    sites = dict()  # page url -> list of absolute urls linked from that page
    does_work = []  # intended for urls that tested positive; not written by this class
    does_not_work = dict()  # intended for broken url -> linking page; not written by this class

    # NOTE(review): not passed to requests.get anywhere in this class, and the
    # 'Connection:' key carries a stray colon - confirm before sending these
    # as HTTP headers.
    header_values = {
        'Connection:': 'Keep-alive',
        'name': 'Michael Foord',
        'location': 'Northampton',
        'language': 'English',
        'User-Agent': 'Mozilla 4/0',
    }

    # A page whose url contains any of these substrings is never fetched.
    exclude = {
        "title=Spezial",
        "offset=",
    }

    # Seconds to wait for a server before giving up on a page; without a
    # timeout a single stalled connection would block the crawl forever.
    request_timeout = 10

    def __init__(self, url):
        """Create a crawler for the website at *url* (an absolute URL)."""
        self.url = url
        self.sites = {}
        self.does_work = []
        self.does_not_work = {}

    def run_check(self, root=None):  # root is the url of the current site
        """Crawl depth-first from *root* and fill ``self.sites``.

        Args:
            root: URL of the page to start from. Defaults to ``self.url``.

        Returns:
            None. Results are stored in ``self.sites``; pages that cannot be
            fetched or parsed are skipped and not recorded.

        Raises:
            ValueError: if ``self.url`` has no host part, e.g. it lacks the
                ``http://`` scheme.
        """
        if root is None:
            root = self.url
        host = self._site_host()

        # An explicit stack replaces recursion so deep sites cannot exhaust
        # the interpreter's recursion limit. Links are pushed in reverse so
        # they are popped - and therefore visited - in document order.
        pending = [root]
        while pending:
            page = pending.pop()
            if page in self.sites or not self._is_on_site(page, host):
                continue
            if any(marker in page for marker in self.exclude):
                continue
            links = self._fetch_links(page)
            if links is None:
                continue
            self.sites[page] = links
            pending.extend(reversed(links))

    def _site_host(self):
        """Return the lower-cased host (netloc) of the start URL."""
        host = urlparse(self.url).netloc.lower()
        if not host:
            raise ValueError(
                "start url must be absolute (scheme://host/...): %r" % (self.url,)
            )
        return host

    @staticmethod
    def _is_on_site(page, host):
        """Return True if *page* is served by *host* or one of its subdomains."""
        try:
            netloc = urlparse(page).netloc.lower()
        except ValueError:
            # Malformed url (e.g. a broken IPv6 literal): nothing to fetch.
            return False
        return netloc == host or netloc.endswith("." + host)

    def _fetch_links(self, page):
        """Download *page* and return its resolved links, or None on failure."""
        try:
            response = requests.get(page, timeout=self.request_timeout)
            tree = html.fromstring(response.content)
            hrefs = tree.xpath('//a/@href')
        except Exception:
            # Deliberately best-effort: one unreachable or unparsable page
            # must not abort the crawl. Unlike a bare ``except`` this still
            # lets KeyboardInterrupt and SystemExit stop the program.
            return None
        # Resolve against the final url so redirects are taken into account.
        return self._resolve_links(response.url, hrefs)

    @staticmethod
    def _resolve_links(base, hrefs):
        """Make *hrefs* absolute against *base*, dropping duplicates in order."""
        seen = set()
        resolved = []
        for href in hrefs:
            if href.startswith("http"):
                target = href
            else:
                try:
                    target = urljoin(base, href)
                except ValueError:
                    # Unparsable href: skip it, do not abort the crawl.
                    continue
            # De-duplicate after resolving so a repeated relative link and
            # its absolute spelling count as the same target.
            if target not in seen:
                seen.add(target)
                resolved.append(target)
        return resolved
|
|
|
|
|
|