This commit is contained in:
parent
d61dc1fdac
commit
4a8b341af9
109
urlchecker.py
109
urlchecker.py
|
|
@ -1,109 +0,0 @@
|
|||
import urllib.request,urllib.parse,urllib.error
|
||||
from urllib.error import HTTPError
|
||||
from urllib.error import URLError
|
||||
from urllib.parse import urljoin
|
||||
import requests
|
||||
import re
|
||||
|
||||
|
||||
|
||||
class url:
|
||||
|
||||
url = "" # the url of the website to be checked
|
||||
sites = dict() # dic. with all sites and urls on those sites
|
||||
does_work = [] # array with all prev. positiv tested urls
|
||||
does_not_work = dict() # dic. with all not working urls and the site that linked there
|
||||
header_values = {
|
||||
'Connection:' : 'Keep-alive',
|
||||
'name' : 'Michael Foord',
|
||||
'location' : 'Northampton',
|
||||
'language' : 'English',
|
||||
'User-Agent': 'Mozilla 4/0'}
|
||||
|
||||
def __init__(self, url):
|
||||
self.url = urllib.request.urlopen(url).geturl()
|
||||
|
||||
|
||||
def make_url(self, link, start):
|
||||
ret_link = urljoin(start, link)
|
||||
|
||||
return ret_link
|
||||
|
||||
def test_url(self, link, root):
|
||||
|
||||
if link in self.sites or link in self.does_work:
|
||||
return True
|
||||
elif link in self.does_not_work:
|
||||
return False
|
||||
else:
|
||||
try:
|
||||
|
||||
header = urllib.parse.urlencode(self.header_values)
|
||||
header=header.encode('ascii')
|
||||
request = urllib.request.Request(link, header)
|
||||
response = urllib.request.urlopen(request)
|
||||
self.does_work.append(link)
|
||||
#print(" works " + link)
|
||||
return True
|
||||
|
||||
except (urllib.error.HTTPError, urllib.error.URLError, ValueError):
|
||||
self.does_not_work[link]=root
|
||||
#print(" doesn't work " + link)
|
||||
return False
|
||||
|
||||
def get_actual_urls(self, links, root):
|
||||
temp_links = []
|
||||
for each_link in links:
|
||||
|
||||
if each_link.startswith("http") | each_link.startswith("//"):
|
||||
temp_links.append(each_link)
|
||||
else:
|
||||
temp_links.append(urljoin(root, each_link))
|
||||
|
||||
for each_temp_link in temp_links:
|
||||
self.test_url(each_temp_link, root)
|
||||
|
||||
return temp_links
|
||||
|
||||
def run_check(self, root=None): # root is the url of the current Site
|
||||
|
||||
if root == None:
|
||||
root = self.url
|
||||
else:
|
||||
pass
|
||||
|
||||
if root in self.sites or self.url.rsplit('/', 1)[0] not in root or not self.test_url(root, root):
|
||||
return
|
||||
|
||||
header = urllib.parse.urlencode(self.header_values)
|
||||
header=header.encode('ascii')
|
||||
|
||||
request = urllib.request.Request(root, header)
|
||||
http_response = urllib.request.urlopen(request)
|
||||
root = http_response.geturl()
|
||||
response_data= http_response.read()
|
||||
|
||||
|
||||
links = re.findall(r'href="(.*?)"' , str(response_data))
|
||||
|
||||
links = self.get_actual_urls(links, root)
|
||||
print(root, links)
|
||||
|
||||
self.sites[root]=links
|
||||
for each_link in links:
|
||||
self.run_check(each_link)
|
||||
|
||||
#
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Loading…
Reference in New Issue