# Alexa-news-stentiment-evalu.../reader/urlchecker.py

import re
import urllib.request
from urllib.error import HTTPError, URLError
from urllib.parse import urljoin

import matplotlib.pyplot as plt
import networkx as nx
class url:
    """Recursively checks every link reachable from a start URL."""

    # Request headers sent with every check; a User-Agent helps avoid
    # being rejected as a script.
    header_values = {
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/4.0',
    }

    def __init__(self, url):
        # Follow redirects once so self.url is the canonical start URL.
        self.url = urllib.request.urlopen(url).geturl()
        # Instance state, so separate checkers do not share results.
        self.sites = dict()          # maps each crawled page to the URLs found on it
        self.does_work = []          # URLs that already tested as reachable
        self.does_not_work = dict()  # maps each broken URL to the page that linked there
    def make_url(self, link, start):
        # Resolve a possibly relative link against the page it appeared on.
        return urljoin(start, link)
    def test_url(self, link, root):
        # Reuse earlier results before hitting the network again.
        if link in self.sites or link in self.does_work:
            return True
        elif link in self.does_not_work:
            return False
        try:
            # The headers must go into Request's headers= keyword; passing
            # them as the second positional argument would send them as
            # POST data instead.
            request = urllib.request.Request(link, headers=self.header_values)
            urllib.request.urlopen(request)
            self.does_work.append(link)
            print(" works " + link)
            return True
        except (HTTPError, URLError, ValueError):
            self.does_not_work[link] = root
            print(" doesn't work " + link)
            return False
    def get_actual_urls(self, links, root):
        # Turn every extracted href into an absolute URL, then test each one.
        temp_links = []
        for each_link in links:
            if each_link.startswith("http"):
                temp_links.append(each_link)
            else:
                # urljoin resolves relative paths and also gives
                # scheme-relative links ("//host/...") a scheme, so they
                # can be opened later.
                temp_links.append(urljoin(root, each_link))
        for each_temp_link in temp_links:
            self.test_url(each_temp_link, root)
        return temp_links
    def run_check(self, root=None):  # root is the URL of the page being crawled
        if root is None:
            root = self.url
        # Stop if this page was already crawled, leaves the start site,
        # or does not respond.
        if (root in self.sites
                or self.url.rsplit('/', 1)[0] not in root
                or not self.test_url(root, root)):
            return
        request = urllib.request.Request(root, headers=self.header_values)
        http_response = urllib.request.urlopen(request)
        root = http_response.geturl()
        # Decode the body before matching; calling str() on the raw bytes
        # would search the "b'...'" repr instead of the HTML itself.
        response_data = http_response.read().decode('utf-8', errors='ignore')
        links = re.findall(r'href="(.*?)"', response_data)
        links = self.get_actual_urls(links, root)
        self.sites[root] = links
        # Recurse into every link found on this page.
        for each_link in links:
            self.run_check(each_link)
    def graph(self):
        # Build an undirected graph from the adjacency dict (page -> links).
        G = nx.Graph(self.sites)
        # Shorten every node's label so the drawing stays readable; the
        # labels dict must be handed to nx.draw to take effect.
        label_dict = {node: self.remove_root(node) for node in G.nodes()}
        nx.draw(G, labels=label_dict, with_labels=True, font_size=8, node_size=1000,
                node_color="skyblue", edge_color='#A0FFA2', pos=nx.spring_layout(G))
        plt.show()
    def remove_root(self, link):
        # Reduce a full URL to its last path component without the file
        # extension, e.g. ".../reader/index.html" -> "index".
        return link.rsplit('/', 1)[-1].rsplit('.', 1)[0]
    def clean(self):
        # Reset all crawl state so the instance can be reused.
        self.sites.clear()
        self.does_not_work.clear()
        self.does_work.clear()
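

# A minimal usage sketch, assuming the module is run directly; the start URL
# below is a placeholder, not part of the original file.
if __name__ == "__main__":
    checker = url("http://example.com")  # hypothetical start URL
    checker.run_check()                  # crawl the site and test every link
    if checker.does_not_work:
        print("broken links:", checker.does_not_work)
    checker.graph()                      # plot the link structure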