added core func for Golem
commit 401d3bd8ba (parent 517db5be29)
(two image files moved unchanged: 39 KiB and 3.4 KiB)
@@ -0,0 +1,61 @@
import urllib.request
from lxml import html
import requests


class site:
    # Base class for news sites; subclasses override the scraping methods.
    url = ""
    header_values = {
        'Connection': 'Keep-alive',
        'name': 'Michael Foord',
        'location': 'Northampton',
        'language': 'German',
        'User-Agent': 'Mozilla 4/0'}

    def __init__(self, url):
        # Follow any redirects so self.url holds the canonical address.
        self.url = urllib.request.urlopen(url).geturl()

    def search_article(self, topic):
        return False

    def get_news(self):
        return False

    def read_article(self, url):
        return False

    def read_headlines(self, url):
        return False


class Golem(site):

    def search_article(self, topic):
        # Query golem.de's search and return the matching headlines.
        searchURL = "https://suche.golem.de/search.php?l=10&q=" + topic.replace(" ", "+")
        response = requests.get(searchURL, headers=self.header_values)
        tree = html.fromstring(response.content)

        articles = tree.xpath('//span[@class="dh2 head2"]/text()')
        return articles

    def get_news(self):
        # Scrape the golem.de front page for the current headlines.
        searchURL = "https://www.golem.de/"
        response = requests.get(searchURL, headers=self.header_values)
        tree = html.fromstring(response.content)

        articles = tree.xpath('//h2[@class="head2"]/text()')
        return articles

    def read_headlines(self, url):
        # Return the headline and teaser of a single article page.
        response = requests.get(url, headers=self.header_values)
        tree = html.fromstring(response.content)

        title = tree.xpath('//header/h1/span[@class="dh1 head5"]/text()')
        title += tree.xpath('//header/p/text()')
        return title

    def read_article(self, url):
        # Return the headline, teaser and body paragraphs of an article.
        response = requests.get(url, headers=self.header_values)
        tree = html.fromstring(response.content)

        title = self.read_headlines(url)
        title += tree.xpath('//div[@class="formatted"]/p/text()')
        return title
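A minimal usage sketch for the scraper above (the search topic is an arbitrary example, and the XPath selectors only work while golem.de keeps its current markup):

if __name__ == "__main__":
    golem = Golem("https://www.golem.de/")
    print(golem.get_news())                # current front-page headlines
    print(golem.search_article("Linux"))   # headlines matching an example topic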
@@ -0,0 +1,10 @@
from lxml import html
import requests

# Quick smoke test: fetch a single article and dump its body paragraphs.
url = "https://www.golem.de/news/tchap-forscher-gelingt-anmeldung-im-regierungschat-frankreichs-1904-140799.html"
site = requests.get(url)
tree = html.fromstring(site.content)
title = tree.xpath('//div[@class="formatted"]/p/text()')
print(title)
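Since xpath() returns a list of paragraph fragments, joining them prints more readably (a cosmetic variant, not part of the commit):

print("\n".join(title))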
@@ -0,0 +1,135 @@
import urllib.request
import urllib.error
from urllib.parse import urljoin
import re

import networkx as nx
import matplotlib.pyplot as plt

class url:
    # Crawls a site recursively, recording which of its links work.

    header_values = {
        'Connection': 'Keep-alive',
        'name': 'Michael Foord',
        'location': 'Northampton',
        'language': 'English',
        'User-Agent': 'Mozilla 4/0'}

    def __init__(self, url):
        # Follow any redirects so self.url holds the canonical address.
        self.url = urllib.request.urlopen(url).geturl()
        self.sites = dict()          # maps every crawled page to the urls found on it
        self.does_work = []          # urls that already tested as reachable
        self.does_not_work = dict()  # maps every broken url to the page that linked there

    def make_url(self, link, start):
        # Resolve a possibly relative link against the page it appeared on.
        return urljoin(start, link)

    def test_url(self, link, root):
        # Check whether a link is reachable; results are cached across calls.
        if link in self.sites or link in self.does_work:
            return True
        elif link in self.does_not_work:
            return False
        else:
            try:
                # Send the browser-like headers along with a plain GET request.
                request = urllib.request.Request(link, headers=self.header_values)
                urllib.request.urlopen(request)
                self.does_work.append(link)
                print(" works " + link)
                return True
            except (urllib.error.HTTPError, urllib.error.URLError, ValueError):
                # ValueError covers malformed urls without a scheme.
                self.does_not_work[link] = root
                print(" doesn't work " + link)
                return False

    def get_actual_urls(self, links, root):
        # Turn raw href values into absolute urls and test each one.
        temp_links = []
        for each_link in links:
            if each_link.startswith("http") or each_link.startswith("//"):
                temp_links.append(each_link)
            else:
                temp_links.append(urljoin(root, each_link))

        for each_temp_link in temp_links:
            self.test_url(each_temp_link, root)

        return temp_links

    def run_check(self, root=None):  # root is the url of the current site
        # Crawl recursively from self.url, staying inside the start domain.
        if root is None:
            root = self.url

        # Stop at pages already crawled, external pages, and dead links.
        if root in self.sites or self.url.rsplit('/', 1)[0] not in root or not self.test_url(root, root):
            return

        request = urllib.request.Request(root, headers=self.header_values)
        http_response = urllib.request.urlopen(request)
        root = http_response.geturl()
        response_data = http_response.read()

        # Collect every href on the page and normalise it to an absolute url.
        links = re.findall(r'href="(.*?)"', str(response_data))
        links = self.get_actual_urls(links, root)

        self.sites[root] = links
        for each_link in links:
            self.run_check(each_link)

    def graph(self):
        # Draw the crawled link structure as an undirected graph.
        G = nx.Graph(self.sites)

        label_dict = {}
        for key, value in self.sites.items():  # that's not how it works... todo: later
            label_dict[key] = self.remove_root(value)

        nx.draw(G, with_labels=True, font_size=8, node_size=1000,
                node_color="skyblue", edge_color='#A0FFA2', pos=nx.spring_layout(G))

        plt.show()

    def remove_root(self, links):
        # Shorten urls so they can serve as compact graph labels.
        ret_links = []
        for link in links:
            ret_links.append(link.rsplit('.', 1)[0])

        return ret_links

    def clean(self):
        # Reset all crawl state so the checker can be reused.
        self.sites.clear()
        self.does_not_work.clear()
        self.does_work.clear()
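A minimal driver sketch for the link checker (the start url is a hypothetical example; note that run_check recurses once per page, so very large sites can hit Python's recursion limit):

if __name__ == "__main__":
    checker = url("https://example.com/")  # hypothetical start page
    checker.run_check()                    # crawl and test every internal link
    print(checker.does_not_work)           # broken urls and the pages that linked to them
    checker.graph()                        # draw the site's link structure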