2019-04-26 20:53:04 +00:00
|
|
|
import urllib.request,urllib.parse,urllib.error
|
|
|
|
|
from lxml import html
|
|
|
|
|
import requests
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Site:
    """Base scraper for a news site, configured through class attributes.

    Subclasses override ``siteName``, ``baseURL``, ``searchURLString`` and
    ``xPath``. The methods below download a page with ``requests``, parse it
    with ``lxml.html`` and evaluate the configured XPath expressions on it.
    """

    siteName = ""
    # Page fetched by get_news().
    baseURL = ""
    # Prefix to which the URL-encoded search topic is appended.
    searchURLString = ""
    # Seconds passed as ``timeout`` to every requests.get() call, so that an
    # unresponsive server cannot block the caller forever. Subclasses may
    # override it.
    requestTimeout = 10

    # XPath expressions, keyed by the purpose they are used for in the
    # methods of this class. The empty defaults are placeholders that
    # subclasses are expected to replace.
    xPath = {
        "searchArticle": "",
        "searchLinks": "",
        "newsArticle": "",
        "newsLinks": "",
        "readHeadlineTitle": "",
        "readHeadlineText": "",
        "readArticleText": "",
    }

    def __init__(self):
        """Create a site object; all configuration lives on the class."""

    def _build_search_url(self, topic):
        """Return ``searchURLString`` followed by the URL-encoded *topic*.

        Spaces become ``+`` and reserved characters such as ``&`` or ``#``
        are percent-encoded so they cannot break the query string.
        """
        return self.searchURLString + urllib.parse.quote_plus(topic)

    def _fetch_tree(self, url):
        """Download *url* and return its content parsed by ``html.fromstring``."""
        response = requests.get(url, timeout=self.requestTimeout)
        return html.fromstring(response.content)

    def _headlines_from_tree(self, tree):
        """Evaluate the headline title and headline text XPaths on *tree*.

        Returns the title matches followed by the text matches.
        """
        title = tree.xpath(self.xPath["readHeadlineTitle"])
        title += tree.xpath(self.xPath["readHeadlineText"])
        return title

    def search_article(self, topic):
        """Search the site for *topic*.

        Returns a tuple ``(articles, links)`` holding the results of the
        ``searchArticle`` and ``searchLinks`` XPath expressions evaluated on
        the search result page.
        """
        tree = self._fetch_tree(self._build_search_url(topic))
        articles = tree.xpath(self.xPath["searchArticle"])
        links = tree.xpath(self.xPath["searchLinks"])
        return articles, links

    def get_news(self):
        """Fetch ``baseURL`` and return ``(articles, links)``.

        The two values are the results of the ``newsArticle`` and
        ``newsLinks`` XPath expressions evaluated on that page.
        """
        tree = self._fetch_tree(self.baseURL)
        articles = tree.xpath(self.xPath["newsArticle"])
        links = tree.xpath(self.xPath["newsLinks"])
        return articles, links

    def read_headlines(self, url):
        """Fetch *url* and return its headline title and headline text matches."""
        return self._headlines_from_tree(self._fetch_tree(url))

    # not used, who wants to listen to alexa for 10 minutes?
    def read_article(self, url):
        """Fetch *url* and return headline matches followed by article text matches.

        The page is downloaded and parsed once; the same tree serves both the
        headline XPaths and the ``readArticleText`` XPath.
        """
        tree = self._fetch_tree(url)
        # may need to be reworked
        title = self._headlines_from_tree(tree)
        title += tree.xpath(self.xPath["readArticleText"])
        return title
|
2019-05-02 18:04:10 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class Golem(Site):
    """Configuration for golem.de: site URLs and the XPath expressions
    looked up by the methods inherited from ``Site``."""

    siteName = "golem"
    baseURL = "https://www.golem.de/"
    searchURLString = "https://suche.golem.de/search.php?l=10&q="

    # Own dictionary (not the one from Site), keyed by the purpose each
    # expression is used for.
    xPath = {
        "searchArticle": '//span[@class="dh2 head2"]/text()',
        "searchLinks": '//ol[@class="list-articles"]/li/header//@href',
        "newsArticle": '//li//h2/text()',
        "newsLinks": '//div[@class="g g4"]//header//@href',
        "readHeadlineTitle": '//header/h1/span[@class="dh1 head5"]/text()',
        "readHeadlineText": '//header/p/text()',
        "readArticleText": '//div[@class="formatted"]/p/text()',
    }
|
2019-05-02 18:04:10 +00:00
|
|
|
|
2019-05-07 18:10:28 +00:00
|
|
|
class Spiegel(Site):
    """Configuration for spiegel.de: site URLs and the XPath expressions
    looked up by the methods inherited from ``Site``."""

    siteName = "spiegel"
    baseURL = "https://www.spiegel.de/"
    searchURLString = "https://www.spiegel.de/suche/?suchbegriff="

    # Own dictionary (not the one from Site), keyed by the purpose each
    # expression is used for.
    xPath = {
        "searchArticle": '//div[@class="search-teaser"]/p/text()',
        "searchLinks": '//div[@class="search-teaser"]/p//@href',
        "newsArticle": '//div[@class="column-wide pano_xxl"]//div[@class="teaser"]//h2[@class="article-title"]//span[@class="headline"]/text()',
        "newsLinks": '//div[@class="column-wide pano_xxl"]//div[@class="teaser"]//h2[@class="article-title"]//@href',
        "readHeadlineTitle": '//div[@class="column-both"]//span[@class="headline"]//text()',
        "readHeadlineText": '//div[@class="column-both"]/p/strong/text()',
        "readArticleText": '//div[@class="article-section clearfix"]/p/text()',
    }
|