Alexa-news-stentiment-evalu.../reader/siteobj.py

import urllib.request,urllib.parse,urllib.error
from lxml import html
import requests
import re


class Site:
    """Base scraper for a news site, configured through class attributes.

    Subclasses override the URLs and the XPath expressions below. Every
    public method downloads a page with ``requests``, parses it with
    ``lxml.html`` and evaluates the configured XPath queries on the tree.
    """

    siteName = ""
    baseURL = ""
    # Prefix of the search URL; the encoded topic is appended to it.
    searchURLString = ""
    # Timeout in seconds handed to requests.get so that an unresponsive
    # server cannot block the caller indefinitely.
    requestTimeout = 10
    # XPath queries keyed by purpose; empty in the base class.
    xPath = {
        "searchArticle": "",
        "searchLinks": "",
        "newsArticle": "",
        "newsLinks": "",
        "readHeadlineTitle": "",
        "readHeadlineText": "",
        "readArticleText": "",
    }

    def __init__(self):
        """Create a site object; all configuration lives on the class."""

    def _build_search_url(self, topic):
        """Return the search URL for *topic* with the topic form-encoded.

        Spaces become ``+`` and reserved characters such as ``&`` or ``#``
        are percent-encoded so they cannot break the query string.
        """
        return self.searchURLString + urllib.parse.quote_plus(topic)

    def _fetch_tree(self, url):
        """Download *url* and return the parsed lxml HTML tree."""
        response = requests.get(url, timeout=self.requestTimeout)
        return html.fromstring(response.content)

    def _headline_parts(self, tree):
        """Return headline title results followed by headline text results."""
        return (tree.xpath(self.xPath["readHeadlineTitle"])
                + tree.xpath(self.xPath["readHeadlineText"]))

    def search_article(self, topic):
        """Search the site for *topic*.

        Returns a tuple ``(articles, links)`` holding the results of the
        ``searchArticle`` and ``searchLinks`` XPath queries.
        """
        tree = self._fetch_tree(self._build_search_url(topic))
        articles = tree.xpath(self.xPath["searchArticle"])
        links = tree.xpath(self.xPath["searchLinks"])
        return articles, links

    def get_news(self):
        """Load the page at ``baseURL``.

        Returns a tuple ``(articles, links)`` holding the results of the
        ``newsArticle`` and ``newsLinks`` XPath queries.
        """
        tree = self._fetch_tree(self.baseURL)
        articles = tree.xpath(self.xPath["newsArticle"])
        links = tree.xpath(self.xPath["newsLinks"])
        return articles, links

    def read_headlines(self, url):
        """Return the headline title and headline text results for *url*."""
        return self._headline_parts(self._fetch_tree(url))

    # not used, who wants to listen to alexa for 10 minutes?
    def read_article(self, url):
        """Return headline results followed by the article text results.

        The page is downloaded and parsed once; the headline queries and
        the ``readArticleText`` query run against the same tree.
        """
        tree = self._fetch_tree(url)
        # may need to be reworked
        return (self._headline_parts(tree)
                + tree.xpath(self.xPath["readArticleText"]))
    

class Golem(Site):
    """Site configuration for golem.de: URLs and XPath queries."""

    siteName = "golem"
    baseURL = "https://www.golem.de/"
    searchURLString = "https://suche.golem.de/search.php?l=10&q="
    # Own dict so the queries are not shared with the base class.
    xPath = {
        "searchArticle": '//span[@class="dh2 head2"]/text()',
        "searchLinks": '//ol[@class="list-articles"]/li/header//@href',
        "newsArticle": '//li//h2/text()',
        "newsLinks": '//div[@class="g g4"]//header//@href',
        "readHeadlineTitle": '//header/h1/span[@class="dh1 head5"]/text()',
        "readHeadlineText": '//header/p/text()',
        "readArticleText": '//div[@class="formatted"]/p/text()',
    }

class Spiegel(Site):
    """Site configuration for spiegel.de: URLs and XPath queries."""

    siteName = "spiegel"
    baseURL = "https://www.spiegel.de/"
    searchURLString = "https://www.spiegel.de/suche/?suchbegriff="
    # Own dict so the queries are not shared with the base class.
    xPath = {
        "searchArticle": '//div[@class="search-teaser"]/p/text()',
        "searchLinks": '//div[@class="search-teaser"]/p//@href',
        "newsArticle": '//div[@class="column-wide pano_xxl"]//div[@class="teaser"]//h2[@class="article-title"]//span[@class="headline"]/text()',
        "newsLinks": '//div[@class="column-wide pano_xxl"]//div[@class="teaser"]//h2[@class="article-title"]//@href',
        "readHeadlineTitle": '//div[@class="column-both"]//span[@class="headline"]//text()',
        "readHeadlineText": '//div[@class="column-both"]/p/strong/text()',
        "readArticleText": '//div[@class="article-section clearfix"]/p/text()',
    }
TODO: Output 2019-04-26 20:53:04 +00:00			`import urllib.request,urllib.parse,urllib.error`
			`from lxml import html`
			`import requests`
			`import re`


			`class Site:`
redone site object and working dialog for news and search 2019-05-02 18:04:10 +00:00			`siteName = ""`
			`baseURL = ""`
			`searchURLString = ""`
			`xPath = dict()`
			`xPath["searchArticle"] = ""`
			`xPath["searchLinks"] = ""`
			`xPath["newsArticle"] = ""`
spiegel works / golem not 2019-05-07 18:10:28 +00:00			`xPath["newsLinks"] = ""`
redone site object and working dialog for news and search 2019-05-02 18:04:10 +00:00			`xPath["readHeadlineTitle"] = ""`
			`xPath["readHeadlineText"] = ""`
			`xPath["readArticleText"] = ""`

TODO: Output 2019-04-26 20:53:04 +00:00			`def __init__(self):`
			`return None`

			`def search_article(self, topic):`
redone site object and working dialog for news and search 2019-05-02 18:04:10 +00:00			`searchURL = self.searchURLString + topic.replace(" ", "+")`
TODO: Output 2019-04-26 20:53:04 +00:00			`site = requests.get(searchURL)`
			`tree = html.fromstring(site.content)`

redone site object and working dialog for news and search 2019-05-02 18:04:10 +00:00			`articles = tree.xpath(self.xPath["searchArticle"])`
			`links = tree.xpath(self.xPath["searchLinks"])`
TODO: Output 2019-04-26 20:53:04 +00:00			`return articles, links`

			`def get_news(self):`
redone site object and working dialog for news and search 2019-05-02 18:04:10 +00:00			`searchURL = self.baseURL`
TODO: Output 2019-04-26 20:53:04 +00:00			`site = requests.get(searchURL)`
			`tree = html.fromstring(site.content)`

redone site object and working dialog for news and search 2019-05-02 18:04:10 +00:00			`articles = tree.xpath(self.xPath["newsArticle"])`
spiegel works / golem not 2019-05-07 18:10:28 +00:00			`links = tree.xpath(self.xPath["newsLinks"])`
			`return articles, links`
TODO: Output 2019-04-26 20:53:04 +00:00
			`def read_headlines(self, url):`
			`site = requests.get(url)`
			`tree = html.fromstring(site.content)`

redone site object and working dialog for news and search 2019-05-02 18:04:10 +00:00			`title = tree.xpath(self.xPath["readHeadlineTitle"] )`
			`title += tree.xpath(self.xPath["readHeadlineText"])`
TODO: Output 2019-04-26 20:53:04 +00:00			`return title`

refactored 2019-05-13 09:53:13 +00:00			`# not used, who wants to listen to alexa for 10 minutes?`
TODO: Output 2019-04-26 20:53:04 +00:00			`def read_article(self, url):`
			`site = requests.get(url)`
			`tree = html.fromstring(site.content)`

refactored 2019-05-13 09:53:13 +00:00			`# may need to be reworked`
TODO: Output 2019-04-26 20:53:04 +00:00			`title = self.read_headlines(url)`
redone site object and working dialog for news and search 2019-05-02 18:04:10 +00:00			`title += tree.xpath(self.xPath["readArticleText"])`
TODO: Output 2019-04-26 20:53:04 +00:00			`return title`
redone site object and working dialog for news and search 2019-05-02 18:04:10 +00:00

			`class Golem(Site):`
			`siteName = "golem"`
			`baseURL = "https://www.golem.de/"`
			`searchURLString = "https://suche.golem.de/search.php?l=10&q="`
golem + spiegel works TODO: bewerten 2019-05-08 14:48:42 +00:00			`xPath = dict()`
			`xPath["searchArticle"] = '//span[@class="dh2 head2"]/text()'`
			`xPath["searchLinks"] = '//ol[@class="list-articles"]/li/header//@href'`
			`xPath["newsArticle"] = '//li//h2/text()'`
			`xPath["newsLinks"] = '//div[@class="g g4"]//header//@href'`
			`xPath["readHeadlineTitle"] = '//header/h1/span[@class="dh1 head5"]/text()'`
			`xPath["readHeadlineText"] = '//header/p/text()'`
			`xPath["readArticleText"] = '//div[@class="formatted"]/p/text()'`
redone site object and working dialog for news and search 2019-05-02 18:04:10 +00:00
spiegel works / golem not 2019-05-07 18:10:28 +00:00			`class Spiegel(Site):`
			`siteName = "spiegel"`
			`baseURL = "https://www.spiegel.de/"`
			`searchURLString = "https://www.spiegel.de/suche/?suchbegriff="`
golem + spiegel works TODO: bewerten 2019-05-08 14:48:42 +00:00			`xPath = dict()`
			`xPath["searchArticle"] = '//div[@class="search-teaser"]/p/text()'`
			`xPath["searchLinks"] = '//div[@class="search-teaser"]/p//@href'`
			`xPath["newsArticle"] = '//div[@class="column-wide pano_xxl"]//div[@class="teaser"]//h2[@class="article-title"]//span[@class="headline"]/text()'`
			`xPath["newsLinks"] = '//div[@class="column-wide pano_xxl"]//div[@class="teaser"]//h2[@class="article-title"]//@href'`
			`xPath["readHeadlineTitle"] = '//div[@class="column-both"]//span[@class="headline"]//text()'`
			`xPath["readHeadlineText"] = '//div[@class="column-both"]/p/strong/text()'`
			`xPath["readArticleText"] = '//div[@class="article-section clearfix"]/p/text()'`