golem + spiegel works

TODO: bewerten
This commit is contained in:
Patrice 2019-05-08 16:48:42 +02:00
parent 1b71fb1c00
commit 22be23224f
4 changed files with 31 additions and 40 deletions

View File

@ -14,7 +14,6 @@ logging.getLogger('flask_ask').setLevel(logging.DEBUG)
@ask.intent('searchon', mapping={'site': 'Site'}, default={'site': 'golem'}) @ask.intent('searchon', mapping={'site': 'Site'}, default={'site': 'golem'})
def search_on(site): def search_on(site):
try: try:
session.attributes["siteName"] = site session.attributes["siteName"] = site
print(session.attributes["siteName"]) print(session.attributes["siteName"])
except: except:
@ -38,7 +37,7 @@ def search_for(searchTerm):
if site == "golem": if site == "golem":
obj = site2.Golem() obj = site2.Golem()
elif site == "spiegel": elif site.lower() == "spiegel":
obj = site2.Spiegel() obj = site2.Spiegel()
elif site is None: elif site is None:
session.attributes["searchTerm"] = searchTerm session.attributes["searchTerm"] = searchTerm
@ -64,14 +63,13 @@ def search_for(searchTerm):
@ask.intent('News', mapping={'site': 'Site'}, default={'site': ''}) @ask.intent('News', mapping={'site': 'Site'}, default={'site': ''})
def news(site): def news(site):
try: try:
site = site.lower()
session.attributes["siteName"] = site session.attributes["siteName"] = site
except: except:
print("error") print("error")
print(site) print(site)
if site == "golem": if site == "golem":
obj = site2.Golem() obj = site2.Golem()
elif site == "spiegel": elif site.lower() == "spiegel":
obj = site2.Spiegel() obj = site2.Spiegel()
elif site == '': elif site == '':
session.attributes["lastCall"] = "news" session.attributes["lastCall"] = "news"
@ -96,25 +94,25 @@ def search_answer(number):
site = session.attributes["siteName"] site = session.attributes["siteName"]
except: except:
site = None site = None
print(number)
if site == "golem": if site == "golem":
obj = site2.Golem() obj = site2.Golem()
elif site == "spiegel": elif site.lower() == "spiegel":
obj = site2.Spiegel() obj = site2.Spiegel()
links = session.attributes["lastSearch"] links = session.attributes["lastSearch"]
if "http" not in str(links):
newLinks = [] newLinks = []
for link in links: for link in links:
if "http" not in link: if "http" not in link:
newLinks.append(obj.baseURL + link) newLinks.append(obj.baseURL + link)
links = newLinks links = newLinks
art = obj.read_headlines(links[int(number)-1]) art = obj.read_headlines(links[int(number)-1])
response = "" response = ""
for element in art: for element in art:
response += element + " " response += element
print(links)
session.attributes["lastCall"] = "search2" session.attributes["lastCall"] = "search2"
return statement(response) return statement(response)

View File

@ -59,24 +59,24 @@ class Golem(Site):
siteName = "golem" siteName = "golem"
baseURL = "https://www.golem.de/" baseURL = "https://www.golem.de/"
searchURLString = "https://suche.golem.de/search.php?l=10&q=" searchURLString = "https://suche.golem.de/search.php?l=10&q="
xPath = dict()
Site.xPath["searchArticle"] = '//span[@class="dh2 head2"]/text()' xPath["searchArticle"] = '//span[@class="dh2 head2"]/text()'
Site.xPath["searchLinks"] = '//ol[@class="list-articles"]/li/header//@href' xPath["searchLinks"] = '//ol[@class="list-articles"]/li/header//@href'
Site.xPath["newsArticle"] = '//li//h2/text()' xPath["newsArticle"] = '//li//h2/text()'
Site.xPath["newsLinks"] = '//header[@class="cluster-header"]//@href' xPath["newsLinks"] = '//div[@class="g g4"]//header//@href'
Site.xPath["readHeadlineTitle"] = '//header/h1/span[@class="dh1 head5"]/text()' xPath["readHeadlineTitle"] = '//header/h1/span[@class="dh1 head5"]/text()'
Site.xPath["readHeadlineText"] = '//header/p/text()' xPath["readHeadlineText"] = '//header/p/text()'
Site.xPath["readArticleText"] = '//div[@class="formatted"]/p/text()' xPath["readArticleText"] = '//div[@class="formatted"]/p/text()'
class Spiegel(Site): class Spiegel(Site):
siteName = "spiegel" siteName = "spiegel"
baseURL = "https://www.spiegel.de/" baseURL = "https://www.spiegel.de/"
searchURLString = "https://www.spiegel.de/suche/?suchbegriff=" searchURLString = "https://www.spiegel.de/suche/?suchbegriff="
xPath = dict()
Site.xPath["searchArticle"] = '//span[@class="dh2 head2"]/text()' xPath["searchArticle"] = '//div[@class="search-teaser"]/p/text()'
Site.xPath["searchLinks"] = '//ol[@class="list-articles"]/li/header//@href' xPath["searchLinks"] = '//div[@class="search-teaser"]/p//@href'
Site.xPath["newsArticle"] = '//div[@class="column-wide pano_xxl"]//div[@class="teaser"]//h2[@class="article-title"]//span[@class="headline"]/text()' xPath["newsArticle"] = '//div[@class="column-wide pano_xxl"]//div[@class="teaser"]//h2[@class="article-title"]//span[@class="headline"]/text()'
Site.xPath["newsLinks"] = '//div[@class="column-wide pano_xxl"]//div[@class="teaser"]//h2[@class="article-title"]//@href' xPath["newsLinks"] = '//div[@class="column-wide pano_xxl"]//div[@class="teaser"]//h2[@class="article-title"]//@href'
Site.xPath["readHeadlineTitle"] = '//div[@class="column-both"]//span[@class="headline"]//text()' xPath["readHeadlineTitle"] = '//div[@class="column-both"]//span[@class="headline"]//text()'
Site.xPath["readHeadlineText"] = '//div[@class="column-both"]/p/strong/text()' xPath["readHeadlineText"] = '//div[@class="column-both"]/p/strong/text()'
Site.xPath["readArticleText"] = '//div[@class="formatted"]/p/text()' xPath["readArticleText"] = '//div[@class="article-section clearfix"]/p/text()'

View File

@ -1,12 +1,5 @@
import urllib.request,urllib.parse,urllib.error import siteobj as site2
from lxml import html
import requests
import re
searchURL = "https://suche.golem.de/search.php?l=10&q=gaming"
site = requests.get(searchURL)
tree = html.fromstring(site.content)
articles = tree.xpath('//span[@class="dh2 head2"]/text()') obj = site2.Golem()
links = tree.xpath('//ol[@class="list-articles"]/li/header//@href') news = obj.get_news()
print(len(articles), len(links))