added core func for Golem
commit 401d3bd8ba (parent 517db5be29)
(two image files moved unchanged: 39 KiB and 3.4 KiB)
@@ -0,0 +1,61 @@
import urllib.request
from lxml import html
import requests


class site:
    # Base class for news sites; subclasses override the scraping methods.
    url = ""
    header_values = {
        'Connection': 'Keep-alive',
        'name': 'Michael Foord',
        'location': 'Northampton',
        'language': 'German',
        'User-Agent': 'Mozilla 4/0'}

    def __init__(self, url):
        # Follow any redirects so self.url holds the canonical address.
        self.url = urllib.request.urlopen(url).geturl()

    def search_article(self, topic):
        return False

    def get_news(self):
        return False

    def read_article(self, url):
        return False

    def read_headlines(self, url):
        return False


class Golem(site):

    def search_article(self, topic):
        # Query golem.de's search and return the matching headlines.
        searchURL = "https://suche.golem.de/search.php?l=10&q=" + topic.replace(" ", "+")
        response = requests.get(searchURL, headers=self.header_values)
        tree = html.fromstring(response.content)

        articles = tree.xpath('//span[@class="dh2 head2"]/text()')
        return articles

    def get_news(self):
        # Scrape the golem.de front page for the current headlines.
        searchURL = "https://www.golem.de/"
        response = requests.get(searchURL, headers=self.header_values)
        tree = html.fromstring(response.content)

        articles = tree.xpath('//h2[@class="head2"]/text()')
        return articles

    def read_headlines(self, url):
        # Return the headline and teaser of a single article page.
        response = requests.get(url, headers=self.header_values)
        tree = html.fromstring(response.content)

        title = tree.xpath('//header/h1/span[@class="dh1 head5"]/text()')
        title += tree.xpath('//header/p/text()')
        return title

    def read_article(self, url):
        # Return the headline, teaser and body paragraphs of an article.
        response = requests.get(url, headers=self.header_values)
        tree = html.fromstring(response.content)

        title = self.read_headlines(url)
        title += tree.xpath('//div[@class="formatted"]/p/text()')
        return title
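A minimal usage sketch for the scraper above (the search topic is an arbitrary example, and the XPath selectors only work while golem.de keeps its current markup):

if __name__ == "__main__":
    golem = Golem("https://www.golem.de/")
    print(golem.get_news())                # current front-page headlines
    print(golem.search_article("Linux"))   # headlines matching an example topic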
@@ -0,0 +1,10 @@
from lxml import html
import requests

# Quick smoke test: fetch a single article and dump its body paragraphs.
url = "https://www.golem.de/news/tchap-forscher-gelingt-anmeldung-im-regierungschat-frankreichs-1904-140799.html"
site = requests.get(url)
tree = html.fromstring(site.content)
title = tree.xpath('//div[@class="formatted"]/p/text()')
print(title)
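Since xpath() returns a list of paragraph fragments, joining them prints more readably (a cosmetic variant, not part of the commit):

print("\n".join(title))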
@@ -0,0 +1,135 @@
import urllib.request
import urllib.error
from urllib.parse import urljoin
import re

import networkx as nx
import matplotlib.pyplot as plt

class url:
    # Crawls a site recursively, recording which of its links work.

    header_values = {
        'Connection': 'Keep-alive',
        'name': 'Michael Foord',
        'location': 'Northampton',
        'language': 'English',
        'User-Agent': 'Mozilla 4/0'}

    def __init__(self, url):
        # Follow any redirects so self.url holds the canonical address.
        self.url = urllib.request.urlopen(url).geturl()
        self.sites = dict()          # maps every crawled page to the urls found on it
        self.does_work = []          # urls that already tested as reachable
        self.does_not_work = dict()  # maps every broken url to the page that linked there

    def make_url(self, link, start):
        # Resolve a possibly relative link against the page it appeared on.
        return urljoin(start, link)

    def test_url(self, link, root):
        # Check whether a link is reachable; results are cached across calls.
        if link in self.sites or link in self.does_work:
            return True
        elif link in self.does_not_work:
            return False
        else:
            try:
                # Send the browser-like headers along with a plain GET request.
                request = urllib.request.Request(link, headers=self.header_values)
                urllib.request.urlopen(request)
                self.does_work.append(link)
                print(" works " + link)
                return True
            except (urllib.error.HTTPError, urllib.error.URLError, ValueError):
                # ValueError covers malformed urls without a scheme.
                self.does_not_work[link] = root
                print(" doesn't work " + link)
                return False

    def get_actual_urls(self, links, root):
        # Turn raw href values into absolute urls and test each one.
        temp_links = []
        for each_link in links:
            if each_link.startswith("http") or each_link.startswith("//"):
                temp_links.append(each_link)
            else:
                temp_links.append(urljoin(root, each_link))

        for each_temp_link in temp_links:
            self.test_url(each_temp_link, root)

        return temp_links

    def run_check(self, root=None):  # root is the url of the current site
        # Crawl recursively from self.url, staying inside the start domain.
        if root is None:
            root = self.url

        # Stop at pages already crawled, external pages, and dead links.
        if root in self.sites or self.url.rsplit('/', 1)[0] not in root or not self.test_url(root, root):
            return

        request = urllib.request.Request(root, headers=self.header_values)
        http_response = urllib.request.urlopen(request)
        root = http_response.geturl()
        response_data = http_response.read()

        # Collect every href on the page and normalise it to an absolute url.
        links = re.findall(r'href="(.*?)"', str(response_data))
        links = self.get_actual_urls(links, root)

        self.sites[root] = links
        for each_link in links:
            self.run_check(each_link)

    def graph(self):
        # Draw the crawled link structure as an undirected graph.
        G = nx.Graph(self.sites)

        label_dict = {}
        for key, value in self.sites.items():  # that's not how it works... todo: later
            label_dict[key] = self.remove_root(value)

        nx.draw(G, with_labels=True, font_size=8, node_size=1000,
                node_color="skyblue", edge_color='#A0FFA2', pos=nx.spring_layout(G))

        plt.show()

    def remove_root(self, links):
        # Shorten urls so they can serve as compact graph labels.
        ret_links = []
        for link in links:
            ret_links.append(link.rsplit('.', 1)[0])

        return ret_links

    def clean(self):
        # Reset all crawl state so the checker can be reused.
        self.sites.clear()
        self.does_not_work.clear()
        self.does_work.clear()
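A minimal driver sketch for the link checker (the start url is a hypothetical example; note that run_check recurses once per page, so very large sites can hit Python's recursion limit):

if __name__ == "__main__":
    checker = url("https://example.com/")  # hypothetical start page
    checker.run_check()                    # crawl and test every internal link
    print(checker.does_not_work)           # broken urls and the pages that linked to them
    checker.graph()                        # draw the site's link structure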