added sentiment, removed docker stuff for now
commit 78054e59d0
parent 44879f10e7
@@ -0,0 +1,17 @@
+FROM nginx:1.7
+
+# Copy in conf files
+COPY nginx.conf /etc/nginx/nginx.conf
+COPY mime.types /etc/nginx/mime.types
+COPY ssl.conf /etc/nginx/
+COPY site.conf /etc/nginx/sites-enabled/
+
+# Copy in certs
+COPY ssl.crt /etc/nginx/ssl.crt
+COPY ssl.key /etc/nginx/ssl.key
+
+# Expose both the HTTP (80) and HTTPS (443) ports
+EXPOSE 80 443
+
+CMD ["nginx"]
@@ -0,0 +1,15 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python: Current File",
+            "type": "python",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal"
+        }
+    ]
+}
@@ -1,13 +0,0 @@
-FROM tiangolo/uwsgi-nginx-flask:python3.5
-
-RUN python -m pip install pip==9.0.3
-RUN apt-get update
-RUN apt-get install -y gcc libevent-dev python-dev
-
-COPY ./requirements.txt /
-COPY ./ /app
-
-COPY ./nginx.conf /etc/nginx/nginx.conf
-#COPY /app/certificate.pem /etc/nginx/certs
-RUN pip install -r /requirements.txt
-
(6 file diffs suppressed because they are too large; 1 binary file not shown)
@@ -1,9 +1,7 @@
 import logging
-from OpenSSL import SSL
 import os
 from flask import Flask
 from flask_ask import Ask, request, session, question, statement
-import random
 import yaml
 import siteobj as site2
 import util
@@ -158,4 +156,4 @@ if __name__ == '__main__':
     key = os.path.join(os.path.dirname(__file__), 'privkey.pem')
     context = (cer, key)
 
-    app.run(host='127.0.0.1',port=443,ssl_context=context)
+    app.run(host='127.0.0.1',port=443)
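Review note (not part of the commit): the second hunk drops the ssl_context argument, so the Flask dev server now answers plain HTTP on port 443 while TLS termination presumably moves elsewhere. Should direct TLS come back later, app.run accepts the same (certfile, keyfile) tuple the removed line passed; a minimal sketch, reusing the cer/key paths built just above:

    # Sketch only: serve TLS straight from the dev server again.
    # ssl_context takes a (certfile, keyfile) tuple, exactly what `context` was.
    app.run(host='127.0.0.1', port=443, ssl_context=(cer, key))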
@@ -1 +0,0 @@
-openssl req -newkey rsa:2048 -nodes -keyout privkey.pem -x509 -days 365 -out certificate.pem -subj "/C=US/ST=NRW/L=Earth/O=CompanyName/OU=IT/CN=alexa.jopa.dev"
@@ -1,61 +0,0 @@
-
-user  nginx;
-worker_processes  1;
-
-error_log  /var/log/nginx/error.log warn;
-pid        /var/run/nginx.pid;
-
-
-events {
-    worker_connections  1024;
-}
-
-
-http {
-    include       /etc/nginx/mime.types;
-    default_type  application/octet-stream;
-
-    log_format  main  '$remote_addr - $remote_user [$time_local] "$request" '
-                      '$status $body_bytes_sent "$http_referer" '
-                      '"$http_user_agent" "$http_x_forwarded_for"';
-
-    access_log  /var/log/nginx/access.log  main;
-
-    sendfile        on;
-    #tcp_nopush     on;
-
-    keepalive_timeout  65;
-
-    #gzip  on;
-
-    server {
-        # enables SSLv3/TLSv1, but not SSLv2 which is weak and should no longer be used.
-        ssl_protocols SSLv3 TLSv1;
-
-        # disables all weak ciphers
-        ssl_ciphers ALL:!aNULL:!ADH:!eNULL:!LOW:!EXP:RC4+RSA:+HIGH:+MEDIUM;
-
-        server_name alexa.jopa.dev jopa.dev;
-
-        ## Access and error logs.
-        access_log /var/log/nginx/access.log;
-        error_log /var/log/nginx/error.log info;
-
-        ## Keep alive timeout set to a greater value for SSL/TLS.
-        keepalive_timeout 75 75;
-
-        ## See the keepalive_timeout directive in nginx.conf.
-        ## Server certificate and key.
-        ssl on;
-        ssl_certificate /app/cert/certificate.pem;
-        ssl_certificate_key /app/cert/privkey.pem;
-        ssl_session_timeout 5m;
-
-        ## Strict Transport Security header for enhanced security. See
-        ## http://www.chromium.org/sts. I've set it to 2 hours; set it to
-        ## whichever age you want.
-        add_header Strict-Transport-Security "max-age=7200";
-
-    }
-    include /etc/nginx/conf.d/*.conf;
-}
-
@@ -1,3 +0,0 @@
-requests
-flask-ask
-lxml
@@ -0,0 +1,87 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Sentiment lexicon: http://www.ulliwaltinger.de/sentiment/
+# Stopword list: https://github.com/solariz/german_stopwords
+import nltk
+import copy
+import encodings
+import csv
+from siteobj import *
+from nltk.corpus import treebank
+
+negatives = dict()
+positives = dict()
+neutrals = dict()
+
+# Each GermanPolarityClues row carries a "pos/neg/neutral" probability triple in
+# column 4; rows whose triple contains "-" have no numeric scores and are skipped.
+with open("./reader/GermanPolarityClues-2012/GermanPolarityClues-Negative.tsv", "r", encoding="utf-8") as tsvfile:
+    reader = csv.reader(tsvfile, delimiter='\t')
+    for row in reader:
+        if "-" not in row[4].split("/"):
+            negatives[row[0]] = [float(row[4].split("/")[0]), float(row[4].split("/")[1]), float(row[4].split("/")[2])]
+
+with open("./reader/GermanPolarityClues-2012/GermanPolarityClues-Neutral.tsv", "r", encoding="utf-8") as tsvfile:
+    reader = csv.reader(tsvfile, delimiter='\t')
+    for row in reader:
+        if "-" not in row[4].split("/"):
+            neutrals[row[0]] = [float(row[4].split("/")[0]), float(row[4].split("/")[1]), float(row[4].split("/")[2])]
+
+with open("./reader/GermanPolarityClues-2012/GermanPolarityClues-Positive.tsv", "r", encoding="utf-8") as tsvfile:
+    reader = csv.reader(tsvfile, delimiter='\t')
+    for row in reader:
+        if "-" not in row[4].split("/"):
+            positives[row[0]] = [float(row[4].split("/")[0]), float(row[4].split("/")[1]), float(row[4].split("/")[2])]
+
+# get stopwords
+stopwords = []
+with open("./reader/stopwords.txt", 'r', encoding='utf-8') as f:
+    for line in f:
+        stopwords.append(line.strip())  # strip the newline so tokens can match
+
+extraSW = [".", ",", "´´", "``", "'", '"']
+stopwords += extraSW
+
+obj = Spiegel()
+
+NewsText = obj.read_article("https://www.spiegel.de/netzwelt/games/labo-vr-set-von-nintendo-im-test-erst-basteln-dann-staunen-a-1265633.html")
+
+newText = ""
+for text in NewsText:
+    newText += text
+
+tokens = nltk.word_tokenize(newText)
+
+# drop stopword tokens
+toDelete = []
+for token in tokens:
+    if token in stopwords:
+        toDelete.append(token)
+
+for token in toDelete:
+    while token in tokens:
+        tokens.remove(token)
+
+# accumulate positive / negative / neutral scores over the remaining tokens
+p = 0
+ne = 0
+nu = 0
+for token in tokens:
+    if token in negatives:
+        p += negatives[token][0]
+        ne += negatives[token][1]
+        nu += negatives[token][2]
+    elif token in positives:
+        p += positives[token][0]
+        ne += positives[token][1]
+        nu += positives[token][2]
+    elif token in neutrals:
+        p += neutrals[token][0]
+        ne += neutrals[token][1]
+        nu += neutrals[token][2]
+
+# normalize so the three scores sum to 1
+total = p + ne + nu
+
+p /= total
+nu /= total
+ne /= total
+
+print(p, nu, ne)
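Review note (not part of the commit): the scoring loop above sums the three GermanPolarityClues probabilities (which the script reads as positive/negative/neutral, in that order per lexicon entry) over every token found in one of the lexicons, then normalizes so the scores add up to 1. A minimal sketch of the same logic as a reusable function; the names score_text and lexicons are illustrative, not from the commit:

    # Sketch only: the elif chain above, folded into one function.
    def score_text(tokens, lexicons):
        """Accumulate pos/neg/neutral scores over all tokens found in any lexicon."""
        p = ne = nu = 0.0
        for token in tokens:
            for lexicon in lexicons:      # e.g. (negatives, positives, neutrals)
                if token in lexicon:
                    p += lexicon[token][0]
                    ne += lexicon[token][1]
                    nu += lexicon[token][2]
                    break                 # first matching lexicon wins, like the elif chain
        total = p + ne + nu
        if total == 0:                    # guard the division the script performs unchecked
            return 0.0, 0.0, 0.0
        return p / total, nu / total, ne / total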
@@ -1,8 +1,6 @@
 import urllib.request,urllib.parse,urllib.error
 from lxml import html
 import requests
-import re
-
 
 class Site:
     siteName = ""
(1 file diff suppressed because it is too large)
@@ -1,5 +0,0 @@
-import siteobj as site2
-
-
-obj = site2.Golem()
-news = obj.get_news()
@@ -1,135 +0,0 @@
-import urllib.request,urllib.parse,urllib.error
-from urllib.error import HTTPError
-from urllib.error import URLError
-from urllib.parse import urljoin
-import requests
-import re
-
-import networkx as nx
-import matplotlib.pyplot as plt
-
-class url:
-
-    url = ""                # the url of the website to be checked
-    sites = dict()          # dict of all sites and the urls found on them
-    does_work = []          # list of all previously positively tested urls
-    does_not_work = dict()  # dict of all non-working urls and the site that linked there
-    header_values = {
-        'Connection:' : 'Keep-alive',
-        'name' : 'Michael Foord',
-        'location' : 'Northampton',
-        'language' : 'English',
-        'User-Agent': 'Mozilla 4/0'}
-
-    def __init__(self, url):
-        self.url = urllib.request.urlopen(url).geturl()
-
-
-    def make_url(self, link, start):
-        ret_link = urljoin(start, link)
-
-        return ret_link
-
-    def test_url(self, link, root):
-
-        if link in self.sites or link in self.does_work:
-            return True
-        elif link in self.does_not_work:
-            return False
-        else:
-            try:
-
-                header = urllib.parse.urlencode(self.header_values)
-                header = header.encode('ascii')
-                request = urllib.request.Request(link, header)
-                response = urllib.request.urlopen(request)
-                self.does_work.append(link)
-                print(" works " + link)
-                return True
-
-            except (urllib.error.HTTPError, urllib.error.URLError, ValueError):
-                self.does_not_work[link] = root
-                print(" doesn't work " + link)
-                return False
-
-    def get_actual_urls(self, links, root):
-        temp_links = []
-        for each_link in links:
-
-            if each_link.startswith("http") | each_link.startswith("//"):
-                temp_links.append(each_link)
-            else:
-                temp_links.append(urljoin(root, each_link))
-
-        for each_temp_link in temp_links:
-            self.test_url(each_temp_link, root)
-
-        return temp_links
-
-    def run_check(self, root=None):  # root is the url of the current site
-
-        if root == None:
-            root = self.url
-        else:
-            pass
-
-        if root in self.sites or self.url.rsplit('/', 1)[0] not in root or not self.test_url(root, root):
-            return
-
-        header = urllib.parse.urlencode(self.header_values)
-        header = header.encode('ascii')
-
-        request = urllib.request.Request(root, header)
-        http_response = urllib.request.urlopen(request)
-        root = http_response.geturl()
-        response_data = http_response.read()
-
-
-        links = re.findall(r'href="(.*?)"', str(response_data))
-
-        links = self.get_actual_urls(links, root)
-
-
-        self.sites[root] = links
-        for each_link in links:
-            self.run_check(each_link)
-
-
-    def graph(self):
-
-        G = nx.Graph(self.sites)
-
-        label_dict = {}
-        for key, value in self.sites.items():  # that's not how it works... todo: later
-            label_dict[key] = self.remove_root(value)
-
-        nx.draw(G, with_labels=True, font_size=8, node_size=1000, node_color="skyblue", edge_color='#A0FFA2', pos=nx.spring_layout(G))
-
-        plt.show()
-
-    def remove_root(self, links):
-        ret_links = []
-        for link in links:
-            ret_links.append(link.rsplit('.', 1)[0])
-
-        return ret_links
-
-    def clean(self):
-        self.sites.clear()
-        self.does_not_work.clear()
-        self.does_work.clear()
-
-
@@ -0,0 +1,7 @@
+germalemma
+flask
+flask-ask
+nltk
+lxml
+urllib
+yaml
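Review note (not part of the commit): two of these names don't match their PyPI packages, so this file as committed would likely fail to pip-install. urllib is part of the Python 3 standard library and needs no entry, and yaml is published on PyPI as PyYAML. A hypothetical corrected version, assuming PyYAML is the intended package:

    # requirements.txt sketch; urllib dropped (stdlib on Python 3), yaml -> PyYAML
    germalemma
    flask
    flask-ask
    nltk
    lxml
    PyYAML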