added sentiment, removed docker stuff for now

Patrice 2019-05-21 22:43:57 +02:00
parent 44879f10e7
commit 78054e59d0
20 changed files with 51654 additions and 223 deletions

View File

@ -0,0 +1,17 @@
FROM nginx:1.7
# Copy in conf files
COPY nginx.conf /etc/nginx/nginx.conf
COPY mime.types /etc/nginx/mime.types
COPY ssl.conf /etc/nginx/
COPY site.conf /etc/nginx/sites-enabled/
# Copy in certs
COPY ssl.crt /etc/nginx/ssl.crt
COPY ssl.key /etc/nginx/ssl.key
# Expose both the HTTP (80) and HTTPS (443) ports
EXPOSE 80 443
# Keep nginx in the foreground; a bare "nginx" daemonizes and the container exits immediately
CMD ["nginx", "-g", "daemon off;"]

15
reader/.vscode/launch.json vendored Normal file
View File

@ -0,0 +1,15 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python: Current File",
"type": "python",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal"
}
]
}

View File

@ -1,13 +0,0 @@
FROM tiangolo/uwsgi-nginx-flask:python3.5
RUN python -m pip install pip==9.0.3
RUN apt-get update
RUN apt-get install -y gcc libevent-dev python-dev
COPY ./requirements.txt /
COPY ./ /app
COPY ./nginx.conf /etc/nginx/nginx.conf
#COPY /app/certificate.pem /etc/nginx/certs
RUN pip install -r /requirements.txt

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -1,9 +1,7 @@
import logging
from OpenSSL import SSL
import os
from flask import Flask
from flask_ask import Ask, request, session, question, statement
import random
import yaml
import siteobj as site2
import util
@ -158,4 +156,4 @@ if __name__ == '__main__':
key = os.path.join(os.path.dirname(__file__), 'privkey.pem')
context = (cer, key)
app.run(host='127.0.0.1',port=443,ssl_context=context)
app.run(host='127.0.0.1',port=443)
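
For context on what the two hunks above remove: Flask's app.run accepts an ssl_context tuple of (certfile, keyfile), which is how the deleted lines served HTTPS directly from the app. A minimal sketch of the removed wiring, assuming the key is privkey.pem as shown above and the certificate is the certificate.pem produced by the openssl command removed just below:

import os
from flask import Flask

app = Flask(__name__)

if __name__ == '__main__':
    cer = os.path.join(os.path.dirname(__file__), 'certificate.pem')
    key = os.path.join(os.path.dirname(__file__), 'privkey.pem')
    # Flask hands the (certfile, keyfile) tuple to the underlying Werkzeug server.
    app.run(host='127.0.0.1', port=443, ssl_context=(cer, key))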

View File

@ -1 +0,0 @@
openssl req -newkey rsa:2048 -nodes -keyout privkey.pem -x509 -days 365 -out certificate.pem -subj "/C=US/ST=NRW/L=Earth/O=CompanyName/OU=IT/CN=alexa.jopa.dev"

View File

@ -1,61 +0,0 @@
user nginx;
worker_processes 1;
error_log /var/log/nginx/error.log warn;
pid /var/run/nginx.pid;
events {
worker_connections 1024;
}
http {
include /etc/nginx/mime.types;
default_type application/octet-stream;
log_format main '$remote_addr - $remote_user [$time_local] "$request" '
'$status $body_bytes_sent "$http_referer" '
'"$http_user_agent" "$http_x_forwarded_for"';
access_log /var/log/nginx/access.log main;
sendfile on;
#tcp_nopush on;
keepalive_timeout 65;
#gzip on;
server {
# enables SSLv3/TLSv1, but not SSLv2 which is weak and should no longer be used.
ssl_protocols SSLv3 TLSv1;
# disables all weak ciphers
ssl_ciphers ALL:!aNULL:!ADH:!eNULL:!LOW:!EXP:RC4+RSA:+HIGH:+MEDIUM;
server_name alexa.jopa.dev jopa.dev;
## Access and error logs.
access_log /var/log/nginx/access.log;
error_log /var/log/nginx/error.log info;
## Keep alive timeout set to a greater value for SSL/TLS.
keepalive_timeout 75 75;
## See the keepalive_timeout directive in nginx.conf.
## Server certificate and key.
ssl on;
ssl_certificate /app/cert/certificate.pem;
ssl_certificate_key /app/cert/privkey.pem;
ssl_session_timeout 5m;
## Strict Transport Security header for enhanced security. See
## http://www.chromium.org/sts. I've set it to 2 hours; set it to
## whichever age you want.
add_header Strict-Transport-Security "max-age=7200";
}
include /etc/nginx/conf.d/*.conf;
}

View File

@ -1,3 +0,0 @@
requests
flask-ask
lxml

87
reader/sentiment.py Normal file
View File

@ -0,0 +1,87 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Polarity lexicon: http://www.ulliwaltinger.de/sentiment/
# Stopword list: https://github.com/solariz/german_stopwords
import csv
import nltk  # needs the "punkt" tokenizer data: nltk.download('punkt')
from siteobj import *

def load_clues(path):
    # Load one GermanPolarityClues TSV into {word: [positive, negative, neutral]}.
    clues = dict()
    with open(path, "r", encoding="utf-8") as tsvfile:
        for row in csv.reader(tsvfile, delimiter='\t'):
            probs = row[4].split("/")
            if "-" not in probs:  # skip rows without numeric probabilities
                clues[row[0]] = [float(x) for x in probs]
    return clues

negatives = load_clues("./reader/GermanPolarityClues-2012/GermanPolarityClues-Negative.tsv")
neutrals = load_clues("./reader/GermanPolarityClues-2012/GermanPolarityClues-Neutral.tsv")
positives = load_clues("./reader/GermanPolarityClues-2012/GermanPolarityClues-Positive.tsv")

# Get stopwords; strip the trailing newline, otherwise no token ever matches.
stopwords = set()
with open("./reader/stopwords.txt", 'r', encoding='utf-8') as f:
    for line in f:
        stopwords.add(line.strip())
stopwords.update([".", ",", "´´", "``", "'", '"'])

obj = Spiegel()
NewsText = obj.read_article("https://www.spiegel.de/netzwelt/games/labo-vr-set-von-nintendo-im-test-erst-basteln-dann-staunen-a-1265633.html")
newText = "".join(NewsText)
tokens = [token for token in nltk.word_tokenize(newText) if token not in stopwords]

p = 0.0   # positive mass
ne = 0.0  # negative mass
nu = 0.0  # neutral mass
for token in tokens:
    if token in negatives:
        p += negatives[token][0]
        ne += negatives[token][1]
        nu += negatives[token][2]
    elif token in positives:
        p += positives[token][0]
        ne += positives[token][1]
        nu += positives[token][2]
    elif token in neutrals:
        p += neutrals[token][0]
        ne += neutrals[token][1]
        nu += neutrals[token][2]

# Normalize to a distribution; guard against articles with no lexicon hits.
total = p + ne + nu
if total > 0:
    p /= total
    nu /= total
    ne /= total
print(p, nu, ne)
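
The new script runs top to bottom against one hard-coded Spiegel article. For reuse, the same scoring logic could be wrapped in a function; a minimal sketch, where the name score_text and its signature are hypothetical and not part of this commit:

def score_text(text, lexicons, stopwords):
    # lexicons: dicts of {word: [positive, negative, neutral]}, checked in order;
    # the first lexicon containing the token wins, mirroring the elif chain above.
    tokens = [t for t in nltk.word_tokenize(text) if t not in stopwords]
    p = ne = nu = 0.0
    for token in tokens:
        for lexicon in lexicons:
            if token in lexicon:
                p += lexicon[token][0]
                ne += lexicon[token][1]
                nu += lexicon[token][2]
                break
    total = p + ne + nu
    return (p / total, nu / total, ne / total) if total else (0.0, 0.0, 0.0)

# Usage: score_text(newText, [negatives, positives, neutrals], stopwords)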

View File

@ -1,8 +1,6 @@
import urllib.request,urllib.parse,urllib.error
from lxml import html
import requests
import re
class Site:
siteName = ""

1855
reader/stopwords.txt Normal file

File diff suppressed because it is too large

View File

@ -1,5 +0,0 @@
import siteobj as site2
obj = site2.Golem()
news = obj.get_news()

View File

@ -1,135 +0,0 @@
import urllib.request,urllib.parse,urllib.error
from urllib.error import HTTPError
from urllib.error import URLError
from urllib.parse import urljoin
import requests
import re
import networkx as nx
import matplotlib.pyplot as plt
class url:
url = "" # the URL of the website to be checked
sites = dict() # dict of all sites and the URLs found on them
does_work = [] # list of all previously verified URLs
does_not_work = dict() # dict of all broken URLs and the site that linked there
header_values = {
'Connection:' : 'Keep-alive',
'name' : 'Michael Foord',
'location' : 'Northampton',
'language' : 'English',
'User-Agent': 'Mozilla 4/0'}
def __init__(self, url):
self.url = urllib.request.urlopen(url).geturl()
def make_url(self, link, start):
ret_link = urljoin(start, link)
return ret_link
def test_url(self, link, root):
if link in self.sites or link in self.does_work:
return True
elif link in self.does_not_work:
return False
else:
try:
header = urllib.parse.urlencode(self.header_values)
header=header.encode('ascii')
request = urllib.request.Request(link, header)
response = urllib.request.urlopen(request)
self.does_work.append(link)
print(" works " + link)
return True
except (urllib.error.HTTPError, urllib.error.URLError, ValueError):
self.does_not_work[link]=root
print(" doesn't work " + link)
return False
def get_actual_urls(self, links, root):
temp_links = []
for each_link in links:
if each_link.startswith("http") | each_link.startswith("//"):
temp_links.append(each_link)
else:
temp_links.append(urljoin(root, each_link))
for each_temp_link in temp_links:
self.test_url(each_temp_link, root)
return temp_links
def run_check(self, root=None): # root is the url of the current Site
if root == None:
root = self.url
else:
pass
if root in self.sites or self.url.rsplit('/', 1)[0] not in root or not self.test_url(root, root):
return
header = urllib.parse.urlencode(self.header_values)
header=header.encode('ascii')
request = urllib.request.Request(root, header)
http_response = urllib.request.urlopen(request)
root = http_response.geturl()
response_data= http_response.read()
links = re.findall(r'href="(.*?)"' , str(response_data))
links = self.get_actual_urls(links, root)
self.sites[root]=links
for each_link in links:
self.run_check(each_link)
def graph(self):
G = nx.Graph(self.sites)
label_dict = {}
for key, value in self.sites.items(): #that's not how it works... todo: later
label_dict[key]=self.remove_root(value)
nx.draw(G, with_labels=True, font_size=8 , node_size=1000, node_color="skyblue", edge_color='#A0FFA2', pos=nx.spring_layout(G))
plt.show()
def remove_root(self, links):
ret_links = []
for link in links:
ret_links.append(link.rsplit('.', 1)[0])
return ret_links
def clean(self):
self.sites.clear()
self.does_not_work.clear()
self.does_work.clear()
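
One note on the checker removed above: urllib.request.Request(link, header) passes the url-encoded header string as the request body, because the second positional parameter of Request is data, not headers. Every probe therefore went out as a POST with default headers. A minimal corrected sketch:

import urllib.request

def check(link):
    headers = {'User-Agent': 'Mozilla 4/0'}
    # headers must be passed by keyword; a positional second argument is data,
    # which silently turns the request into a POST.
    request = urllib.request.Request(link, headers=headers)
    return urllib.request.urlopen(request)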

7
requirements Normal file
View File

@ -0,0 +1,7 @@
germalemma
flask
flask-ask
nltk
lxml
# urllib ships with Python's standard library; no install needed
pyyaml  # imported as "yaml", but the PyPI package is named PyYAML