added Dockerfile and tested with updated packages

This commit is contained in:
Askill 2024-07-13 15:11:29 +02:00
parent 9c968fe66c
commit 28df77ea2c
5 changed files with 48 additions and 6 deletions

14
Dockerfile Normal file
View File

@ -0,0 +1,14 @@
# Runs prod.py on a slim Python base image.
# The tag is pinned to a minor version so rebuilds do not silently move to a
# newer interpreter release than the one the pinned requirements were tested on.
FROM python:3.12-slim
WORKDIR /optar

# Copy and install requirements first so this layer stays cached in following
# builds until requirements.txt itself changes.
COPY requirements.txt requirements.txt
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir -r requirements.txt

# Application entry script, source package, and the data files copied next to it.
COPY prod.py prod.py
COPY src ./src
COPY cache ./cache
COPY keywords.txt keywords.txt
COPY sites.txt sites.txt

# NOTE(review): the container runs as root; consider adding a non-root USER
# once it is confirmed which paths prod.py writes to (presumably ./cache).
ENTRYPOINT ["python", "prod.py"]

View File

@ -1 +1,25 @@
{"https://www.patricematz.de/": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"], "https://www.patricematz.de/photowall/demo/": ["javascript:void(0)"], "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [], "https://www.patricematz.de/images/bachelor.pdf": [], "https://www.patricematz.de/images/praktikum.pdf": []}
{
"https://www.patricematz.de/": [
"https://www.patricematz.de/",
"https://www.linkedin.com/in/patrice-matz-b73b6814a/",
"https://github.com/Askill",
"https://www.patricematz.de/images/praktikum.pdf",
"https://www.patricematz.de/images/bachelor.pdf",
"https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf",
"https://irs.projects.patricematz.de",
"https://github.com/Askill/Inverse-Rezeptsuche",
"https://irs.projects.patricematz.de/",
"https://github.com/Askill/Video-Synopsis",
"https://github.com/Askill/UI",
"https://github.com/Askill/Photo-Wall",
"https://www.patricematz.de/photowall/demo/",
"https://github.com/Askill/Flask-URL-Checker",
"https://patricematz.de/starmapper.htm"
],
"https://www.patricematz.de/photowall/demo/": [
"javascript:void(0)"
],
"https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [],
"https://www.patricematz.de/images/bachelor.pdf": [],
"https://www.patricematz.de/images/praktikum.pdf": []
}

View File

@ -1 +1,5 @@
deepdiff
deepdiff==7.0.1
lxml==5.2.2
requests==2.32.3
trafilatura==1.11.0
beautifulsoup4==4.12.3

View File

@ -1,2 +1 @@
https://www.patricematz.de/
https://www.heise.de/
https://www.patricematz.de/

View File

@ -5,7 +5,7 @@ import trafilatura
from requests.exceptions import MissingSchema
from bs4 import BeautifulSoup
# Most of this code is probably not mine; it appears to be adapted from a trafilatura demo.
class SiteReader:
def __init__(self):
pass
@ -37,7 +37,7 @@ class SiteReader:
'script',
'style', ]
# Then we will loop over every item in the extract text and make sure that the beautifulsoup4 tag
# Then we will loop over every item in the extracted text and make sure that the beautifulsoup4 tag
# is NOT in the blacklist
for item in text:
if item.parent.name not in blacklist:
@ -73,6 +73,7 @@ class SiteReader:
return None
def get_sites_content_dynamic(self, urls: List[str]):
'''not implemented'''
pass
def get_sites_content_static(self, urls: List[str]) -> Dict[str, str]: