From 28df77ea2c74d54505eff20d8752b7657cafecae Mon Sep 17 00:00:00 2001 From: Askill Date: Sat, 13 Jul 2024 15:11:29 +0200 Subject: [PATCH] added Dockerfile and tested with updated packages --- Dockerfile | 14 ++++++++++ .../2022-11-06_15-16-36.json | 26 ++++++++++++++++++- requirements.txt | 6 ++++- sites.txt | 3 +-- src/SiteReader.py | 5 ++-- 5 files changed, 48 insertions(+), 6 deletions(-) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..6034cbd --- /dev/null +++ b/Dockerfile @@ -0,0 +1,14 @@ +FROM python:slim +WORKDIR /optar + +# Copy and run requirements install first to save time in following builds +COPY requirements.txt requirements.txt +RUN pip install -r requirements.txt + +COPY prod.py prod.py +COPY src ./src +COPY cache ./cache +COPY keywords.txt keywords.txt +COPY sites.txt sites.txt + +ENTRYPOINT [ "python", "prod.py" ] \ No newline at end of file diff --git a/cache/www.patricematz.de/2022-11-06_15-16-36.json b/cache/www.patricematz.de/2022-11-06_15-16-36.json index 3cf2ba4..b5f4212 100644 --- a/cache/www.patricematz.de/2022-11-06_15-16-36.json +++ b/cache/www.patricematz.de/2022-11-06_15-16-36.json @@ -1 +1,25 @@ -{"https://www.patricematz.de/": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"], "https://www.patricematz.de/photowall/demo/": ["javascript:void(0)"], "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [], "https://www.patricematz.de/images/bachelor.pdf": [], "https://www.patricematz.de/images/praktikum.pdf": []} \ No newline at end of file +{ + "https://www.patricematz.de/": [ + "https://www.patricematz.de/", + "https://www.linkedin.com/in/patrice-matz-b73b6814a/", + "https://github.com/Askill", + "https://www.patricematz.de/images/praktikum.pdf", + "https://www.patricematz.de/images/bachelor.pdf", + "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", + "https://irs.projects.patricematz.de", + "https://github.com/Askill/Inverse-Rezeptsuche", + "https://irs.projects.patricematz.de/", + "https://github.com/Askill/Video-Synopsis", + "https://github.com/Askill/UI", + "https://github.com/Askill/Photo-Wall", + "https://www.patricematz.de/photowall/demo/", + "https://github.com/Askill/Flask-URL-Checker", + "https://patricematz.de/starmapper.htm" + ], + "https://www.patricematz.de/photowall/demo/": [ + "javascript:void(0)" + ], + "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [], + "https://www.patricematz.de/images/bachelor.pdf": [], + "https://www.patricematz.de/images/praktikum.pdf": [] +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 2f7a821..c9044db 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,5 @@ -deepdiff \ No newline at end of file +deepdiff==7.0.1 +lxml==5.2.2 +requests==2.32.3 +trafilatura==1.11.0 +beautifulsoup4==4.12.3 \ No newline at end of file diff --git a/sites.txt b/sites.txt index 2dc13ca..6b14489 100644 --- a/sites.txt +++ b/sites.txt @@ -1,2 +1 @@ -https://www.patricematz.de/ -https://www.heise.de/ \ No newline at end of file +https://www.patricematz.de/ \ No newline at end of file diff --git a/src/SiteReader.py b/src/SiteReader.py index 24f03dc..9f6abae 100644 --- a/src/SiteReader.py +++ b/src/SiteReader.py @@ -5,7 +5,7 @@ import trafilatura from requests.exceptions import MissingSchema from bs4 import BeautifulSoup - +# Pretty sure most of this code is not from me, but from a demo on trafilatura class SiteReader: def __init__(self): pass @@ -37,7 +37,7 @@ class SiteReader: 'script', 'style', ] - # Then we will loop over every item in the extract text and make sure that the beautifulsoup4 tag + # Then we will loop over every item in the extracted text and make sure that the beautifulsoup4 tag # is NOT in the blacklist for item in text: if item.parent.name not in blacklist: @@ -73,6 +73,7 @@ class SiteReader: return None def get_sites_content_dynamic(self, urls: List[str]): + '''not implemented''' pass def get_sites_content_static(self, urls: List[str]) -> Dict[str, str]: