From 28df77ea2c74d54505eff20d8752b7657cafecae Mon Sep 17 00:00:00 2001
From: Askill <ofjokg@gmail.com>
Date: Sat, 13 Jul 2024 15:11:29 +0200
Subject: [PATCH] added Dockerfile and tested with updated packages

---
 Dockerfile                                    | 14 ++++++++++
 .../2022-11-06_15-16-36.json                  | 26 ++++++++++++++++++-
 requirements.txt                              |  6 ++++-
 sites.txt                                     |  3 +--
 src/SiteReader.py                             |  5 ++--
 5 files changed, 48 insertions(+), 6 deletions(-)
 create mode 100644 Dockerfile

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..6034cbd
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,14 @@
+# Pin the interpreter: the floating "slim" tag follows the newest Python, which may lack wheels for the pinned requirements
+FROM python:3.12-slim
+WORKDIR /optar
+# Install dependencies before copying sources so this layer is reused until requirements.txt changes
+COPY requirements.txt requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY prod.py prod.py
+COPY src ./src
+COPY cache ./cache
+COPY keywords.txt keywords.txt
+COPY sites.txt sites.txt
+
+ENTRYPOINT [ "python", "prod.py" ]
\ No newline at end of file
diff --git a/cache/www.patricematz.de/2022-11-06_15-16-36.json b/cache/www.patricematz.de/2022-11-06_15-16-36.json
index 3cf2ba4..b5f4212 100644
--- a/cache/www.patricematz.de/2022-11-06_15-16-36.json
+++ b/cache/www.patricematz.de/2022-11-06_15-16-36.json
@@ -1 +1,25 @@
-{"https://www.patricematz.de/": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"], "https://www.patricematz.de/photowall/demo/": ["javascript:void(0)"], "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [], "https://www.patricematz.de/images/bachelor.pdf": [], "https://www.patricematz.de/images/praktikum.pdf": []}
\ No newline at end of file
+{
+    "https://www.patricematz.de/": [
+        "https://www.patricematz.de/",
+        "https://www.linkedin.com/in/patrice-matz-b73b6814a/",
+        "https://github.com/Askill",
+        "https://www.patricematz.de/images/praktikum.pdf",
+        "https://www.patricematz.de/images/bachelor.pdf",
+        "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf",
+        "https://irs.projects.patricematz.de",
+        "https://github.com/Askill/Inverse-Rezeptsuche",
+        "https://irs.projects.patricematz.de/",
+        "https://github.com/Askill/Video-Synopsis",
+        "https://github.com/Askill/UI",
+        "https://github.com/Askill/Photo-Wall",
+        "https://www.patricematz.de/photowall/demo/",
+        "https://github.com/Askill/Flask-URL-Checker",
+        "https://patricematz.de/starmapper.htm"
+    ],
+    "https://www.patricematz.de/photowall/demo/": [
+        "javascript:void(0)"
+    ],
+    "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [],
+    "https://www.patricematz.de/images/bachelor.pdf": [],
+    "https://www.patricematz.de/images/praktikum.pdf": []
+}
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 2f7a821..c9044db 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,5 @@
-deepdiff
\ No newline at end of file
+deepdiff==7.0.1
+lxml==5.2.2
+requests==2.32.3
+trafilatura==1.11.0
+beautifulsoup4==4.12.3
\ No newline at end of file
diff --git a/sites.txt b/sites.txt
index 2dc13ca..6b14489 100644
--- a/sites.txt
+++ b/sites.txt
@@ -1,2 +1 @@
-https://www.patricematz.de/
-https://www.heise.de/
\ No newline at end of file
+https://www.patricematz.de/
\ No newline at end of file
diff --git a/src/SiteReader.py b/src/SiteReader.py
index 24f03dc..9f6abae 100644
--- a/src/SiteReader.py
+++ b/src/SiteReader.py
@@ -5,7 +5,7 @@ import trafilatura
 from requests.exceptions import MissingSchema
 from bs4 import BeautifulSoup
 
-
+# NOTE: most of this code is probably not original work; it appears to be adapted from a trafilatura demo.
 class SiteReader:
     def __init__(self):
         pass
@@ -37,7 +37,7 @@ class SiteReader:
             'script',
             'style', ]
 
-        # Then we will loop over every item in the extract text and make sure that the beautifulsoup4 tag
+        # Then we will loop over every item in the extracted text and make sure that the beautifulsoup4 tag
         # is NOT in the blacklist
         for item in text:
             if item.parent.name not in blacklist:
@@ -73,6 +73,7 @@ class SiteReader:
                 return None
 
     def get_sites_content_dynamic(self, urls: List[str]):
+        """Not implemented yet: placeholder that does nothing and returns None."""
         pass
 
     def get_sites_content_static(self, urls: List[str]) -> Dict[str, str]: