mirror of https://github.com/Askill/optar.git
added Dockerfile and tested with updated packages
This commit is contained in:
parent
9c968fe66c
commit
28df77ea2c
|
|
@ -0,0 +1,14 @@
|
||||||
|
FROM python:slim
|
||||||
|
WORKDIR /optar
|
||||||
|
|
||||||
|
# Copy requirements.txt and install the dependencies first, so this layer stays cached and later builds that only change the source skip the install
|
||||||
|
COPY requirements.txt requirements.txt
|
||||||
|
RUN pip install -r requirements.txt
|
||||||
|
|
||||||
|
COPY prod.py prod.py
|
||||||
|
COPY src ./src
|
||||||
|
COPY cache ./cache
|
||||||
|
COPY keywords.txt keywords.txt
|
||||||
|
COPY sites.txt sites.txt
|
||||||
|
|
||||||
|
ENTRYPOINT [ "python", "prod.py" ]
|
||||||
|
|
@ -1 +1,25 @@
|
||||||
{"https://www.patricematz.de/": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"], "https://www.patricematz.de/photowall/demo/": ["javascript:void(0)"], "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [], "https://www.patricematz.de/images/bachelor.pdf": [], "https://www.patricematz.de/images/praktikum.pdf": []}
|
{
|
||||||
|
"https://www.patricematz.de/": [
|
||||||
|
"https://www.patricematz.de/",
|
||||||
|
"https://www.linkedin.com/in/patrice-matz-b73b6814a/",
|
||||||
|
"https://github.com/Askill",
|
||||||
|
"https://www.patricematz.de/images/praktikum.pdf",
|
||||||
|
"https://www.patricematz.de/images/bachelor.pdf",
|
||||||
|
"https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf",
|
||||||
|
"https://irs.projects.patricematz.de",
|
||||||
|
"https://github.com/Askill/Inverse-Rezeptsuche",
|
||||||
|
"https://irs.projects.patricematz.de/",
|
||||||
|
"https://github.com/Askill/Video-Synopsis",
|
||||||
|
"https://github.com/Askill/UI",
|
||||||
|
"https://github.com/Askill/Photo-Wall",
|
||||||
|
"https://www.patricematz.de/photowall/demo/",
|
||||||
|
"https://github.com/Askill/Flask-URL-Checker",
|
||||||
|
"https://patricematz.de/starmapper.htm"
|
||||||
|
],
|
||||||
|
"https://www.patricematz.de/photowall/demo/": [
|
||||||
|
"javascript:void(0)"
|
||||||
|
],
|
||||||
|
"https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [],
|
||||||
|
"https://www.patricematz.de/images/bachelor.pdf": [],
|
||||||
|
"https://www.patricematz.de/images/praktikum.pdf": []
|
||||||
|
}
|
||||||
|
|
@ -1 +1,5 @@
|
||||||
deepdiff
|
deepdiff==7.0.1
|
||||||
|
lxml==5.2.2
|
||||||
|
requests==2.32.3
|
||||||
|
trafilatura==1.11.0
|
||||||
|
beautifulsoup4==4.12.3
|
||||||
|
|
@ -1,2 +1 @@
|
||||||
https://www.patricematz.de/
|
https://www.patricematz.de/
|
||||||
https://www.heise.de/
|
|
||||||
|
|
@ -5,7 +5,7 @@ import trafilatura
|
||||||
from requests.exceptions import MissingSchema
|
from requests.exceptions import MissingSchema
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
# Most of this code is probably not my own; it appears to be adapted from a trafilatura demo
|
||||||
class SiteReader:
|
class SiteReader:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
pass
|
pass
|
||||||
|
|
@ -37,7 +37,7 @@ class SiteReader:
|
||||||
'script',
|
'script',
|
||||||
'style', ]
|
'style', ]
|
||||||
|
|
||||||
# Then we will loop over every item in the extract text and make sure that the beautifulsoup4 tag
|
# Then we will loop over every item in the extracted text and make sure that the beautifulsoup4 tag
|
||||||
# is NOT in the blacklist
|
# is NOT in the blacklist
|
||||||
for item in text:
|
for item in text:
|
||||||
if item.parent.name not in blacklist:
|
if item.parent.name not in blacklist:
|
||||||
|
|
@ -73,6 +73,7 @@ class SiteReader:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def get_sites_content_dynamic(self, urls: List[str]):
|
def get_sites_content_dynamic(self, urls: List[str]):
|
||||||
|
'''not implemented'''
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def get_sites_content_static(self, urls: List[str]) -> Dict[str, str]:
|
def get_sites_content_static(self, urls: List[str]) -> Dict[str, str]:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue