mirror of https://github.com/Askill/optar.git
added Dockerfile and tested with updated packages
This commit is contained in:
parent
9c968fe66c
commit
28df77ea2c
|
|
@ -0,0 +1,14 @@
|
|||
# Pin the interpreter minor version instead of the floating `slim` tag, which
# tracks the newest Python release. requirements.txt pins lxml==5.2.2, which
# may have no prebuilt wheel for newer interpreters, and the slim image has no
# compiler to build it from source.
FROM python:3.12-slim
|
||||
WORKDIR /optar
|
||||
|
||||
# Copy requirements.txt and install the dependencies first, so this layer stays cached when only the application files change
|
||||
COPY requirements.txt requirements.txt
|
||||
# --no-cache-dir keeps pip's download cache out of the image layer
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
COPY prod.py prod.py
|
||||
COPY src ./src
|
||||
COPY cache ./cache
|
||||
COPY keywords.txt keywords.txt
|
||||
COPY sites.txt sites.txt
|
||||
|
||||
# Exec (JSON-array) form: python is started directly on prod.py, without a wrapping shell
ENTRYPOINT [ "python", "prod.py" ]
|
||||
|
|
@ -1 +1,25 @@
|
|||
{"https://www.patricematz.de/": ["https://www.patricematz.de/", "https://www.linkedin.com/in/patrice-matz-b73b6814a/", "https://github.com/Askill", "https://www.patricematz.de/images/praktikum.pdf", "https://www.patricematz.de/images/bachelor.pdf", "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf", "https://irs.projects.patricematz.de", "https://github.com/Askill/Inverse-Rezeptsuche", "https://irs.projects.patricematz.de/", "https://github.com/Askill/Video-Synopsis", "https://github.com/Askill/UI", "https://github.com/Askill/Photo-Wall", "https://www.patricematz.de/photowall/demo/", "https://github.com/Askill/Flask-URL-Checker", "https://patricematz.de/starmapper.htm"], "https://www.patricematz.de/photowall/demo/": ["javascript:void(0)"], "https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [], "https://www.patricematz.de/images/bachelor.pdf": [], "https://www.patricematz.de/images/praktikum.pdf": []}
|
||||
{
|
||||
"https://www.patricematz.de/": [
|
||||
"https://www.patricematz.de/",
|
||||
"https://www.linkedin.com/in/patrice-matz-b73b6814a/",
|
||||
"https://github.com/Askill",
|
||||
"https://www.patricematz.de/images/praktikum.pdf",
|
||||
"https://www.patricematz.de/images/bachelor.pdf",
|
||||
"https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf",
|
||||
"https://irs.projects.patricematz.de",
|
||||
"https://github.com/Askill/Inverse-Rezeptsuche",
|
||||
"https://irs.projects.patricematz.de/",
|
||||
"https://github.com/Askill/Video-Synopsis",
|
||||
"https://github.com/Askill/UI",
|
||||
"https://github.com/Askill/Photo-Wall",
|
||||
"https://www.patricematz.de/photowall/demo/",
|
||||
"https://github.com/Askill/Flask-URL-Checker",
|
||||
"https://patricematz.de/starmapper.htm"
|
||||
],
|
||||
"https://www.patricematz.de/photowall/demo/": [
|
||||
"javascript:void(0)"
|
||||
],
|
||||
"https://www.patricematz.de/images/21-Master-Thesis-Matz.pdf": [],
|
||||
"https://www.patricematz.de/images/bachelor.pdf": [],
|
||||
"https://www.patricematz.de/images/praktikum.pdf": []
|
||||
}
|
||||
|
|
@ -1 +1,5 @@
|
|||
deepdiff
|
||||
deepdiff==7.0.1
|
||||
lxml==5.2.2
|
||||
requests==2.32.3
|
||||
trafilatura==1.11.0
|
||||
beautifulsoup4==4.12.3
|
||||
|
|
@ -5,7 +5,7 @@ import trafilatura
|
|||
from requests.exceptions import MissingSchema
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
# Pretty sure most of this code is not from me, but from a demo on trafilatura
|
||||
class SiteReader:
|
||||
def __init__(self):
|
||||
pass
|
||||
|
|
@ -37,7 +37,7 @@ class SiteReader:
|
|||
'script',
|
||||
'style', ]
|
||||
|
||||
# Then we will loop over every item in the extract text and make sure that the beautifulsoup4 tag
|
||||
# Then we will loop over every item in the extracted text and make sure that the beautifulsoup4 tag
|
||||
# is NOT in the blacklist
|
||||
for item in text:
|
||||
if item.parent.name not in blacklist:
|
||||
|
|
@ -73,6 +73,7 @@ class SiteReader:
|
|||
return None
|
||||
|
||||
def get_sites_content_dynamic(self, urls: List[str]):
|
||||
'''not implemented'''
|
||||
pass
|
||||
|
||||
def get_sites_content_static(self, urls: List[str]) -> Dict[str, str]:
|
||||
|
|
|
|||
Loading…
Reference in New Issue