mirror of https://github.com/Askill/optar.git
added S3 support
This commit is contained in:
parent 28df77ea2c
commit c3bb229011
@@ -0,0 +1,4 @@
+from src.Watcher import Watcher
+
+if __name__ == "__main__":
+    Watcher("./optar/sites.txt", "./optar/keywords.txt").watch()
@@ -3,3 +3,4 @@ lxml==5.2.2
 requests==2.32.3
 trafilatura==1.11.0
 beautifulsoup4==4.12.3
+boto3==1.34.144
@@ -51,11 +51,9 @@ class SiteReader:
 
         downloaded_url = trafilatura.fetch_url(url)
         try:
-            a = trafilatura.extract(downloaded_url, output_format="json", with_metadata=True, include_comments=False,
-                                    date_extraction_params={'extensive_search': True, 'original_date': True})
+            a = trafilatura.extract(downloaded_url, output_format="json", with_metadata=True, include_comments=False)
         except AttributeError:
-            a = trafilatura.extract(downloaded_url, output_format="json", with_metadata=True,
-                                    date_extraction_params={'extensive_search': True, 'original_date': True})
+            a = trafilatura.extract(downloaded_url, output_format="json", with_metadata=True)
         if a:
             json_output = json.loads(a)
             return json_output['text']
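
For reference, the extraction path this hunk keeps can be exercised on its own. A minimal sketch, assuming trafilatura is installed; the function name extract_text is illustrative, not part of the commit:

import json

import trafilatura


def extract_text(url):
    # fetch_url returns the page HTML as a string, or None on failure
    downloaded = trafilatura.fetch_url(url)
    if downloaded is None:
        return None
    # with output_format="json", extract returns a JSON string or None;
    # with_metadata=True includes title/author/date fields alongside "text"
    result = trafilatura.extract(downloaded, output_format="json",
                                 with_metadata=True, include_comments=False)
    return json.loads(result)["text"] if result else None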
@@ -0,0 +1,36 @@
+import json
+import os
+from pathlib import Path
+from typing import List, Optional
+import boto3
+
+
+class SiteStoreS3:
+    def __init__(self, bucket):
+        self.bucket = bucket
+
+    def get_site_history(self, cache_path) -> Optional[list[str]]:
+        # make sure the prefix ends with "/" so only keys under this folder match
+        prefix = cache_path
+        if cache_path[-1] != "/":
+            prefix += "/"
+
+        s3 = boto3.client("s3")
+        result = s3.list_objects_v2(Bucket=self.bucket, Prefix=prefix, MaxKeys=21)
+        if "Contents" not in result:
+            return None
+        # return a sorted list of file names (keys), which are the creation dates; strip the prefix and skip the first element, which is the prefix itself
+        return sorted([x["Key"][len(prefix):] for x in result["Contents"][1:]])
+
+    def get_site_links(self, path):
+        s3 = boto3.resource("s3")
+        obj = s3.Object(self.bucket, path)
+        data = obj.get()["Body"]
+        return json.load(data)
+
+    def persist(self, path, data):
+        s3 = boto3.resource("s3")
+        s3object = s3.Object(self.bucket, path)
+        s3object.put(
+            Body=json.dumps(data).encode("UTF-8")
+        )
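
A quick usage sketch of the new store. The bucket name comes from this commit; the site key and payload are made up for illustration:

store = SiteStoreS3("optar-dev-cache")

# snapshots are stored under "<fqdn>/<timestamp>.json"
store.persist("example.com/2024-07-14_12-00-00.json",
              {"https://example.com/a": "link text"})

# list the snapshot file names (timestamps) under the prefix, sorted ascending
history = store.get_site_history("example.com/")

if history:
    # load the most recent snapshot back as a dict
    latest = store.get_site_links(f"example.com/{history[-1]}")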
@@ -5,12 +5,12 @@ from deepdiff import DeepDiff
 
 from src.Crawler import Crawler
 from src.SiteReader import SiteReader
-from src.SiteStore import SiteStore
+from src.SiteStoreS3 import SiteStoreS3
 
 
 class Watcher:
     def __init__(self, sites_source_path, keywords_source_path) -> None:
-        self.site_store = SiteStore()
+        self.site_store = SiteStoreS3("optar-dev-cache")
         self.site_reader = SiteReader()
         self.keywords_source_path = keywords_source_path
         self.sites_source_path = sites_source_path
@@ -35,7 +35,7 @@ class Watcher:
         for site in sites:
             crawler = Crawler()
             crawler.run(site, 1)
-            crawler.persist(f"./cache/{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json")
+            self.site_store.persist(f"{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json", crawler.get_nodes())
 
         contents = [self.get_new_content(site) for site in sites]
         # TODO: improve handling of None
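
A detail the key layout relies on: the zero-padded, year-first timestamp makes lexicographic order equal chronological order, so get_site_history can sort the S3 key names directly. For illustration:

from datetime import datetime

# "%Y-%m-%d_%H-%M-%S" orders fields from most to least significant and
# zero-pads them, so plain string sorting of these keys is chronological
stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
key = f"example.com/{stamp}.json"  # e.g. "example.com/2024-07-14_12-00-00.json"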
@@ -56,16 +56,17 @@ class Watcher:
 
     def get_new_content(self, url) -> Dict[str, str]:
         """ get all past iterations of a site by the fully qualified domain name """
-        list_of_files = self.site_store.get_site_history(f"./cache/{self.remove_protocol(url)}/")
+        list_of_files = self.site_store.get_site_history(f"{self.remove_protocol(url)}/")
 
         if len(list_of_files) >= 2:
-            prev_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-2]}")
-            current_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")
+            prev_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-2]}")
+            current_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}")
             news = DeepDiff(prev_version, current_version, ignore_order=True)
+            new_links = [z.split("'")[1] for z in news.get("dictionary_item_added", [])]
         else:
-            news = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")
-
-            sites_contents = self.site_reader.get_sites_content_static(list(news.keys()))
+            # first snapshot: every stored link counts as new
+            new_links = list(self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}").keys())
+        sites_contents = self.site_reader.get_sites_content_static(new_links)
 
         return sites_contents
 
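
The z.split("'")[1] expression works because DeepDiff reports added dictionary keys as path strings of the form root['<key>']. A minimal sketch with made-up URLs:

from deepdiff import DeepDiff

prev = {"https://example.com/a": "old post"}
curr = {"https://example.com/a": "old post",
        "https://example.com/b": "new post"}

diff = DeepDiff(prev, curr, ignore_order=True)
# diff["dictionary_item_added"] contains "root['https://example.com/b']";
# splitting on the single quote recovers the URL itself
added = [z.split("'")[1] for z in diff.get("dictionary_item_added", [])]
print(added)  # ['https://example.com/b']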