added S3 support

Askill 2024-07-15 16:12:40 +02:00
parent 28df77ea2c
commit c3bb229011
5 changed files with 53 additions and 14 deletions

dev.py Normal file

@@ -0,0 +1,4 @@
from src.Watcher import Watcher

if __name__ == "__main__":
    Watcher("./optar/sites.txt", "./optar/keywords.txt").watch()

requirements.txt

@@ -3,3 +3,4 @@ lxml==5.2.2
 requests==2.32.3
 trafilatura==1.11.0
 beautifulsoup4==4.12.3
+boto3==1.34.144

src/SiteReader.py

@@ -51,11 +51,9 @@ class SiteReader:
         downloaded_url = trafilatura.fetch_url(url)
         try:
-            a = trafilatura.extract(downloaded_url, output_format="json", with_metadata=True, include_comments=False,
-                                    date_extraction_params={'extensive_search': True, 'original_date': True})
+            a = trafilatura.extract(downloaded_url, output_format="json", with_metadata=True, include_comments=False)
         except AttributeError:
-            a = trafilatura.extract(downloaded_url, output_format="json", with_metadata=True,
-                                    date_extraction_params={'extensive_search': True, 'original_date': True})
+            a = trafilatura.extract(downloaded_url, output_format="json", with_metadata=True)
         if a:
             json_output = json.loads(a)
             return json_output['text']

src/SiteStoreS3.py Normal file

@@ -0,0 +1,36 @@
import json
import os
from pathlib import Path
from typing import List, Optional

import boto3


class SiteStoreS3:
    def __init__(self, bucket):
        self.bucket = bucket

    def get_site_history(self, cache_path) -> Optional[list[str]]:
        # make sure the prefix ends with "/"
        prefix = cache_path
        if cache_path[-1] != "/":
            prefix += "/"
        s3 = boto3.client("s3")
        result = s3.list_objects_v2(Bucket=self.bucket, Prefix=prefix, MaxKeys=21)
        if "Contents" not in result:
            return None
        # return the sorted object names (keys), which are the creation dates;
        # strip the prefix and skip the first entry, which is the prefix itself
        return sorted([x["Key"][len(prefix):] for x in result["Contents"][1:]])

    def get_site_links(self, path):
        s3 = boto3.resource('s3')
        obj = s3.Object(self.bucket, path)
        data = obj.get()['Body']
        return json.load(data)

    def persist(self, path, data):
        s3 = boto3.resource('s3')
        s3object = s3.Object(self.bucket, path)
        s3object.put(
            Body=json.dumps(data).encode('UTF-8')
        )
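
For orientation, a minimal sketch of how SiteStoreS3 might be exercised on its own; the bucket name, site prefix, and payload below are made up, and it assumes AWS credentials for boto3 are already configured:

from src.SiteStoreS3 import SiteStoreS3

# hypothetical bucket and key names, for illustration only
store = SiteStoreS3("my-example-bucket")

# persist one crawl snapshot as JSON under <site>/<timestamp>.json
store.persist("example.com/2024-07-15_16-00-00.json", {"https://example.com/a": {}})

# list earlier snapshots for the site; returns None when the prefix has no objects.
# note: the first listed key is skipped, so this assumes the prefix itself shows up
# as the first entry (e.g. a "directory" placeholder object)
history = store.get_site_history("example.com/")
if history:
    latest = store.get_site_links(f"example.com/{history[-1]}")
    print(latest)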

src/Watcher.py

@@ -5,12 +5,12 @@ from deepdiff import DeepDiff
 from src.Crawler import Crawler
 from src.SiteReader import SiteReader
-from src.SiteStore import SiteStore
+from src.SiteStoreS3 import SiteStoreS3


 class Watcher:
     def __init__(self, sites_source_path, keywords_source_path) -> None:
-        self.site_store = SiteStore()
+        self.site_store = SiteStoreS3("optar-dev-cache")
         self.site_reader = SiteReader()
         self.keywords_source_path = keywords_source_path
         self.sites_source_path = sites_source_path
@@ -35,7 +35,7 @@ class Watcher:
         for site in sites:
             crawler = Crawler()
             crawler.run(site, 1)
-            crawler.persist(f"./cache/{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json")
+            self.site_store.persist(f"{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json", crawler.get_nodes())

         contents = [self.get_new_content(site) for site in sites]
         # TODO: improve handling of None
@@ -56,16 +56,16 @@ class Watcher:
     def get_new_content(self, url) -> Dict[str, str]:
         """ get all past iterations of a site by the fully qualified domain name """
-        list_of_files = self.site_store.get_site_history(f"./cache/{self.remove_protocol(url)}/")
+        list_of_files = self.site_store.get_site_history(f"{self.remove_protocol(url)}/")

         if len(list_of_files) >= 2:
-            prev_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-2]}")
-            current_version = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")
+            prev_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-2]}")
+            current_version = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}")
             news = DeepDiff(prev_version, current_version, ignore_order=True)
         else:
-            news = self.site_store.get_site_links(f"./cache/{self.remove_protocol(url)}/{list_of_files[-1]}")
+            news = self.site_store.get_site_links(f"{self.remove_protocol(url)}/{list_of_files[-1]}")

         if news:
-            sites_contents = self.site_reader.get_sites_content_static(list(news.keys()))
+            sites_contents = self.site_reader.get_sites_content_static([z.split("'")[1] for z in list(news["dictionary_item_added"])])
         return sites_contents
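
The new get_new_content parsing assumes DeepDiff reports newly added links under 'dictionary_item_added' as strings of the form root['<url>']. A small self-contained sketch of that assumption, with made-up link maps:

from deepdiff import DeepDiff

# made-up snapshots of the link maps two crawls might produce
prev_version = {"https://example.com/a": {}}
current_version = {"https://example.com/a": {}, "https://example.com/b": {}}

news = DeepDiff(prev_version, current_version, ignore_order=True)
# DeepDiff names added dict keys as "root['<key>']", so splitting on the quotes
# recovers the URL itself
added = [z.split("'")[1] for z in list(news["dictionary_item_added"])]
print(added)  # ['https://example.com/b']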