From 563c6c719cd267d25f5a1e5fb4f6d57fae8895e9 Mon Sep 17 00:00:00 2001 From: Askill Date: Sun, 6 Nov 2022 15:18:57 +0100 Subject: [PATCH] todos --- sites.txt | 3 ++- src/Watcher.py | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/sites.txt b/sites.txt index 6b14489..2dc13ca 100644 --- a/sites.txt +++ b/sites.txt @@ -1 +1,2 @@ -https://www.patricematz.de/ \ No newline at end of file +https://www.patricematz.de/ +https://www.heise.de/ \ No newline at end of file diff --git a/src/Watcher.py b/src/Watcher.py index 848391a..907ef8e 100644 --- a/src/Watcher.py +++ b/src/Watcher.py @@ -24,12 +24,15 @@ class Watcher: keywords = self.read_txt_file(self.keywords_source_path) sites = self.read_txt_file(self.sites_source_path) - crawler = Crawler() + for site in sites: + crawler = Crawler() + # TODO: add depth as param, to limit traversal depth crawler.run(site, 10) crawler.persist(f"./cache/{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json") contents = [self.get_new_content(site) for site in sites] + # TODO: improve handling of None contents = [x for x in contents if x is not None and x is not {}] matches = [] for content in contents: