diff --git a/sites.txt b/sites.txt index 6b14489..2dc13ca 100644 --- a/sites.txt +++ b/sites.txt @@ -1 +1,2 @@ -https://www.patricematz.de/ \ No newline at end of file +https://www.patricematz.de/ +https://www.heise.de/ \ No newline at end of file diff --git a/src/Watcher.py b/src/Watcher.py index 848391a..907ef8e 100644 --- a/src/Watcher.py +++ b/src/Watcher.py @@ -24,12 +24,15 @@ class Watcher: keywords = self.read_txt_file(self.keywords_source_path) sites = self.read_txt_file(self.sites_source_path) - crawler = Crawler() + for site in sites: + crawler = Crawler() + # TODO: add depth as param, to limit traversal depth crawler.run(site, 10) crawler.persist(f"./cache/{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json") contents = [self.get_new_content(site) for site in sites] + # TODO: improve handling of None contents = [x for x in contents if x is not None and x is not {}] matches = [] for content in contents: