diff --git a/src/Crawler.py b/src/Crawler.py
index eab1800..9f68bf1 100644
--- a/src/Crawler.py
+++ b/src/Crawler.py
@@ -42,10 +42,12 @@ class Crawler:
 
     def run(self, root, limit, sleep_time=0):
         self.url = root
-        unchecked = [root]
+        unchecked = [(0, root)]
 
-        while unchecked and len(self._links) < limit:
-            root = unchecked.pop()
+        while unchecked:
+            level, root = unchecked.pop()
+            if level > limit:
+                continue
             if root in self._links or self.url.rsplit('/')[2] not in root:
                 continue
             if "https" not in root:
@@ -73,9 +75,9 @@ class Crawler:
             for link in _links:
-                if link not in n_links:
+                if (level+1, link) not in n_links:
                     if link.startswith("http"):
-                        n_links.append(link)
+                        n_links.append((level+1, link))
                     else:
-                        n_links.append(urljoin(site.url, link))
+                        n_links.append((level+1, urljoin(site.url, link)))
 
             unchecked += n_links
             self._links[root] = n_links
diff --git a/src/Watcher.py b/src/Watcher.py
index 907ef8e..6db3e79 100644
--- a/src/Watcher.py
+++ b/src/Watcher.py
@@ -27,8 +27,7 @@ class Watcher:
 
         for site in sites:
             crawler = Crawler()
-            # TODO: add depth as param, to lmit traversal depth
-            crawler.run(site, 10)
+            crawler.run(site, 1)
             crawler.persist(f"./cache/{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json")
 
         contents = [self.get_new_content(site) for site in sites]
@@ -62,6 +61,8 @@ class Watcher:
 
     @staticmethod
     def search_sites(url, content, keywords: List[str]):
+        if content is None:
+            return []
        results = []
        for keyword in keywords:
            if keyword in content:
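
For reference, here is a minimal sketch of the depth-limited traversal the Crawler hunks introduce: the work queue holds `(level, url)` pairs instead of bare URLs, and anything popped beyond `limit` is skipped rather than fetched. `fetch_links` is a hypothetical stand-in for the fetch-and-extract step inside `Crawler.run`; only the queue handling mirrors the diff.

```python
from urllib.parse import urljoin


def crawl(root, limit, fetch_links):
    """Depth-limited traversal over (level, url) pairs, as in Crawler.run."""
    visited = {}                 # url -> outgoing links, like self._links
    unchecked = [(0, root)]      # the root enters the queue at depth 0

    while unchecked:
        level, url = unchecked.pop()
        if level > limit or url in visited:
            continue             # past the depth cap, or already crawled

        children = []
        for link in fetch_links(url):
            # Resolve relative links against the current page.
            absolute = link if link.startswith("http") else urljoin(url, link)
            if (level + 1, absolute) not in children:
                children.append((level + 1, absolute))

        unchecked += children    # pop() takes from the tail: depth-first order
        visited[url] = children

    return visited
```

Two things worth noting. Because `unchecked.pop()` takes from the tail of the list, the traversal is depth-first; with `limit=1`, as `Watcher` now passes, only the root page and the pages it links to directly are crawled. And once the queue holds `(level, url)` tuples, membership tests must compare tuples too, which is why the `if link not in n_links` check changes along with the appends.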