added depth limit for crawler

This commit is contained in:
Askill 2022-12-11 13:16:43 +01:00
parent 22876fd145
commit c7c3fa6102
2 changed files with 10 additions and 7 deletions

View File

@@ -42,10 +42,12 @@ class Crawler:
def run(self, root, limit, sleep_time=0): def run(self, root, limit, sleep_time=0):
self.url = root self.url = root
unchecked = [root] unchecked = [(0, root)]
while unchecked and len(self._links) < limit: while unchecked:
root = unchecked.pop() level, root = unchecked.pop()
if level > limit:
continue
if root in self._links or self.url.rsplit('/')[2] not in root: if root in self._links or self.url.rsplit('/')[2] not in root:
continue continue
if "https" not in root: if "https" not in root:
@@ -73,9 +75,9 @@ class Crawler:
for link in _links: for link in _links:
if link not in n_links: if link not in n_links:
if link.startswith("http"): if link.startswith("http"):
n_links.append(link) n_links.append((level+1, link))
else: else:
n_links.append(urljoin(site.url, link)) n_links.append((level+1, urljoin(site.url, link)))
unchecked += n_links unchecked += n_links
self._links[root] = n_links self._links[root] = n_links

View File

@@ -27,8 +27,7 @@ class Watcher:
for site in sites: for site in sites:
crawler = Crawler() crawler = Crawler()
# TODO: add depth as param, to limit traversal depth crawler.run(site, 1)
crawler.run(site, 10)
crawler.persist(f"./cache/{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json") crawler.persist(f"./cache/{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json")
contents = [self.get_new_content(site) for site in sites] contents = [self.get_new_content(site) for site in sites]
@@ -62,6 +61,8 @@ class Watcher:
@staticmethod @staticmethod
def search_sites(url, content, keywords: List[str]): def search_sites(url, content, keywords: List[str]):
if content is None:
return []
results = [] results = []
for keyword in keywords: for keyword in keywords:
if keyword in content: if keyword in content: