mirror of https://github.com/Askill/optar.git
added depth limit for crawler
This commit is contained in:
parent
22876fd145
commit
c7c3fa6102
|
|
@ -42,10 +42,12 @@ class Crawler:
|
|||
|
||||
def run(self, root, limit, sleep_time=0):
|
||||
self.url = root
|
||||
unchecked = [root]
|
||||
unchecked = [(0, root)]
|
||||
|
||||
while unchecked and len(self._links) < limit:
|
||||
root = unchecked.pop()
|
||||
while unchecked:
|
||||
level, root = unchecked.pop()
|
||||
if level > limit:
|
||||
continue
|
||||
if root in self._links or self.url.rsplit('/')[2] not in root:
|
||||
continue
|
||||
if "https" not in root:
|
||||
|
|
@ -73,9 +75,9 @@ class Crawler:
|
|||
for link in _links:
|
||||
if link not in n_links:
|
||||
if link.startswith("http"):
|
||||
n_links.append(link)
|
||||
n_links.append((level+1, link))
|
||||
else:
|
||||
n_links.append(urljoin(site.url, link))
|
||||
n_links.append((level+1, urljoin(site.url, link)))
|
||||
|
||||
unchecked += n_links
|
||||
self._links[root] = n_links
|
||||
|
|
|
|||
|
|
@ -27,8 +27,7 @@ class Watcher:
|
|||
|
||||
for site in sites:
|
||||
crawler = Crawler()
|
||||
# TODO: add depth as param, to limit traversal depth
|
||||
crawler.run(site, 10)
|
||||
crawler.run(site, 1)
|
||||
crawler.persist(f"./cache/{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json")
|
||||
|
||||
contents = [self.get_new_content(site) for site in sites]
|
||||
|
|
@ -62,6 +61,8 @@ class Watcher:
|
|||
|
||||
@staticmethod
|
||||
def search_sites(url, content, keywords: List[str]):
|
||||
if content is None:
|
||||
return []
|
||||
results = []
|
||||
for keyword in keywords:
|
||||
if keyword in content:
|
||||
|
|
|
|||
Loading…
Reference in New Issue