mirror of https://github.com/Askill/optar.git
added depth limit for crawler
This commit is contained in:
parent
22876fd145
commit
c7c3fa6102
|
|
@ -42,10 +42,12 @@ class Crawler:
|
||||||
|
|
||||||
def run(self, root, limit, sleep_time=0):
|
def run(self, root, limit, sleep_time=0):
|
||||||
self.url = root
|
self.url = root
|
||||||
unchecked = [root]
|
unchecked = [(0, root)]
|
||||||
|
|
||||||
while unchecked and len(self._links) < limit:
|
while unchecked:
|
||||||
root = unchecked.pop()
|
level, root = unchecked.pop()
|
||||||
|
if level > limit:
|
||||||
|
continue
|
||||||
if root in self._links or self.url.rsplit('/')[2] not in root:
|
if root in self._links or self.url.rsplit('/')[2] not in root:
|
||||||
continue
|
continue
|
||||||
if "https" not in root:
|
if "https" not in root:
|
||||||
|
|
@ -73,9 +75,9 @@ class Crawler:
|
||||||
for link in _links:
|
for link in _links:
|
||||||
if link not in n_links:
|
if link not in n_links:
|
||||||
if link.startswith("http"):
|
if link.startswith("http"):
|
||||||
n_links.append(link)
|
n_links.append((level+1, link))
|
||||||
else:
|
else:
|
||||||
n_links.append(urljoin(site.url, link))
|
n_links.append((level+1, urljoin(site.url, link)))
|
||||||
|
|
||||||
unchecked += n_links
|
unchecked += n_links
|
||||||
self._links[root] = n_links
|
self._links[root] = n_links
|
||||||
|
|
|
||||||
|
|
@ -27,8 +27,7 @@ class Watcher:
|
||||||
|
|
||||||
for site in sites:
|
for site in sites:
|
||||||
crawler = Crawler()
|
crawler = Crawler()
|
||||||
# TODO: add depth as param, to limit traversal depth
|
crawler.run(site, 1)
|
||||||
crawler.run(site, 10)
|
|
||||||
crawler.persist(f"./cache/{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json")
|
crawler.persist(f"./cache/{self.remove_protocol(site)}/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json")
|
||||||
|
|
||||||
contents = [self.get_new_content(site) for site in sites]
|
contents = [self.get_new_content(site) for site in sites]
|
||||||
|
|
@ -62,6 +61,8 @@ class Watcher:
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def search_sites(url, content, keywords: List[str]):
|
def search_sites(url, content, keywords: List[str]):
|
||||||
|
if content is None:
|
||||||
|
return []
|
||||||
results = []
|
results = []
|
||||||
for keyword in keywords:
|
for keyword in keywords:
|
||||||
if keyword in content:
|
if keyword in content:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue