fixed crawling
This commit is contained in:
parent
56b2046896
commit
2ac3dd62e9
18
Star.py
18
Star.py
|
|
@ -42,32 +42,31 @@ class Crawler:
|
|||
if logger:
|
||||
self.logger = logger
|
||||
else:
|
||||
|
||||
self.logger = logging.Logger(
|
||||
name="star_crawler", level=logging.INFO)
|
||||
|
||||
def run(self, root, limit, sleep_time=0):
|
||||
self.url = root
|
||||
nlinks = [root]
|
||||
unchecked = [root]
|
||||
|
||||
while nlinks and len(self.links) < limit:
|
||||
root = nlinks.pop()
|
||||
while unchecked and len(self.links) < limit:
|
||||
root = unchecked.pop()
|
||||
if root in self.links or self.url.rsplit('/')[2] not in root:
|
||||
return
|
||||
continue
|
||||
if "https" not in root:
|
||||
return
|
||||
continue
|
||||
for element in self.exclude:
|
||||
if element in root:
|
||||
return
|
||||
continue
|
||||
self.logger.info(root)
|
||||
try:
|
||||
site = requests.get(root)
|
||||
tree = html.fromstring(site.content)
|
||||
links = tree.xpath('//a/@href')
|
||||
except:
|
||||
return
|
||||
continue
|
||||
|
||||
nlinks = []
|
||||
nlinks=[]
|
||||
for link in links:
|
||||
if link not in nlinks:
|
||||
if link.startswith("http"):
|
||||
|
|
@ -75,6 +74,7 @@ class Crawler:
|
|||
else:
|
||||
nlinks.append(urljoin(site.url, link))
|
||||
|
||||
unchecked += nlinks
|
||||
self.links[root] = nlinks
|
||||
sleep(sleep_time)
|
||||
|
||||
|
|
|
|||
49
app.py
49
app.py
|
|
@ -9,34 +9,36 @@ import sys
|
|||
|
||||
app = Flask(__name__)
|
||||
|
||||
|
||||
def graph(url):
|
||||
obj = Crawler(url)
|
||||
obj.run_check(url)
|
||||
|
||||
current = os.path.dirname(__file__)
|
||||
|
||||
def transformForDrawing(n, e):
|
||||
nodes = []
|
||||
drawn = []
|
||||
edges = []
|
||||
for key, values in obj.sites.items():
|
||||
label = key.rsplit('/')[-1]
|
||||
for nn in n:
|
||||
label = nn.rsplit('/')[-1]
|
||||
if label == "":
|
||||
label = key.rsplit('/')[-2]
|
||||
nodes.append('{' + "id: '{}', label: '{}', group: {}".format(key, label, 0) + '}')
|
||||
drawn.append(key)
|
||||
label = nn.rsplit('/')[-2]
|
||||
nodes.append('{' + "id: '{}', label: '{}', group: {}".format(nn, label, 0) + '}\n')
|
||||
drawn.append(nn)
|
||||
|
||||
for value in values:
|
||||
if value not in drawn and value not in obj.sites:
|
||||
nodes.append('{' + "id: '{}', label: '{}', group: {}".format(value, value, 1) + '}')
|
||||
drawn.append(value)
|
||||
for ee in e:
|
||||
if ee[1] not in drawn and ee[1] not in n:
|
||||
nodes.append('{' + "id: '{}', label: '{}', group: {}".format(ee[1], ee[1], 1) + '}\n')
|
||||
drawn.append(ee[1])
|
||||
|
||||
for value in values:
|
||||
edges.append('{' + "from: '{}', to: '{}'".format(key, value) + '}')
|
||||
edges.append('{' + "from: '{}', to: '{}'".format(ee[0], ee[1]) + '}\n')
|
||||
|
||||
return nodes, edges
|
||||
|
||||
def graph(url):
|
||||
obj = Crawler()
|
||||
obj.run(url, 5000)
|
||||
|
||||
current = os.path.dirname(__file__)
|
||||
n, e = obj.getNodesEdges()
|
||||
with open(os.path.join(current, './cached/' + url.rsplit('/')[2] + '.json'), 'w', encoding='utf-8') as f:
|
||||
f.write(json.dumps({"nodes": nodes,"edges": edges}))
|
||||
f.write(json.dumps({"nodes": n,"edges": e}))
|
||||
|
||||
nodes, edges = transformForDrawing(n, e)
|
||||
return nodes, edges
|
||||
|
||||
|
||||
|
|
@ -48,6 +50,7 @@ def load(url):
|
|||
jsonContent = json.loads(content)
|
||||
nodes = jsonContent["nodes"]
|
||||
edges = jsonContent["edges"]
|
||||
nodes, edges = transformForDrawing(nodes, edges)
|
||||
return nodes, edges
|
||||
|
||||
#----------------------------------------------------------------------------#
|
||||
|
|
@ -57,8 +60,7 @@ def load(url):
|
|||
|
||||
@app.route('/')
|
||||
def index():
|
||||
url = "beauty"
|
||||
|
||||
url = request.args.get("url")
|
||||
cached = os.listdir(os.path.join(os.path.dirname(__file__), "./cached"))
|
||||
withoutProtocol = url
|
||||
if withoutProtocol + '.json' not in cached:
|
||||
|
|
@ -69,9 +71,8 @@ def index():
|
|||
str1 = ","
|
||||
nodes = str1.join(nodes)
|
||||
edges = str1.join(edges)
|
||||
with open("./templates/data.js", "w") as f:
|
||||
f.write(f"var nodes={nodes}\nvar=edges={edges}")
|
||||
return render_template('graph.html')
|
||||
|
||||
return render_template('graph.html', nodes = nodes, edges = edges)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
|
|||
|
|
@ -1 +0,0 @@
|
|||
{"nodes": ["{id: 'https://budgetbytes.lpages.co/budget-bytes-app/', label: 'budget-bytes-app', group: 0}", "{id: 'https://itunes.apple.com/app/apple-store/id1240079167?pt=95954917&ct=Landing%20page&mt=8', label: 'https://itunes.apple.com/app/apple-store/id1240079167?pt=95954917&ct=Landing%20page&mt=8', group: 1}", "{id: 'https://goo.gl/wiZ4FQ', label: 'https://goo.gl/wiZ4FQ', group: 1}", "{id: 'mailto:budgetbytesapp@sidechef.com', label: 'mailto:budgetbytesapp@sidechef.com', group: 1}", "{id: 'https://www.budgetbytes.com/', label: 'https://www.budgetbytes.com/', group: 1}", "{id: 'https://www.sidechef.com/', label: 'https://www.sidechef.com/', group: 1}"], "edges": ["{from: 'https://budgetbytes.lpages.co/budget-bytes-app/', to: 'https://itunes.apple.com/app/apple-store/id1240079167?pt=95954917&ct=Landing%20page&mt=8'}", "{from: 'https://budgetbytes.lpages.co/budget-bytes-app/', to: 'https://goo.gl/wiZ4FQ'}", "{from: 'https://budgetbytes.lpages.co/budget-bytes-app/', to: 'mailto:budgetbytesapp@sidechef.com'}", "{from: 'https://budgetbytes.lpages.co/budget-bytes-app/', to: 'https://www.budgetbytes.com/'}", "{from: 'https://budgetbytes.lpages.co/budget-bytes-app/', to: 'https://www.sidechef.com/'}"]}
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
|
@ -1,7 +1,7 @@
|
|||
<html>
|
||||
<script src="https://cdnjs.cloudflare.com/ajax/libs/vis/4.21.0/vis.min.js"></script>
|
||||
<link rel="stylesheet" type="text/css" href="https://cdnjs.cloudflare.com/ajax/libs/vis/4.21.0/vis.min.css">
|
||||
<div id="mynetwork" style = "background-color: grey;"></div>
|
||||
<div id="mynetwork" style = "background-color: rgb(39, 39, 39);"></div>
|
||||
<script type="text/javascript">
|
||||
var color = 'gray';
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue