fixed crawling
This commit is contained in:
parent
56b2046896
commit
2ac3dd62e9
16
Star.py
16
Star.py
|
|
@ -42,30 +42,29 @@ class Crawler:
|
||||||
if logger:
|
if logger:
|
||||||
self.logger = logger
|
self.logger = logger
|
||||||
else:
|
else:
|
||||||
|
|
||||||
self.logger = logging.Logger(
|
self.logger = logging.Logger(
|
||||||
name="star_crawler", level=logging.INFO)
|
name="star_crawler", level=logging.INFO)
|
||||||
|
|
||||||
def run(self, root, limit, sleep_time=0):
|
def run(self, root, limit, sleep_time=0):
|
||||||
self.url = root
|
self.url = root
|
||||||
nlinks = [root]
|
unchecked = [root]
|
||||||
|
|
||||||
while nlinks and len(self.links) < limit:
|
while unchecked and len(self.links) < limit:
|
||||||
root = nlinks.pop()
|
root = unchecked.pop()
|
||||||
if root in self.links or self.url.rsplit('/')[2] not in root:
|
if root in self.links or self.url.rsplit('/')[2] not in root:
|
||||||
return
|
continue
|
||||||
if "https" not in root:
|
if "https" not in root:
|
||||||
return
|
continue
|
||||||
for element in self.exclude:
|
for element in self.exclude:
|
||||||
if element in root:
|
if element in root:
|
||||||
return
|
continue
|
||||||
self.logger.info(root)
|
self.logger.info(root)
|
||||||
try:
|
try:
|
||||||
site = requests.get(root)
|
site = requests.get(root)
|
||||||
tree = html.fromstring(site.content)
|
tree = html.fromstring(site.content)
|
||||||
links = tree.xpath('//a/@href')
|
links = tree.xpath('//a/@href')
|
||||||
except:
|
except:
|
||||||
return
|
continue
|
||||||
|
|
||||||
nlinks=[]
|
nlinks=[]
|
||||||
for link in links:
|
for link in links:
|
||||||
|
|
@ -75,6 +74,7 @@ class Crawler:
|
||||||
else:
|
else:
|
||||||
nlinks.append(urljoin(site.url, link))
|
nlinks.append(urljoin(site.url, link))
|
||||||
|
|
||||||
|
unchecked += nlinks
|
||||||
self.links[root] = nlinks
|
self.links[root] = nlinks
|
||||||
sleep(sleep_time)
|
sleep(sleep_time)
|
||||||
|
|
||||||
|
|
|
||||||
49
app.py
49
app.py
|
|
@ -9,34 +9,36 @@ import sys
|
||||||
|
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
|
|
||||||
|
def transformForDrawing(n, e):
|
||||||
def graph(url):
|
|
||||||
obj = Crawler(url)
|
|
||||||
obj.run_check(url)
|
|
||||||
|
|
||||||
current = os.path.dirname(__file__)
|
|
||||||
|
|
||||||
nodes = []
|
nodes = []
|
||||||
drawn = []
|
drawn = []
|
||||||
edges = []
|
edges = []
|
||||||
for key, values in obj.sites.items():
|
for nn in n:
|
||||||
label = key.rsplit('/')[-1]
|
label = nn.rsplit('/')[-1]
|
||||||
if label == "":
|
if label == "":
|
||||||
label = key.rsplit('/')[-2]
|
label = nn.rsplit('/')[-2]
|
||||||
nodes.append('{' + "id: '{}', label: '{}', group: {}".format(key, label, 0) + '}')
|
nodes.append('{' + "id: '{}', label: '{}', group: {}".format(nn, label, 0) + '}\n')
|
||||||
drawn.append(key)
|
drawn.append(nn)
|
||||||
|
|
||||||
for value in values:
|
for ee in e:
|
||||||
if value not in drawn and value not in obj.sites:
|
if ee[1] not in drawn and ee[1] not in n:
|
||||||
nodes.append('{' + "id: '{}', label: '{}', group: {}".format(value, value, 1) + '}')
|
nodes.append('{' + "id: '{}', label: '{}', group: {}".format(ee[1], ee[1], 1) + '}\n')
|
||||||
drawn.append(value)
|
drawn.append(ee[1])
|
||||||
|
|
||||||
for value in values:
|
edges.append('{' + "from: '{}', to: '{}'".format(ee[0], ee[1]) + '}\n')
|
||||||
edges.append('{' + "from: '{}', to: '{}'".format(key, value) + '}')
|
|
||||||
|
|
||||||
|
return nodes, edges
|
||||||
|
|
||||||
|
def graph(url):
|
||||||
|
obj = Crawler()
|
||||||
|
obj.run(url, 5000)
|
||||||
|
|
||||||
|
current = os.path.dirname(__file__)
|
||||||
|
n, e = obj.getNodesEdges()
|
||||||
with open(os.path.join(current, './cached/' + url.rsplit('/')[2] + '.json'), 'w', encoding='utf-8') as f:
|
with open(os.path.join(current, './cached/' + url.rsplit('/')[2] + '.json'), 'w', encoding='utf-8') as f:
|
||||||
f.write(json.dumps({"nodes": nodes,"edges": edges}))
|
f.write(json.dumps({"nodes": n,"edges": e}))
|
||||||
|
|
||||||
|
nodes, edges = transformForDrawing(n, e)
|
||||||
return nodes, edges
|
return nodes, edges
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -48,6 +50,7 @@ def load(url):
|
||||||
jsonContent = json.loads(content)
|
jsonContent = json.loads(content)
|
||||||
nodes = jsonContent["nodes"]
|
nodes = jsonContent["nodes"]
|
||||||
edges = jsonContent["edges"]
|
edges = jsonContent["edges"]
|
||||||
|
nodes, edges = transformForDrawing(nodes, edges)
|
||||||
return nodes, edges
|
return nodes, edges
|
||||||
|
|
||||||
#----------------------------------------------------------------------------#
|
#----------------------------------------------------------------------------#
|
||||||
|
|
@ -57,8 +60,7 @@ def load(url):
|
||||||
|
|
||||||
@app.route('/')
|
@app.route('/')
|
||||||
def index():
|
def index():
|
||||||
url = "beauty"
|
url = request.args.get("url")
|
||||||
|
|
||||||
cached = os.listdir(os.path.join(os.path.dirname(__file__), "./cached"))
|
cached = os.listdir(os.path.join(os.path.dirname(__file__), "./cached"))
|
||||||
withoutProtocol = url
|
withoutProtocol = url
|
||||||
if withoutProtocol + '.json' not in cached:
|
if withoutProtocol + '.json' not in cached:
|
||||||
|
|
@ -69,9 +71,8 @@ def index():
|
||||||
str1 = ","
|
str1 = ","
|
||||||
nodes = str1.join(nodes)
|
nodes = str1.join(nodes)
|
||||||
edges = str1.join(edges)
|
edges = str1.join(edges)
|
||||||
with open("./templates/data.js", "w") as f:
|
|
||||||
f.write(f"var nodes={nodes}\nvar=edges={edges}")
|
return render_template('graph.html', nodes = nodes, edges = edges)
|
||||||
return render_template('graph.html')
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
|
||||||
|
|
@ -1 +0,0 @@
|
||||||
{"nodes": ["{id: 'https://budgetbytes.lpages.co/budget-bytes-app/', label: 'budget-bytes-app', group: 0}", "{id: 'https://itunes.apple.com/app/apple-store/id1240079167?pt=95954917&ct=Landing%20page&mt=8', label: 'https://itunes.apple.com/app/apple-store/id1240079167?pt=95954917&ct=Landing%20page&mt=8', group: 1}", "{id: 'https://goo.gl/wiZ4FQ', label: 'https://goo.gl/wiZ4FQ', group: 1}", "{id: 'mailto:budgetbytesapp@sidechef.com', label: 'mailto:budgetbytesapp@sidechef.com', group: 1}", "{id: 'https://www.budgetbytes.com/', label: 'https://www.budgetbytes.com/', group: 1}", "{id: 'https://www.sidechef.com/', label: 'https://www.sidechef.com/', group: 1}"], "edges": ["{from: 'https://budgetbytes.lpages.co/budget-bytes-app/', to: 'https://itunes.apple.com/app/apple-store/id1240079167?pt=95954917&ct=Landing%20page&mt=8'}", "{from: 'https://budgetbytes.lpages.co/budget-bytes-app/', to: 'https://goo.gl/wiZ4FQ'}", "{from: 'https://budgetbytes.lpages.co/budget-bytes-app/', to: 'mailto:budgetbytesapp@sidechef.com'}", "{from: 'https://budgetbytes.lpages.co/budget-bytes-app/', to: 'https://www.budgetbytes.com/'}", "{from: 'https://budgetbytes.lpages.co/budget-bytes-app/', to: 'https://www.sidechef.com/'}"]}
|
|
||||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
|
@ -1,7 +1,7 @@
|
||||||
<html>
|
<html>
|
||||||
<script src="https://cdnjs.cloudflare.com/ajax/libs/vis/4.21.0/vis.min.js"></script>
|
<script src="https://cdnjs.cloudflare.com/ajax/libs/vis/4.21.0/vis.min.js"></script>
|
||||||
<link rel="stylesheet" type="text/css" href="https://cdnjs.cloudflare.com/ajax/libs/vis/4.21.0/vis.min.css">
|
<link rel="stylesheet" type="text/css" href="https://cdnjs.cloudflare.com/ajax/libs/vis/4.21.0/vis.min.css">
|
||||||
<div id="mynetwork" style = "background-color: grey;"></div>
|
<div id="mynetwork" style = "background-color: rgb(39, 39, 39);"></div>
|
||||||
<script type="text/javascript">
|
<script type="text/javascript">
|
||||||
var color = 'gray';
|
var color = 'gray';
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue