# File-viewer header captured with the source: 64 lines, 1.9 KiB, Python
|
|
|
||
|
|
import json
import os
import random
from time import sleep
from urllib.parse import urljoin

import requests
from lxml import html
|
||
|
|
|
||
|
|
|
||
|
|
def getLinks(max_retries=3, retry_delay=10):
    """Collect the recipe links from the chefkoch.de recipe listing.

    The first listing page is requested to read the total result count.
    The listing is then requested again and again with the offset in the
    URL advanced by 30 each time, and every ``href`` matched by
    ``/html/body/main/article/a/@href`` is collected once, in order of
    first appearance.

    Args:
        max_retries: How many times a failing page is re-requested before
            it is skipped. Negative values are treated as 0.
        retry_delay: Seconds to wait after a failed page request.

    Returns:
        list: The distinct link strings in the order they were first seen.

    Raises:
        requests.RequestException: If the initial request fails.
        IndexError, ValueError: If the result count cannot be read from
            the first page. Only the per-page requests are retried.
    """
    header_values = {
        'name': 'Michael Foord',
        'location': 'Northampton',
        'language': 'English',
        'User-Agent': 'Mozilla 4/0',
        'Accept-Encoding': 'gzip',
        'Accept-Language': 'en-US,en;q=0.9,es;q=0.8',
        'Upgrade-Insecure-Requests': '0',
        # The HTTP header name is spelled "Referer" (single "r" in the
        # middle); the previous key "Referrer" is not that header.
        'Referer': 'https://www.google.com/',
    }
    page_size = 30
    # Without a timeout a stalled connection would block the scrape forever.
    request_timeout = 30
    url_template = "https://www.chefkoch.de/rs/s{}/Rezepte.html"
    count_xpath = '/html/body/main/div[1]/h1/span/text()'
    link_xpath = '/html/body/main/article/a/@href'

    def fetch_tree(session, offset):
        # Download the listing page at the given offset and parse it.
        site = session.get(
            url_template.format(offset),
            headers=header_values,
            timeout=request_timeout,
        )
        return html.fromstring(site.content)

    def result_count(tree):
        # converts: "344.621 Ergebnisse" to int(344621)
        return int(tree.xpath(count_xpath)[0].split(" ")[0].replace(".", ""))

    links = []
    # Set used only for O(1) duplicate checks; `links` keeps the order.
    seen = set()

    with requests.Session() as session:
        total = result_count(fetch_tree(session, 0))

        for offset in range(0, total, page_size):
            # Explicit retry loop: decrementing the index of a
            # `for ... in range(...)` loop does not repeat an iteration.
            for _attempt in range(max(max_retries, 0) + 1):
                try:
                    tree = fetch_tree(session, offset)
                    # The value is not used (the range above is already
                    # fixed). Reading it makes a page without the result
                    # count raise, which sends it down the retry path.
                    result_count(tree)
                    hrefs = tree.xpath(link_xpath)
                except Exception as e:
                    # Best-effort scrape: report the failure, back off,
                    # then try the same page again.
                    print(e)
                    sleep(retry_delay)
                else:
                    # only add new links
                    for href in hrefs:
                        if href not in seen:
                            seen.add(href)
                            links.append(href)
                    print(offset)
                    break
            else:
                # Every attempt failed; move on instead of looping forever.
                print("skipping offset", offset, "after repeated failures")

            # Random pause between pages.
            sleep(random.randint(0, 5))

    print(links)
    return links
|
||
|
|
|
||
|
|
|
||
|
|
# Create the output directory before the scrape starts, so a missing
# ./data cannot fail the write after all pages have been fetched.
os.makedirs('./data', exist_ok=True)

# Scrape all links and store them as a JSON array.
links = getLinks()
with open('./data/links.json', 'w', encoding='utf-8') as file:
    jsonString = json.dumps(links)
    file.write(jsonString)
|