# Inverse-Rezeptsuche/mine.py
# Scrapes recipe links and recipe details from chefkoch.de into JSON files.
from urllib.parse import urljoin
from lxml import html
import requests
import json
from time import sleep
import random
2020-04-05 12:05:45 +00:00
# HTTP headers sent with every request made by this script (passed as
# `headers=` to `session.get`).
header_values = {
    # NOTE(review): 'name', 'location' and 'language' are not standard HTTP
    # request headers; they are sent verbatim and kept for compatibility.
    'name': 'Michael Foord',
    'location': 'Northampton',
    'language': 'English',
    'User-Agent': 'Mozilla 4/0',
    'Accept-Encoding': 'gzip',
    'Accept-Language': 'en-US,en;q=0.9,es;q=0.8',
    'Upgrade-Insecure-Requests': '0',
    # The HTTP header is spelled "Referer" (single "r" in the middle); the
    # previous key 'Referrer' is not a header name servers look at.
    'Referer': 'https://www.google.com/',
}
2020-04-03 17:29:15 +00:00
def getLinks(limit=2000, page_size=30, max_retries=3):
    """Collect recipe links from the chefkoch.de search result pages.

    Result pages are requested at the offsets ``0, page_size, 2 * page_size,
    ...`` below ``limit``. Every ``href`` matched by the article XPath is
    collected once, in first-seen order.

    Args:
        limit: Exclusive upper bound for the result offset. Defaults to 2000,
            the cap that used to be hard-coded here.
        page_size: Step between two requested offsets. Defaults to 30.
        max_retries: How many times a failing page is requested again before
            it is skipped.

    Returns:
        A list of unique link strings.
    """
    count_path = '/html/body/main/div[1]/h1/span/text()'
    link_path = '/html/body/main/article/a/@href'

    links = []
    seen = set()  # O(1) duplicate check; `links` keeps the insertion order
    with requests.Session() as session:
        for offset in range(0, limit, page_size):
            url = "https://www.chefkoch.de/rs/s" + str(offset) + "/Rezepte.html"
            # Decrementing the loop variable of a `for ... in range(...)` does
            # not repeat an iteration, so retries need their own loop.
            for _attempt in range(max_retries + 1):
                try:
                    # A timeout keeps a stalled connection from blocking the
                    # whole run; it surfaces as an exception and is retried.
                    site = session.get(url, headers=header_values, timeout=30)
                    tree = html.fromstring(site.content)
                    # Parse the hit count ("344.621 Ergebnisse" -> 344621).
                    # The value itself is not needed; a page without it raises
                    # here and is treated as a failed request.
                    int(tree.xpath(count_path)[0].split(" ")[0].replace(".", ""))
                    hrefs = tree.xpath(link_path)
                except Exception as e:
                    # wait 10 seconds, then try the same page again
                    print(e)
                    sleep(10)
                    continue
                # only add new links
                for href in hrefs:
                    if href not in seen:
                        seen.add(href)
                        links.append(href)
                print(offset)
                break
            # random pause between pages
            sleep(random.randint(0, 5))
    print(links)
    return links
2020-04-05 12:05:45 +00:00
def _cell_text(cell):
    """Return the cleaned text of an ingredient table cell.

    The text of the cell's first child element is used. If that element has
    no text of its own (``None`` or only whitespace), the text of its first
    nested element is used instead. Returns ``""`` when neither has text.

    Raises:
        IndexError: If ``cell`` has no child element.
    """
    node = cell[0]
    text = node.text
    if (text is None or not text.strip()) and len(node):
        # The visible text sits one level deeper, inside a nested element.
        text = node[0].text
    if text is None:
        return ""
    return text.strip().replace(" ", "")


def getRecipe(links):
    """Download each recipe page and extract name, instructions, ingredients.

    Args:
        links: Iterable of recipe page URLs.

    Returns:
        A dict mapping the recipe name to a two-element list
        ``[instructions, ingredients]``, where ``instructions`` is the
        newline-joined instruction text and ``ingredients`` maps ingredient
        name to amount. Pages that fail to download or parse are reported via
        ``print`` and left out.
    """
    namePath = "/html/body/main/article[1]/div/div[2]/h1/text()"
    # TODO: fix this
    # NOTE(review): this expects a literal <tbody> in the served HTML - confirm.
    ingredPath = "/html/body/main/article[2]/table/tbody/tr/td"
    recipPath = "/html/body/main/article[3]/div[1]/text()"

    recs = {}
    with requests.Session() as session:
        for link in links:
            try:
                # A timeout keeps a stalled connection from blocking the run.
                site = session.get(link, headers=header_values, timeout=30)
                tree = html.fromstring(site.content)

                name = tree.xpath(namePath)[0]
                cells = tree.xpath(ingredPath)
                steps = tree.xpath(recipPath)

                resString = "".join(step + "\n" for step in steps)

                # Cells come in (amount, ingredient) pairs; a trailing
                # unpaired cell is ignored.
                ingredDict = {}
                for amount_cell, stuff_cell in zip(cells[0::2], cells[1::2]):
                    ingredDict[_cell_text(stuff_cell)] = _cell_text(amount_cell)

                recs[name] = [resString, ingredDict]
                print("")
            except Exception as e:
                # Best effort: report the failing page and carry on.
                print(e)
                print(link)

            # random pause between pages
            sleep(random.randint(0, 5))
    return recs
2020-04-05 12:05:45 +00:00
# The link list on disk is produced by getLinks(); to refresh it, re-enable:
#     links = getLinks()
#     with open('./data/links.json', 'w') as file:
#         file.write(json.dumps(links))

# Load the previously saved link list.
with open('./data/links.json') as links_file:
    links = json.load(links_file)

# Scrape every linked recipe and store the result as JSON.
recs = getRecipe(links)
with open('./data/recs.json', 'w') as recs_file:
    json.dump(recs, recs_file)