Inverse-Rezeptsuche/mine.py

139 lines
4.7 KiB
Python

# -*- coding: utf-8 -*-
from urllib.parse import urljoin
from lxml import html
import requests
import json
from time import sleep
import random
import traceback
import cv2
import base64
from application.db import Session, Recipe, Ingredient, Link, Trunk
header_values = {
'name': 'Michael Foord',
'location': 'Northampton',
'language': 'English',
'User-Agent': 'Mozilla 4/0',
'Accept-Encoding': 'gzip',
'Accept-Language': 'en-US,en;q=0.9,es;q=0.8',
'Upgrade-Insecure-Requests': '0',
'Referrer': 'https://www.google.com/'
}
def getLinks():
links = []
with requests.Session() as session:
root = "https://www.chefkoch.de/rs/s0/Rezepte.html"
site = session.get(root, headers=header_values)
tree = html.fromstring(site.content)
# converts: 344.621 Ergebnisse to int(344621)
#max = int(tree.xpath(
# '/html/body/main/div[1]/h1/span/text()')[0].split(" ")[0].replace(".", ""))
max = 2000 # get 2000 recepies :)
for i in range(0, max, 30):
try:
root = "https://www.chefkoch.de/rs/s" + \
str(i) + "/Rezepte.html"
site = session.get(root, headers=header_values)
tree = html.fromstring(site.content)
# converts: 344.621 Ergebnisse to int(344621)
max = int(tree.xpath(
'/html/body/main/div[1]/h1/span/text()')[0].split(" ")[0].replace(".", ""))
# only add new links
for x in tree.xpath('/html/body/main/article/a/@href'):
if x not in links:
links.append(x)
print(i)
except Exception as e:
# retry after 3 seconds
print(e)
i -= 30
sleep(10)
sleep(random.randint(0, 5))
print(links)
return links
def getRecipe(links):
recs = dict()
with requests.Session() as session:
counter = 0
for link in links:
counter += 1
try:
site = session.get(link, headers=header_values)
tree = html.fromstring(site.content)
namePath = "/html/body/main/article[1]/div/div[2]/h1/text()"
ingredPath = "/html/body/main/article[2]/table/tbody/tr/td" # TODO: fix this
recipPath = "/html/body/main/article[3]/div[1]/text()"
imgPath = './data/images.jpeg'
name = tree.xpath(namePath)[0]
ingred = tree.xpath(ingredPath)
resip = tree.xpath(recipPath)
image = cv2.imread(imgPath)
ret, jpeg = cv2.imencode(".jpeg", image)
img = base64.b64encode(jpeg)
resString = ""
for x in resip:
resString += x
dbSession = Session()
r = Recipe(name=name, instructions=resString, url=link, img=img)
ingredDict = {}
for i in range(0, len(ingred)-1, 2):
#print(ingred[i+1][0].text)
if ingred[i+1][0] is not None:
if ingred[i+1][0].text is None:
textFromLink = ingred[i+1][0][0].text.strip().replace(" ", "")
#print(textFromLink)
stuff = textFromLink
else:
stuff = ingred[i+1][0].text.strip().replace(" ", "")
if ingred[i] is not None:
try:
amount = ingred[i][0].text.strip().replace(" ", "")
except:
amount = ""
#print(stuff, amount)
a = Link(ingredient_amount=amount)
a.ingredient = Ingredient(name=stuff)
r.ingredient.append(a)
dbSession.add(r)
dbSession.commit()
ingredDict[stuff] = amount
recs[name] = [resString, ingredDict, link, img.decode("utf-8")]
print("")
except Exception as e:
print(traceback.format_exc())
print(format(counter/len(links), '.2f'), link)
sleep(random.randint(0, 5))
return recs
#links = getLinks()
#with open('./data/links.json', 'w') as file:
# jsonString = json.dumps(links)
# file.write(jsonString)
links = ""
with open('./data/links.json') as file:
links = json.load(file)
recs = getRecipe(links)
with open('./data/recs.json', 'w', encoding="utf-8") as file:
json.dump(recs, file, ensure_ascii=False)