# File viewer metadata: 139 lines, 4.7 KiB, Python.
# -*- coding: utf-8 -*-
|
|
from urllib.parse import urljoin
|
|
from lxml import html
|
|
import requests
|
|
import json
|
|
from time import sleep
|
|
import random
|
|
import traceback
|
|
import cv2
|
|
import base64
|
|
from application.db import Session, Recipe, Ingredient, Link
|
|
|
|
# Headers passed as ``headers=`` to every ``session.get`` call in this file.
# NOTE(review): 'name', 'location' and 'language' are not standard HTTP
# header fields; they are kept so the outgoing requests stay unchanged apart
# from the corrected 'Referer' key -- confirm whether they can be dropped.
header_values = {
    'name': 'Michael Foord',
    'location': 'Northampton',
    'language': 'English',
    'User-Agent': 'Mozilla 4/0',
    'Accept-Encoding': 'gzip',
    'Accept-Language': 'en-US,en;q=0.9,es;q=0.8',
    'Upgrade-Insecure-Requests': '0',
    # The HTTP header field is spelled "Referer" (single "r" in the middle).
    'Referer': 'https://www.google.com/',
}
|
|
|
|
def getLinks(max_results=2000, page_size=30, max_attempts=3):
    """Collect distinct recipe links from the paginated chefkoch.de listing.

    Requests ``https://www.chefkoch.de/rs/s<offset>/Rezepte.html`` for every
    offset in ``range(0, max_results, page_size)`` and gathers each ``href``
    matched by ``/html/body/main/article/a/@href``.

    Args:
        max_results: Exclusive upper bound for the listing offset.
        page_size: Step between the offsets of two consecutive pages.
        max_attempts: Number of tries per page before the page is skipped.

    Returns:
        list: The distinct links, in the order they were first seen.
    """
    links = []
    seen = set()  # O(1) duplicate check; ``links`` keeps the order
    with requests.Session() as session:
        for offset in range(0, max_results, page_size):
            url = "https://www.chefkoch.de/rs/s" + str(offset) + "/Rezepte.html"

            # Retry the same page a bounded number of times. Changing the
            # loop variable of a ``for`` loop would not repeat the page.
            for attempt in range(1, max_attempts + 1):
                try:
                    site = session.get(url, headers=header_values)
                    # Treat HTTP error statuses as failures so they are retried
                    # instead of being parsed as an (empty) listing page.
                    site.raise_for_status()
                    tree = html.fromstring(site.content)
                except Exception as e:
                    # Best effort: report the problem, wait, then try again.
                    print(e)
                    if attempt < max_attempts:
                        sleep(10)
                    continue

                # only add new links
                for href in tree.xpath('/html/body/main/article/a/@href'):
                    if href not in seen:
                        seen.add(href)
                        links.append(href)
                print(offset)
                break

            # Random pause between pages to avoid hammering the server.
            sleep(random.randint(0, 5))

    print(links)
    return links
|
|
|
|
def _encode_image(path):
    """Return the image at ``path`` re-encoded as JPEG, base64-encoded (bytes)."""
    image = cv2.imread(path)
    ret, jpeg = cv2.imencode(".jpeg", image)
    return base64.b64encode(jpeg)


def _ingredient_name(cell):
    """Return the space-free text of ``cell``'s first child element.

    When the first child carries no text of its own, the text of its own
    first child is used instead (the name is then wrapped in a nested element).
    """
    first = cell[0]
    text = first.text
    if text is None:
        text = first[0].text
    return text.strip().replace(" ", "")


def _ingredient_amount(cell):
    """Return the space-free text of ``cell``'s first child, or "" if absent."""
    try:
        return cell[0].text.strip().replace(" ", "")
    except (IndexError, AttributeError):
        # No child element (IndexError) or child without text (AttributeError).
        return ""


def getRecipe(links):
    """Download every recipe page in ``links``, store it and return a summary.

    For each link the page is fetched and parsed; name, instructions and the
    ingredient table are extracted with fixed XPath expressions. A ``Recipe``
    with one ``Link``/``Ingredient`` pair per table row is added to a database
    session and committed. Any error while handling one link is printed as a
    traceback and the loop continues with the next link.

    Args:
        links: Sized iterable of recipe page URLs.

    Returns:
        dict: Maps recipe name to
        ``[instructions, {ingredient: amount}, url, base64 image as str]``.
    """
    namePath = "/html/body/main/article[1]/div/div[2]/h1/text()"
    ingredPath = "/html/body/main/article[2]/table/tbody/tr/td"  # TODO: fix this
    recipPath = "/html/body/main/article[3]/div[1]/text()"
    imgPath = './data/images.jpeg'

    recs = {}
    img = None  # the same local image is used for every recipe; encoded once
    with requests.Session() as session:
        for counter, link in enumerate(links, 1):
            try:
                site = session.get(link, headers=header_values)
                tree = html.fromstring(site.content)

                name = tree.xpath(namePath)[0]
                ingred = tree.xpath(ingredPath)
                resString = "".join(tree.xpath(recipPath))

                if img is None:
                    # Done lazily inside the try block so an unreadable image
                    # is reported per recipe like every other failure.
                    img = _encode_image(imgPath)

                r = Recipe(name=name, instructions=resString, url=link, img=img)

                # The matched cells alternate: amount cell, then name cell.
                # A trailing unpaired cell is ignored.
                ingredDict = {}
                for amount_cell, name_cell in zip(ingred[0::2], ingred[1::2]):
                    stuff = _ingredient_name(name_cell)
                    amount = _ingredient_amount(amount_cell)
                    a = Link(ingredient_amount=amount)
                    a.ingredient = Ingredient(name=stuff)
                    r.ingredient.append(a)
                    ingredDict[stuff] = amount

                # NOTE(review): assumes Session() returns a SQLAlchemy-style
                # session exposing rollback() and close() -- confirm against
                # application.db.
                dbSession = Session()
                try:
                    # One add/commit per recipe, after all ingredients are attached.
                    dbSession.add(r)
                    dbSession.commit()
                except Exception:
                    dbSession.rollback()
                    raise
                finally:
                    dbSession.close()

                recs[name] = [resString, ingredDict, link, img.decode("utf-8")]
                print("")
            except Exception:
                # Best effort: one broken page must not stop the whole crawl.
                print(traceback.format_exc())

            # Progress as a fraction of all links, then a random pause.
            print(format(counter / len(links), '.2f'), link)
            sleep(random.randint(0, 5))
    return recs
|
|
|
|
# The link list is not crawled on every run. To refresh it, call getLinks()
# and write the result as JSON to ./data/links.json; this script then reads
# that file.
with open('./data/links.json') as links_file:
    links = json.loads(links_file.read())

# Scrape and store every linked recipe, then keep a JSON copy of the result.
recs = getRecipe(links)

with open('./data/recs.json', 'w', encoding="utf-8") as recs_file:
    json.dump(recs, recs_file, ensure_ascii=False)
|
|
|