Inverse-Rezeptsuche/mine.py

# -*- coding: utf-8 -*-
from urllib.parse import urljoin
from lxml import html
import requests
import json
from time import sleep
import random
import traceback
import cv2
import base64
from application.db import Session, Recipe, Ingredient, Link, Trunk

# HTTP headers sent with every request made by the scraper.
# NOTE(review): 'name', 'location' and 'language' are not standard HTTP
# request headers; they are kept unchanged because they are sent as-is today.
header_values = {
    'name': 'Michael Foord',
    'location': 'Northampton',
    'language': 'English',
    'User-Agent': 'Mozilla 4/0',
    'Accept-Encoding': 'gzip',
    'Accept-Language': 'en-US,en;q=0.9,es;q=0.8',
    'Upgrade-Insecure-Requests': '0',
    # The HTTP header is spelled "Referer" (single "r" in the middle);
    # "Referrer" is not a header servers recognise.
    'Referer': 'https://www.google.com/',
}

def getLinks(limit=2000, pageSize=30, maxRetries=3, timeout=30):
    """Collect recipe links from the chefkoch.de search result pages.

    Result pages are requested at offsets ``0, pageSize, 2 * pageSize, ...``
    below ``limit``. Every ``href`` found under ``/html/body/main/article/a``
    is resolved against the page URL and appended once, in first-seen order.

    Args:
        limit: Exclusive upper bound for the result offset (default 2000).
        pageSize: Offset step between two result pages (default 30).
        maxRetries: How many times a single page is attempted before it is
            skipped.
        timeout: Timeout in seconds passed to each HTTP request.

    Returns:
        A list of unique recipe URLs.
    """
    links = []
    seen = set()  # O(1) duplicate check; `links` keeps the insertion order
    with requests.Session() as session:
        for offset in range(0, limit, pageSize):
            url = "https://www.chefkoch.de/rs/s" + str(offset) + "/Rezepte.html"

            # Decrementing the loop variable of a `for ... in range()` has no
            # effect, so the retry is an explicit, bounded inner loop.
            for attempt in range(1, maxRetries + 1):
                try:
                    site = session.get(url, headers=header_values,
                                       timeout=timeout)
                    # Treat HTTP error pages as failures so they are retried.
                    site.raise_for_status()
                    tree = html.fromstring(site.content)
                    hrefs = tree.xpath('/html/body/main/article/a/@href')
                except Exception as e:
                    # Best-effort crawler: report the problem and retry after
                    # 10 seconds, unless this was the last attempt.
                    print(e)
                    if attempt < maxRetries:
                        sleep(10)
                    continue

                # only add new links
                for href in hrefs:
                    absolute = urljoin(url, href)
                    if absolute not in seen:
                        seen.add(absolute)
                        links.append(absolute)
                print(offset)
                break

            # Random pause between pages to avoid hammering the server.
            sleep(random.randint(0, 5))

        print(links)
    return links

def _loadPlaceholderImage(path):
    """Read the image at ``path`` and return it as base64-encoded JPEG bytes."""
    image = cv2.imread(path)
    if image is None:
        # cv2.imread signals an unreadable file by returning None.
        raise FileNotFoundError("could not read image: " + path)
    ok, jpeg = cv2.imencode(".jpeg", image)
    if not ok:
        raise ValueError("could not encode image as JPEG: " + path)
    return base64.b64encode(jpeg)


def _ingredientName(cell):
    """Return the cleaned ingredient name from a table cell.

    The text is taken from the cell's first child, or from that child's own
    first child when the first child carries no text (e.g. a nested link).
    """
    node = cell[0]
    if node.text is None:
        node = node[0]
    return node.text.strip().replace("  ", "")


def _ingredientAmount(cell):
    """Return the cleaned amount from a table cell, or "" when it has none."""
    try:
        return cell[0].text.strip().replace("  ", "")
    except (IndexError, AttributeError):
        # No child element, or a child without text: amount is optional.
        return ""


def getRecipe(links, timeout=30):
    """Download every recipe page in ``links``, store it and return the data.

    For each link the recipe name, the instruction text and the ingredient
    table are extracted via XPath. The recipe and its ingredients are written
    to the database in one commit per recipe and also collected in the
    returned dict. A link that fails for any reason is reported with its
    traceback and skipped.

    Args:
        links: Sequence of recipe page URLs.
        timeout: Timeout in seconds passed to each HTTP request.

    Returns:
        Dict mapping recipe name to
        ``[instructions, {ingredient: amount}, url, base64 image text]``.

    Raises:
        FileNotFoundError: If the image ``./data/images.jpeg`` cannot be read.
    """
    namePath = "/html/body/main/article[1]/div/div[2]/h1/text()"
    ingredPath = "/html/body/main/article[2]/table/tbody/tr/td" # TODO: fix this
    recipPath = "/html/body/main/article[3]/div[1]/text()"
    imgPath = './data/images.jpeg'

    # The same local image is attached to every recipe, so it is read and
    # encoded once instead of once per link.
    img = _loadPlaceholderImage(imgPath)
    imgText = img.decode("utf-8")

    recs = dict()
    with requests.Session() as session:
        for counter, link in enumerate(links, start=1):
            try:
                site = session.get(link, headers=header_values,
                                   timeout=timeout)
                tree = html.fromstring(site.content)

                name = tree.xpath(namePath)[0]
                cells = tree.xpath(ingredPath)
                resString = "".join(tree.xpath(recipPath))

                r = Recipe(name=name, instructions=resString, url=link, img=img)

                # Cells alternate amount, name, amount, name, ...; a trailing
                # unpaired cell is ignored.
                ingredDict = {}
                for amountCell, nameCell in zip(cells[0::2], cells[1::2]):
                    stuff = _ingredientName(nameCell)
                    amount = _ingredientAmount(amountCell)
                    a = Link(ingredient_amount=amount)
                    a.ingredient = Ingredient(name=stuff)
                    r.ingredient.append(a)
                    ingredDict[stuff] = amount

                # Persist once per recipe (also when it has no ingredients)
                # and always release the session afterwards.
                # NOTE(review): assumes Session() returns an SQLAlchemy-style
                # session that provides close() - confirm in application.db.
                dbSession = Session()
                try:
                    dbSession.add(r)
                    dbSession.commit()
                finally:
                    dbSession.close()

                recs[name] = [resString, ingredDict, link, imgText]
                print("")
            except Exception:
                # Best-effort scraping: report the failure, go on with the
                # next link.
                print(traceback.format_exc())

            # Progress as a fraction of all links, then a random pause
            # between requests.
            print(format(counter/len(links), '.2f'), link)
            sleep(random.randint(0, 5))
    return recs

# To refresh the cached link list, run the crawler and store its result:
#links = getLinks()
#with open('./data/links.json', 'w') as file:
#    jsonString = json.dumps(links)
#    file.write(jsonString)

# Load the cached recipe links. JSON text is UTF-8, so the encoding is stated
# explicitly instead of relying on the platform default.
with open('./data/links.json', encoding="utf-8") as file:
    links = json.load(file)


# Scrape every recipe, then dump the collected data without escaping
# non-ASCII characters.
recs = getRecipe(links)

with open('./data/recs.json', 'w', encoding="utf-8") as file:
    json.dump(recs, file, ensure_ascii=False)