# Inverse-Rezeptsuche/mine.py — scrapes recipes from chefkoch.de
# -*- coding: utf-8 -*-
from urllib.parse import urljoin
from lxml import html
import requests
import json
from time import sleep
import random
import traceback
import cv2
import base64
from application.db import Session, Recipe, Ingredient, Link
# HTTP headers sent with every request so the crawl looks like a regular
# browser visit coming from a Google search.
# NOTE(review): 'name'/'location'/'language' are not real HTTP header fields
# (they look like leftovers from a urllib example); servers ignore them.
header_values = {
    'name': 'Michael Foord',
    'location': 'Northampton',
    'language': 'English',
    'User-Agent': 'Mozilla 4/0',
    'Accept-Encoding': 'gzip',
    'Accept-Language': 'en-US,en;q=0.9,es;q=0.8',
    'Upgrade-Insecure-Requests': '0',
    # Bug fix: the HTTP request header is spelled 'Referer' (single r, the
    # historical misspelling baked into the standard). The original
    # 'Referrer' key was sent verbatim and ignored by the server.
    'Referer': 'https://www.google.com/',
}
def getLinks():
    """Collect recipe URLs from the paginated chefkoch.de search results.

    Pages through the result listing 30 recipes at a time and gathers the
    recipe links from each page.

    Returns:
        list[str]: unique recipe URLs, in the order first seen.
    """
    links = []
    total = 2000  # get 2000 recipes :)
    with requests.Session() as session:
        offset = 0
        # Bug fix: the original used `for i in range(0, max, 30)` and did
        # `i -= 30` in the except branch to retry a failed page — that is a
        # no-op, because the loop variable is reassigned by `range` on every
        # iteration. A while-loop makes the retry actually work: `offset` is
        # only advanced after a successful page.
        while offset < total:
            try:
                url = "https://www.chefkoch.de/rs/s" + \
                    str(offset) + "/Rezepte.html"
                site = session.get(url, headers=header_values)
                tree = html.fromstring(site.content)
                # Parse the result count ("344.621 Ergebnisse" -> 344621).
                # The value is not used to extend the crawl beyond `total`,
                # but a failed lookup raises and triggers a retry — it doubles
                # as a sanity check that we received a real result page.
                int(tree.xpath(
                    '/html/body/main/div[1]/h1/span/text()')[0].split(" ")[0].replace(".", ""))
                # only add new links
                for href in tree.xpath('/html/body/main/article/a/@href'):
                    if href not in links:
                        links.append(href)
                print(offset)
                offset += 30
            except Exception as e:
                # Leave `offset` unchanged so the same page is retried after
                # a 10 second back-off.
                print(e)
                sleep(10)
            # Random pause between requests to stay below rate limits.
            sleep(random.randint(0, 5))
    print(links)
    return links
def getRecipe(links):
    """Scrape every recipe page in `links` and persist it to the database.

    For each URL this extracts the recipe name, the ingredient table
    (alternating amount / name cells) and the instruction text, stores them
    as a `Recipe` row with linked `Ingredient`s, and collects everything into
    the returned dict:

        {name: [instructions, {ingredient: amount}, url, base64_jpeg_str]}

    Pages that fail to parse are logged (full traceback) and skipped.
    """
    recs = dict()
    with requests.Session() as session:
        counter = 0  # links processed so far, for the progress printout
        for link in links:
            counter += 1
            try:
                site = session.get(link, headers=header_values)
                tree = html.fromstring(site.content)
                namePath = "/html/body/main/article[1]/div/div[2]/h1/text()"
                ingredPath = "/html/body/main/article[2]/table/tbody/tr/td"  # TODO: fix this
                recipPath = "/html/body/main/article[3]/div[1]/text()"
                imgPath = './data/images.jpeg'

                name = tree.xpath(namePath)[0]
                ingred = tree.xpath(ingredPath)
                resip = tree.xpath(recipPath)

                # Placeholder image: re-encode the local JPEG and store it
                # base64-encoded (the real recipe photo is not downloaded).
                image = cv2.imread(imgPath)
                ret, jpeg = cv2.imencode(".jpeg", image)
                img = base64.b64encode(jpeg)

                # Bug fix: join instead of `+=` in a loop — linear instead of
                # potentially quadratic string building.
                resString = "".join(resip)

                r = Recipe(name=name, instructions=resString, url=link, img=img)
                ingredDict = {}

                # Table cells alternate: even index = amount, odd index = name.
                for i in range(0, len(ingred) - 1, 2):
                    nameCell = ingred[i + 1][0]
                    if nameCell is not None:
                        if nameCell.text is None:
                            # Name is wrapped in a child element (a link).
                            stuff = nameCell[0].text.strip().replace(" ", "")
                        else:
                            stuff = nameCell.text.strip().replace(" ", "")

                        # Bug fix: reset per ingredient — the original carried
                        # the previous row's amount over when the amount cell
                        # was missing or empty.
                        amount = ""
                        if ingred[i] is not None:
                            try:
                                amount = ingred[i][0].text.strip().replace(" ", "")
                            except (AttributeError, IndexError):
                                amount = ""  # empty amount cell

                        a = Link(ingredient_amount=amount)
                        a.ingredient = Ingredient(name=stuff)
                        r.ingredient.append(a)
                        ingredDict[stuff] = amount

                # Bug fix: commit once per recipe (the original re-added and
                # re-committed inside the ingredient loop) and always close
                # the session so database connections are not leaked.
                dbSession = Session()
                try:
                    dbSession.add(r)
                    dbSession.commit()
                finally:
                    dbSession.close()

                recs[name] = [resString, ingredDict, link, img.decode("utf-8")]
                print("")
            except Exception:
                # A single broken page must not abort the whole crawl.
                print(traceback.format_exc())
            print(format(counter / len(links), '.2f'), link)
            # Random pause between requests to stay below rate limits.
            sleep(random.randint(0, 5))
    return recs
# Mining entry point: load the previously collected recipe links, scrape
# each recipe, and dump the results to JSON.
#
# To regenerate the link list first, uncomment:
# links = getLinks()
# with open('./data/links.json', 'w') as file:
#     file.write(json.dumps(links))

with open('./data/links.json', encoding="utf-8") as file:
    links = json.load(file)

recs = getRecipe(links)

# ensure_ascii=False keeps umlauts etc. readable in the output file.
with open('./data/recs.json', 'w', encoding="utf-8") as file:
    json.dump(recs, file, ensure_ascii=False)