# Inverse-Rezeptsuche/mine.py


from urllib.parse import urljoin
from lxml import html
import requests
import json
from time import sleep
import random

# HTTP headers sent with every request made by this scraper.
# NOTE(review): 'name', 'location' and 'language' are not standard HTTP request
# headers; they are kept unchanged because the scraper has always sent them.
header_values = {
    'name': 'Michael Foord',
    'location': 'Northampton',
    'language': 'English',
    'User-Agent': 'Mozilla 4/0',
    'Accept-Encoding': 'gzip',
    'Accept-Language': 'en-US,en;q=0.9,es;q=0.8',
    'Upgrade-Insecure-Requests': '0',
    # The HTTP header name is spelled "Referer" (single "r"). The previous
    # key "Referrer" is not a header that servers recognise.
    'Referer': 'https://www.google.com/',
}

def getLinks(limit=2000, page_size=30, max_retries=3):
    """Collect recipe links from the chefkoch.de search result pages.

    Result pages are requested by offset (``.../rs/s<offset>/Rezepte.html``)
    in steps of ``page_size`` until ``limit`` is reached. Every link found is
    stored once, in the order it was first seen.

    A page that fails to download or parse is retried up to ``max_retries``
    times with a 10 second pause between attempts. The earlier version tried
    to retry by decrementing the ``for`` loop variable, which has no effect
    on a ``range`` loop, so failed pages were silently skipped. It also parsed
    the total result count on every page without using it; a missing heading
    raised and discarded that page's links, so the parse has been removed.

    Args:
        limit: Upper bound (exclusive) for the result offset.
        page_size: Offset step between two consecutive result pages.
        max_retries: Additional attempts per page after the first failure.

    Returns:
        List of the ``href`` values found, without duplicates.
    """
    links = []
    seen = set()  # constant-time duplicate check; `links` keeps the order
    with requests.Session() as session:
        for offset in range(0, limit, page_size):
            url = "https://www.chefkoch.de/rs/s" + str(offset) + "/Rezepte.html"

            for attempt in range(max_retries + 1):
                try:
                    site = session.get(url, headers=header_values, timeout=30)
                    # Treat HTTP error responses as failures so they are retried.
                    site.raise_for_status()
                    tree = html.fromstring(site.content)
                    hrefs = tree.xpath('/html/body/main/article/a/@href')
                except Exception as e:
                    # Best-effort scraping: report the problem and retry.
                    print(e)
                    if attempt < max_retries:
                        sleep(10)
                    continue

                # only add new links
                for href in hrefs:
                    if href not in seen:
                        seen.add(href)
                        links.append(href)
                print(offset)
                break

            # Random pause between pages to avoid hammering the server.
            sleep(random.randint(0, 5))

        print(links)
    return links

def _cell_text(cell):
    """Return the cleaned text of a table cell's first child element.

    If the first child carries no usable text of its own (for example
    because the text sits inside a nested link element), the text of that
    child's first child is used instead. Returns "" when nothing is found.
    """
    if len(cell) == 0:
        return ""
    node = cell[0]
    text = (node.text or "").strip()
    if not text and len(node) > 0:
        # Fall back to the nested element, e.g. an ingredient that is a link.
        text = (node[0].text or "").strip()
    return text.replace("  ", "")


def getRecipe(links):
    """Download every recipe page in ``links`` and extract its content.

    For each page the recipe name, the ingredient table and the preparation
    text are read via fixed XPath expressions. The ingredient table cells are
    consumed in pairs: the first cell of a pair is the amount, the second the
    ingredient.

    The previous implementation checked ``text is None`` inside a branch that
    had already established ``text is not None``, so the fallback to a nested
    element never ran and linked ingredients were stored as "".

    A page that cannot be downloaded or parsed is reported via ``print`` and
    skipped; the remaining links are still processed.

    Args:
        links: Iterable of recipe page URLs.

    Returns:
        Dict mapping recipe name to ``[instructions, ingredients]`` where
        ``instructions`` is the preparation text (one text node per line) and
        ``ingredients`` maps ingredient text to amount text. Entries with the
        same name or the same ingredient text overwrite each other.
    """
    recs = dict()
    namePath = "/html/body/main/article[1]/div/div[2]/h1/text()"
    ingredPath = "/html/body/main/article[2]/table/tbody/tr/td" # TODO: fix this
    recipPath = "/html/body/main/article[3]/div[1]/text()"

    with requests.Session() as session:
        for link in links:
            try:
                site = session.get(link, headers=header_values, timeout=30)
                tree = html.fromstring(site.content)

                name = tree.xpath(namePath)[0]
                ingred = tree.xpath(ingredPath)
                resip = tree.xpath(recipPath)

                # Every text node becomes one line, including the last one.
                resString = "".join(x + "\n" for x in resip)

                ingredDict = {}
                # Even cells hold the amount, odd cells the ingredient; a
                # trailing unpaired cell is ignored.
                for amount_cell, stuff_cell in zip(ingred[::2], ingred[1::2]):
                    ingredDict[_cell_text(stuff_cell)] = _cell_text(amount_cell)

                recs[name] = [resString, ingredDict]
                print("")
            except Exception as e:
                # Best-effort scraping: report the problem, skip this recipe.
                print(e)

            print(link)
            # Random pause between pages to avoid hammering the server.
            sleep(random.randint(0, 5))
    return recs

# Script entry: load the previously mined link list, scrape every recipe
# and store the result as JSON.
#
# The link list is only re-mined on demand; re-enable the lines below to do so.
#links = getLinks()
#with open('./data/links.json', 'w') as file:
#    jsonString = json.dumps(links)
#    file.write(jsonString)
with open('./data/links.json') as links_file:
    links = json.loads(links_file.read())

recs = getRecipe(links)

# Serialise first, then write the finished string in a single call.
serialized_recs = json.dumps(recs)
with open('./data/recs.json', 'w') as recs_file:
    recs_file.write(serialized_recs)
started mining 2020-04-03 17:29:15 +00:00
			`from urllib.parse import urljoin`
			`from lxml import html`
			`import requests`
			`import json`
			`from time import sleep`
			`import random`

funkt fast 2020-04-05 12:05:45 +00:00			`header_values = {`
			`'name': 'Michael Foord',`
			`'location': 'Northampton',`
			`'language': 'English',`
			`'User-Agent': 'Mozilla 4/0',`
			`'Accept-Encoding': 'gzip',`
			`'Accept-Language': 'en-US,en;q=0.9,es;q=0.8',`
			`'Upgrade-Insecure-Requests': '0',`
			`'Referrer': 'https://www.google.com/'`
			`}`
started mining 2020-04-03 17:29:15 +00:00
			`def getLinks():`
			`links = []`
			`with requests.Session() as session:`
			`root = "https://www.chefkoch.de/rs/s0/Rezepte.html"`
			`site = session.get(root, headers=header_values)`
			`tree = html.fromstring(site.content)`

			`# converts: 344.621 Ergebnisse to int(344621)`
funkt fast 2020-04-05 12:05:45 +00:00			`#max = int(tree.xpath(`
			`# '/html/body/main/div[1]/h1/span/text()')[0].split(" ")[0].replace(".", ""))`
			`max = 2000 # get 2000 recepies :)`
started mining 2020-04-03 17:29:15 +00:00			`for i in range(0, max, 30):`
			`try:`
			`root = "https://www.chefkoch.de/rs/s" + \`
			`str(i) + "/Rezepte.html"`
			`site = session.get(root, headers=header_values)`
			`tree = html.fromstring(site.content)`

			`# converts: 344.621 Ergebnisse to int(344621)`
			`max = int(tree.xpath(`
			`'/html/body/main/div[1]/h1/span/text()')[0].split(" ")[0].replace(".", ""))`
			`# only add new links`
			`for x in tree.xpath('/html/body/main/article/a/@href'):`
			`if x not in links:`
			`links.append(x)`
			`print(i)`

			`except Exception as e:`
			`# retry after 3 seconds`
			`print(e)`
			`i -= 30`
			`sleep(10)`

			`sleep(random.randint(0, 5))`

			`print(links)`
			`return links`

funkt fast 2020-04-05 12:05:45 +00:00			`def getRecipe(links):`
			`recs = dict()`
			`with requests.Session() as session:`
			`for link in links:`
			`try:`
			`site = session.get(link, headers=header_values)`
			`tree = html.fromstring(site.content)`

			`namePath = "/html/body/main/article[1]/div/div[2]/h1/text()"`
rezepte minen klappt fast einige ingred sind links, werden noch nicht ordentlich heruasgefiltert 2020-04-06 20:23:08 +00:00			`ingredPath = "/html/body/main/article[2]/table/tbody/tr/td" # TODO: fix this`
funkt fast 2020-04-05 12:05:45 +00:00			`recipPath = "/html/body/main/article[3]/div[1]/text()"`

			`name = tree.xpath(namePath)[0]`
			`ingred = tree.xpath(ingredPath)`
			`resip = tree.xpath(recipPath)`
rezepte minen klappt fast einige ingred sind links, werden noch nicht ordentlich heruasgefiltert 2020-04-06 20:23:08 +00:00
funkt fast 2020-04-05 12:05:45 +00:00			`resString = ""`
			`for x in resip:`
rezepte minen klappt fast einige ingred sind links, werden noch nicht ordentlich heruasgefiltert 2020-04-06 20:23:08 +00:00			`resString += x + "\n"`
funkt fast 2020-04-05 12:05:45 +00:00
rezepte minen klappt fast einige ingred sind links, werden noch nicht ordentlich heruasgefiltert 2020-04-06 20:23:08 +00:00			`ingredDict = {}`
funkt fast 2020-04-05 12:05:45 +00:00			`for i in range(0, len(ingred)-1, 2):`
rezepte minen klappt fast einige ingred sind links, werden noch nicht ordentlich heruasgefiltert 2020-04-06 20:23:08 +00:00			`#print(ingred[i+1][0].text)`
			`if ingred[i+1][0].text is not None:`
			`if ingred[i+1][0].text is None:`
			`stuff = ingred[i+1][0][0].text.strip().replace(" ", "")`
			`else:`
			`stuff = ingred[i+1][0].text.strip().replace(" ", "")`
			`else:`
			`stuff = ""`

			`if ingred[i][0].text is not None:`
			`if ingred[i][0].text is None:`
			`amount = ingred[i][0][0].text.strip().replace(" ", "")`
			`else:`
			`amount = ingred[i][0].text.strip().replace(" ", "")`
			`else:`
			`amount = ""`
			`ingredDict[stuff] = amount`
funkt fast 2020-04-05 12:05:45 +00:00			`recs[name] = [resString, ingredDict]`
			`print("")`
			`except Exception as e:`
			`print(e)`
rezepte minen klappt fast einige ingred sind links, werden noch nicht ordentlich heruasgefiltert 2020-04-06 20:23:08 +00:00
			`print(link)`
funkt fast 2020-04-05 12:05:45 +00:00			`sleep(random.randint(0, 5))`
rezepte minen klappt fast einige ingred sind links, werden noch nicht ordentlich heruasgefiltert 2020-04-06 20:23:08 +00:00			`return recs`
funkt fast 2020-04-05 12:05:45 +00:00
			`#links = getLinks()`
			`#with open('./data/links.json', 'w') as file:`
			`# jsonString = json.dumps(links)`
			`# file.write(jsonString)`
rezepte minen klappt fast einige ingred sind links, werden noch nicht ordentlich heruasgefiltert 2020-04-06 20:23:08 +00:00			`links = ""`
funkt fast 2020-04-05 12:05:45 +00:00			`with open('./data/links.json') as file:`
			`links = json.load(file)`

started mining 2020-04-03 17:29:15 +00:00
rezepte minen klappt fast einige ingred sind links, werden noch nicht ordentlich heruasgefiltert 2020-04-06 20:23:08 +00:00			`recs = getRecipe(links)`

			`with open('./data/recs.json', 'w') as file:`
			`jsonString = json.dumps(recs)`
			`file.write(jsonString)`