Inverse-Rezeptsuche/app/background/mine.py

163 lines
5.4 KiB
Python

# -*- coding: utf-8 -*-
from urllib.parse import urljoin
from lxml import html
import requests
import json
from time import sleep
import random
import traceback
import cv2
import base64
from application.db import Session, Recipe, Ingredient, Trunk
import nltk as nltk
from nltk.corpus import stopwords
# Headers sent with every request made by the scraper functions in this file.
# NOTE(review): 'name', 'location' and 'language' are not standard HTTP header
# fields; they are kept unchanged because they are still transmitted as-is --
# confirm whether they can be dropped.
header_values = {
    'name': 'Michael Foord',
    'location': 'Northampton',
    'language': 'English',
    'User-Agent': 'Mozilla 4/0',
    'Accept-Encoding': 'gzip',
    'Accept-Language': 'en-US,en;q=0.9,es;q=0.8',
    'Upgrade-Insecure-Requests': '0',
    # The HTTP header field is spelled "Referer" (single "r"); the previous
    # key 'Referrer' was sent as an unknown header.
    'Referer': 'https://www.google.com/',
}
def getLinks(start=2000, stop=10000, step=30, max_retries=3, timeout=30):
    """Collect recipe links from the chefkoch.de search-result pages.

    Result pages are addressed by an offset in the URL
    (``/rs/s<offset>/Rezepte.html``). Every page in
    ``range(start, stop, step)`` is fetched and the ``href`` of each
    ``/html/body/main/article/a`` element is collected. The total result
    count shown on the page is not used; the crawl window is fixed by the
    arguments.

    Args:
        start: First result offset to request.
        stop: Exclusive upper bound for the result offset.
        step: Offset increment between two pages.
        max_retries: How many times a failing page is requested again
            before it is skipped.
        timeout: Timeout in seconds passed to every HTTP request.

    Returns:
        List of the collected ``href`` values without duplicates, in the
        order in which they were first seen.
    """
    links = []
    seen = set()  # constant-time duplicate check; ``links`` keeps the order
    with requests.Session() as session:
        # The first result page is requested once before crawling. Its
        # response is not used here; presumably it primes the session
        # (cookies) -- TODO confirm whether this request is still needed.
        session.get("https://www.chefkoch.de/rs/s0/Rezepte.html",
                    headers=header_values, timeout=timeout)
        for offset in range(start, stop, step):
            url = "https://www.chefkoch.de/rs/s" + str(offset) + "/Rezepte.html"
            # Decrementing the loop variable of a ``for ... in range()`` loop
            # does not repeat an iteration, so retries get their own loop.
            for _ in range(max_retries + 1):
                try:
                    site = session.get(url, headers=header_values,
                                       timeout=timeout)
                    tree = html.fromstring(site.content)
                    # only add new links
                    for href in tree.xpath('/html/body/main/article/a/@href'):
                        if href not in seen:
                            seen.add(href)
                            links.append(href)
                except Exception as e:
                    # Best-effort crawl: report the problem, wait 10 seconds
                    # and request the same page again.
                    print(e)
                    sleep(10)
                else:
                    print(offset)
                    break
            else:
                print("giving up on", url)
            # random pause between two pages
            sleep(random.randint(1, 4))
    print(links)
    return links
def _encode_placeholder_image(path):
    """Return the image stored at *path* as base64 text of its JPEG encoding."""
    image = cv2.imread(path)
    ret, jpeg = cv2.imencode(".jpeg", image)
    return base64.b64encode(jpeg).decode("utf-8")


def _parse_ingredients(cells):
    """Build an ingredient -> amount dict from the ingredient table cells.

    ``cells`` is read pairwise: the first cell of each pair supplies the
    amount, the second one the ingredient. A trailing unpaired cell is
    ignored. All spaces are removed from both texts.
    """
    ingredients = {}
    for amount_cell, name_cell in zip(cells[0::2], cells[1::2]):
        name_node = name_cell[0]
        if name_node.text is None:
            # No direct text: take it from the first nested element
            # (presumably the ingredient is wrapped in a link).
            name_node = name_node[0]
        stuff = name_node.text.strip().replace(" ", "")
        try:
            amount = amount_cell[0].text.strip().replace(" ", "")
        except (IndexError, AttributeError):
            # amount cell without a child element or without text
            amount = ""
        ingredients[stuff] = amount
    return ingredients


def getRecipe(links):
    """Download the recipe pages behind *links* and extract their content.

    Every recipe gets the same image: the file ``./data/images.jpeg`` is
    read and encoded once and reused for all entries. Pages that cannot be
    fetched or parsed are reported on stdout and skipped.

    Args:
        links: Sized collection of recipe page URLs.

    Returns:
        Dict mapping the recipe name to a list
        ``[instructions, ingredients, url, image]`` where ``ingredients``
        is a dict ``ingredient -> amount`` and ``image`` is the base64 text
        of the JPEG-encoded image. Empty if *links* is empty or the image
        could not be encoded.
    """
    recs = dict()
    if not links:
        return recs

    # The image does not depend on the link, so read and encode it only once.
    try:
        img = _encode_placeholder_image('./data/images.jpeg')
    except Exception:
        # Without the image no recipe can be stored; report the problem and
        # skip the crawl instead of failing on every single page.
        print(traceback.format_exc())
        return recs

    namePath = "/html/body/main/article[1]/div/div[2]/h1/text()"
    ingredPath = "/html/body/main/article[2]/table/tbody/tr/td"  # TODO: fix this
    recipPath = "/html/body/main/article[3]/div[1]/text()"

    with requests.Session() as session:
        for counter, link in enumerate(links, start=1):
            try:
                # the timeout keeps a stalled connection from blocking the crawl
                site = session.get(link, headers=header_values, timeout=30)
                tree = html.fromstring(site.content)
                name = tree.xpath(namePath)[0]
                resString = "".join(tree.xpath(recipPath))
                ingredDict = _parse_ingredients(tree.xpath(ingredPath))
                recs[name] = [resString, ingredDict, link, img]
                if counter % 20 == 0:
                    print(counter)
            except Exception:
                # Best-effort crawl: report the failing page and continue.
                print(traceback.format_exc())
                print(format(counter/len(links), '.2f'), link)
            # random pause between two pages
            sleep(random.randint(0, 6))
    return recs
def stemIngred():
    """Attach stemmed word trunks to every ingredient stored in the database.

    Each ingredient name is tokenized; tokens that are German stop words,
    one of the characters ``(``, ``)``, ``,`` or shorter than four
    characters are skipped. Every remaining token is stemmed with the German
    Snowball stemmer and appended to the ingredient's ``trunks`` as a new
    ``Trunk``. Changes are committed once per ingredient, and the ratio
    ``ingredient_id / number of ingredients`` is printed after each one.
    """
    dbSession = Session()
    try:
        stopset = set(stopwords.words('german'))
        stopset |= set("(),")
        # One stemmer instance is enough for all ingredients.
        snowball = nltk.SnowballStemmer(language='german')
        count = dbSession.query(Ingredient).count()
        for ingredient in dbSession.query(Ingredient).all():
            for token in nltk.word_tokenize(ingredient.name):
                if token in stopset or len(token) < 4:
                    continue
                ingredient.trunks.append(Trunk(name=snowball.stem(token)))
            dbSession.commit()
            print(ingredient.ingredient_id/count)
    finally:
        # Release the session even if tokenizing or committing fails.
        # NOTE(review): assumes ``Session`` yields an SQLAlchemy-style session
        # that provides ``close()`` -- confirm against application.db.
        dbSession.close()
#links = getLinks()
#with open('./data/links.json', 'w') as file:
# jsonString = json.dumps(links)
# file.write(jsonString)
#links = ""
#with open('./data/links.json') as file:
# links = json.load(file)
#recs = getRecipe(links)
#stemIngred()
#with open('./data/recs.json', 'w', encoding="utf-8") as file:
# json.dump(recs, file, ensure_ascii=False)