Inverse-Rezeptsuche/app/application/search.py

129 lines
4.2 KiB
Python

import application.db2 as db
from flask import g
import nltk as nltk
from nltk.corpus import stopwords
import time
import heapq
from collections import Counter
import background.migrate
from sqlalchemy import exists
def search2(inputArr):
    """Map every input trunk to the ids of the recipes that use it.

    For each entry of ``inputArr`` the database is queried for all recipes
    reachable via Trunk -> IngredTrunk -> Ingredient -> RecIngred -> Recipe.

    Returns a dict ``{input: [recipe_id as str, ...]}``; inputs without any
    matching recipe map to an empty list.
    """
    indx = {}
    dbSession = db.Session()
    try:
        for trunk in inputArr:
            rows = (
                dbSession.query(db.Trunk.name, db.Recipe.recipe_id)
                .filter(db.Trunk.name == trunk)
                .join(db.IngredTrunk)
                .join(db.Ingredient)
                .join(db.RecIngred)
                .join(db.Recipe)
                .all()
            )
            # Each row is (trunk name, recipe id); only the id is kept, as str.
            indx[trunk] = [str(row[1]) for row in rows]
    finally:
        # Always release the session, even when a query fails.
        dbSession.close()
    return indx
def stemInput(l1):
    """Return the German Snowball stem of every entry of ``l1``.

    The result has the same length and order as the input; nothing is
    filtered out.
    """
    # The stop-word set the original built here was never used, so it is not
    # loaded any more (it only cost time and required the NLTK corpus).
    snowball = nltk.SnowballStemmer(language='german')
    return [snowball.stem(word) for word in l1]
def findUnrecognized(dbSession, inputArr):
    """Return the positions of all inputs that have no row in db.Trunk.

    ``dbSession`` is the session used for the existence queries and
    ``inputArr`` the sequence of trunk names to check.  The returned list
    holds the index of every unknown entry, in input order.
    """
    ignored = []
    # enumerate() yields the real position of each entry; list.index() would
    # report the first occurrence again for duplicated inputs.
    for position, name in enumerate(inputArr):
        known = dbSession.query(exists().where(db.Trunk.name == name)).scalar()
        if not known:
            ignored.append(position)
    return ignored
def getIngredDict(ingred):
    """Group ingredient rows into a nested dict that mirrors the database.

    ``ingred`` is an iterable of 3-tuples
    ``(recipe id, stemmed ingredient, full ingredient name)``.

    Returns ``{recipe id: {full ingredient name: [stemmed ingredient, ...]}}``;
    this layout lets calcOverlay() work through one recipe at a time.
    """
    ingredDict = {}
    for recipe_id, stem, full_name in ingred:
        per_recipe = ingredDict.setdefault(recipe_id, {})
        per_recipe.setdefault(full_name, []).append(stem)
    return ingredDict
def calcOverlay(inputArr, ingredDict):
    """Score every recipe by how much of it the given inputs cover.

    For each recipe in ``ingredDict`` the overlay fraction and the list of
    missing ingredients are obtained from calcOverlay2().

    Returns ``{score: (recipe id as int, missing ingredients)}``.  Because the
    score is the dict key, a score that is already taken is lowered in steps
    of 0.0001 until it is unique, so no recipe is overwritten.
    """
    outDict = {}
    for recipe_id, ingredients in ingredDict.items():
        score, missing = calcOverlay2(inputArr, ingredients)
        # Nudge colliding scores downwards until a free key is found.
        while score in outDict:
            score -= 0.0001
        outDict[score] = (int(recipe_id), missing)
    return outDict
def resolvRecipe(dbSession, outDict):
    """Load the details of the 20 best-scoring recipes.

    ``outDict`` maps a score to ``(recipe id, missing ingredients)``.

    Returns ``{score: (recipe id, name, url, ["ingredient: amount", ...],
    missing ingredients)}`` for the 20 highest scores.
    """
    outDict2 = {}
    for score in heapq.nlargest(20, outDict):
        recipe_id, missing = outDict[score][0], outDict[score][1]
        recipe = dbSession.query(db.Recipe).filter(
            db.Recipe.recipe_id == recipe_id).first()
        name, url = recipe.name, recipe.url
        amount_rows = (
            dbSession.query(db.Ingredient.name, db.RecIngred.ingredient_amount)
            .join(db.RecIngred)
            .join(db.Recipe)
            .filter(db.Recipe.recipe_id == recipe_id)
            .all()
        )
        # One display line per ingredient: "<name>: <amount>".
        lines = [row[0] + ": " + row[1] for row in amount_rows]
        outDict2[score] = (recipe_id, name, url, lines, missing)
    return outDict2
def getRecDict2(indx, inputArr):
    """Rank the candidate recipes by how well the inputs cover them.

    ``indx`` maps each input to the list of recipe ids using it (the result of
    search2()); ``inputArr`` is the list of stemmed user inputs.

    Returns a dict with two keys:
      * ``"ignored"`` - positions in ``inputArr`` that are unknown to db.Trunk
      * ``"ingred"``  - the best-scoring recipes as built by resolvRecipe(),
        keyed by overlay score

    ``inputArr`` itself is left untouched; the default ingredients are added
    to a copy.
    """
    dbSession = db.Session()
    try:
        # Count in how many of the per-input hit lists each recipe shows up.
        hits = Counter(
            recipe_id for ids in indx.values() for recipe_id in ids)
        # keep 1000 most relevant to have consistent query time
        candidates = [recipe_id for recipe_id, _ in hits.most_common(1000)]
        # get ingredients for all candidate recipes
        ingred = (
            dbSession.query(
                db.Recipe.recipe_id,
                db.IngredTrunk.trunk_name,
                db.IngredTrunk.ingredient_name,
            )
            .filter(db.Recipe.recipe_id.in_(candidates))
            .join(db.RecIngred)
            .join(db.Ingredient)
            .join(db.IngredTrunk)
            .all()
        )
        ingredDict = getIngredDict(ingred)
        # Only the user's own inputs are checked, not the defaults added below.
        ignored = findUnrecognized(dbSession, inputArr)
        # Build a new list instead of "+=" so the caller's list is not grown
        # by the default ingredients on every call.
        available = list(inputArr) + stemInput(defaultArr)
        outDict = calcOverlay(available, ingredDict)
        ingreds = resolvRecipe(dbSession, outDict)
    finally:
        # Always release the session, even when a query fails.
        dbSession.close()
    return {"ignored": ignored, "ingred": ingreds}
def calcOverlay2(l1, l2):
    """Calculate the overlay of one recipe with the available ingredients.

    ``l1`` holds the available (stemmed) ingredients.  ``l2`` maps each
    ingredient of the recipe to its list of stems; an ingredient counts as
    covered when at least one of its stems is in ``l1``.

    Returns ``(score, missing)``: ``score`` is the covered fraction of the
    recipe's ingredients as a float, ``missing`` the list of uncovered
    ingredient keys.  An empty ``l2`` yields ``(0.0, [])`` instead of dividing
    by zero.
    """
    if not l2:
        return 0.0, []
    # Build the lookup set once so every stem check is a constant-time test.
    available = set(l1)
    notIn = [
        name for name, stems in l2.items()
        if not any(stem in available for stem in stems)
    ]
    covered = len(l2) - len(notIn)
    return covered / len(l2), notIn
# Staple ingredients (water, salt, pepper) that every user is assumed to have.
# getRecDict2() stems these and adds them to the user's inputs before scoring.
defaultArr = ["wasser", "salz", "pfeffer"]