slight refactor of the ranking function

This commit is contained in:
Askill 2020-07-07 14:06:13 +02:00
parent 7067130d36
commit a30e99b7ae
2 changed files with 54 additions and 50 deletions

View File

@ -21,64 +21,55 @@ def search2(inputArr):
return(indx) return(indx)
def stemInput(l1):
def stemInput(inputArr):
''' returns array of stemmed input ''' ''' returns array of stemmed input '''
inputArr2 = []
snowball = nltk.SnowballStemmer(language='german') snowball = nltk.SnowballStemmer(language='german')
stopset = set(stopwords.words('german')) stopset = set(stopwords.words('german'))
for word in inputArr: stopset |= set("(),")
if word in stopset: l1 = [snowball.stem(l) for l in l1]
continue return l1
inputArr2.append(snowball.stem(word))
return inputArr2
# TODO: split into more functions
def getRecDict2(indx, inputArr):
dbSession = db.Session()
outDict = {} def findUnrecognized(dbSession, inputArr):
# 2d to 1d ''' check if any input is not in db.Trunk'''
indx = sum(indx.values(), []) ignored = []
k = Counter(indx) for x in inputArr:
indx = k.most_common(1000) if not dbSession.query(exists().where(db.Trunk.name == x)).scalar():
indx = dict(indx) ignored.append(inputArr.index(x))
return ignored
ingred = [x for x in dbSession.query(db.Recipe.recipe_id, db.IngredTrunk.trunk_name, db.IngredTrunk.ingredient_name).filter( def getIngredDict(ingred):
db.Recipe.recipe_id.in_(indx.keys())).join(db.RecIngred).join(db.Ingredient).join(db.IngredTrunk).all()] ''' RezeptID, stemmed Ingred, full ingred Name
Dict spiegelt DB wieder, key, full ingred, stemmed
this structure makes calcOverlay() more efficient '''
ingredDict = {} ingredDict = {}
# RezeptID, stemmed Ingred, full ingred Name
# Dict spiegelt DB wieder, key, full ingred, stemmed
# this structure makes calcOverlay() more efficient
for k, v, i in ingred: for k, v, i in ingred:
if k not in ingredDict: if k not in ingredDict:
ingredDict[k] = {} ingredDict[k] = {}
if i not in ingredDict[k]: if i not in ingredDict[k]:
ingredDict[k][i] = [] ingredDict[k][i] = []
ingredDict[k][i].append(v) ingredDict[k][i].append(v)
return ingredDict
# check if any input is not in db.Trunk def calcOverlay(inputArr, ingredDict):
ignored = [] '''checks overlay per recipeID
for x in inputArr: iterate over ingreds and checks per stemmed ingred
if not dbSession.query(exists().where(db.Trunk.name == x)).scalar(): returns accurate percentage of overlay
ignored.append(inputArr.index(x)) since overlay scare is the key of dict it is reduced by insignificant number to preserve all values'''
inputArr += defaultArr outDict = {}
# checks overlay per recipeID
# iterate over ingreds and checks per stemmed ingred
# returns accurate percentage of overlay
# since overlay scare is the key of dict it is reduced by insignificant number to preserve all values
for key, value in ingredDict.items(): for key, value in ingredDict.items():
overlay, missing = calcOverlay2(inputArr, value) overlay, missing = calcOverlay2(inputArr, value)
while overlay in outDict.keys(): while overlay in outDict.keys():
overlay -= 0.0001 overlay -= 0.0001
outDict[overlay] = (int(key), missing) outDict[overlay] = (int(key), missing)
return outDict
# return Dict with 20 highest value keys
# creates dict which is returned def resolvRecipe(dbSession, outDict):
''' return Dict with 20 highest value keys
creates dict which is returned'''
outDict2 = {} outDict2 = {}
for key in heapq.nlargest(20, outDict.keys()): for key in heapq.nlargest(20, outDict.keys()):
key2 = outDict[key][0] key2 = outDict[key][0]
@ -87,20 +78,34 @@ def getRecDict2(indx, inputArr):
db.Recipe.recipe_id == key2).first() db.Recipe.recipe_id == key2).first()
outDict2[key] = (key2, rec.name, rec.url, [r[0] + ": " + r[1] for r in dbSession.query(db.Ingredient.name, outDict2[key] = (key2, rec.name, rec.url, [r[0] + ": " + r[1] for r in dbSession.query(db.Ingredient.name,
db.RecIngred.ingredient_amount).join(db.RecIngred).join(db.Recipe).filter(db.Recipe.recipe_id == key2).all()], missing) db.RecIngred.ingredient_amount).join(db.RecIngred).join(db.Recipe).filter(db.Recipe.recipe_id == key2).all()], missing)
return outDict2
def getRecDict2(indx, inputArr):
'''returns dict with percentage of overlay as keys and recipes as values'''
dbSession = db.Session()
# 2d to 1d
indx = sum(indx.values(), [])
k = Counter(indx)
# keep 1000 most relevant to have consistent query time
indx = k.most_common(1000)
indx = dict(indx)
# get ingredients for all recipes
ingred = [x for x in dbSession.query(db.Recipe.recipe_id, db.IngredTrunk.trunk_name, db.IngredTrunk.ingredient_name).filter(
db.Recipe.recipe_id.in_(indx.keys())).join(db.RecIngred).join(db.Ingredient).join(db.IngredTrunk).all()]
ingredDict = getIngredDict(ingred)
ignored = findUnrecognized(dbSession, inputArr)
inputArr += stemInput(defaultArr)
outDict = calcOverlay(inputArr, ingredDict)
ingreds = resolvRecipe(dbSession, outDict)
outDict = {} outDict = {}
outDict["ignored"] = ignored outDict["ignored"] = ignored
outDict["ingred"] = outDict2 outDict["ingred"] = ingreds
return outDict return outDict
def stem(l1):
snowball = nltk.SnowballStemmer(language='german')
stopset = set(stopwords.words('german'))
stopset |= set("(),")
l1 = [snowball.stem(l) for l in l1]
return l1
def calcOverlay2(l1, l2): def calcOverlay2(l1, l2):
'''Calculates overlay and returns missing ingredients, [score (float), missing([])]''' '''Calculates overlay and returns missing ingredients, [score (float), missing([])]'''
counter = 0 counter = 0

View File

@ -1,10 +1,9 @@
from application import app from application import app
from application.search import defaultArr, stem from application.search import defaultArr, stemInput
import nltk import nltk
nltk.download('stopwords') nltk.download('stopwords')
nltk.download('punkt') nltk.download('punkt')
defaultArr = stem(defaultArr)
app.run(host="0.0.0.0", port='5001', debug=False, threaded=True) app.run(host="0.0.0.0", port='5001', debug=False, threaded=True)