From a30e99b7aea982999ff79532a9136cfcb6a2fe14 Mon Sep 17 00:00:00 2001 From: Askill Date: Tue, 7 Jul 2020 14:06:13 +0200 Subject: [PATCH] slight refactor of the ranking function --- app/application/search.py | 101 ++++++++++++++++++++------------------ app/run.py | 3 +- 2 files changed, 54 insertions(+), 50 deletions(-) diff --git a/app/application/search.py b/app/application/search.py index c3c913a..6f4827a 100644 --- a/app/application/search.py +++ b/app/application/search.py @@ -21,64 +21,55 @@ def search2(inputArr): return(indx) - -def stemInput(inputArr): +def stemInput(l1): ''' returns array of stemmed input ''' - inputArr2 = [] - snowball = nltk.SnowballStemmer(language='german') stopset = set(stopwords.words('german')) - for word in inputArr: - if word in stopset: - continue - inputArr2.append(snowball.stem(word)) - return inputArr2 + stopset |= set("(),") + l1 = [snowball.stem(l) for l in l1] + return l1 -# TODO: split into more functions -def getRecDict2(indx, inputArr): - dbSession = db.Session() - outDict = {} - # 2d to 1d - indx = sum(indx.values(), []) - k = Counter(indx) - indx = k.most_common(1000) - indx = dict(indx) +def findUnrecognized(dbSession, inputArr): + ''' check if any input is not in db.Trunk''' + ignored = [] + for x in inputArr: + if not dbSession.query(exists().where(db.Trunk.name == x)).scalar(): + ignored.append(inputArr.index(x)) + return ignored - ingred = [x for x in dbSession.query(db.Recipe.recipe_id, db.IngredTrunk.trunk_name, db.IngredTrunk.ingredient_name).filter( - db.Recipe.recipe_id.in_(indx.keys())).join(db.RecIngred).join(db.Ingredient).join(db.IngredTrunk).all()] +def getIngredDict(ingred): + ''' RezeptID, stemmed Ingred, full ingred Name + Dict spiegelt DB wieder, key, full ingred, stemmed + this structure makes calcOverlay() more efficient ''' + ingredDict = {} - - # RezeptID, stemmed Ingred, full ingred Name - # Dict spiegelt DB wieder, key, full ingred, stemmed - # this structure makes calcOverlay() more efficient for k, v, i in ingred: if k not in ingredDict: ingredDict[k] = {} if i not in ingredDict[k]: ingredDict[k][i] = [] ingredDict[k][i].append(v) + return ingredDict - # check if any input is not in db.Trunk - ignored = [] - for x in inputArr: - if not dbSession.query(exists().where(db.Trunk.name == x)).scalar(): - ignored.append(inputArr.index(x)) +def calcOverlay(inputArr, ingredDict): + '''checks overlay per recipeID + iterate over ingreds and checks per stemmed ingred + returns accurate percentage of overlay + since overlay scare is the key of dict it is reduced by insignificant number to preserve all values''' - inputArr += defaultArr - - # checks overlay per recipeID - # iterate over ingreds and checks per stemmed ingred - # returns accurate percentage of overlay - # since overlay scare is the key of dict it is reduced by insignificant number to preserve all values + outDict = {} for key, value in ingredDict.items(): overlay, missing = calcOverlay2(inputArr, value) while overlay in outDict.keys(): overlay -= 0.0001 outDict[overlay] = (int(key), missing) - - # return Dict with 20 highest value keys - # creates dict which is returned + return outDict + +def resolvRecipe(dbSession, outDict): + ''' return Dict with 20 highest value keys + creates dict which is returned''' + outDict2 = {} for key in heapq.nlargest(20, outDict.keys()): key2 = outDict[key][0] @@ -87,20 +78,34 @@ def getRecDict2(indx, inputArr): db.Recipe.recipe_id == key2).first() outDict2[key] = (key2, rec.name, rec.url, [r[0] + ": " + r[1] for r in dbSession.query(db.Ingredient.name, db.RecIngred.ingredient_amount).join(db.RecIngred).join(db.Recipe).filter(db.Recipe.recipe_id == key2).all()], missing) + return outDict2 + +def getRecDict2(indx, inputArr): + '''returns dict with percentage of overlay as keys and recipes as values''' + dbSession = db.Session() + + # 2d to 1d + indx = sum(indx.values(), []) + k = Counter(indx) + # keep 1000 most relevant to have consistent query time + indx = k.most_common(1000) + indx = dict(indx) + + # get ingredients for all recipes + ingred = [x for x in dbSession.query(db.Recipe.recipe_id, db.IngredTrunk.trunk_name, db.IngredTrunk.ingredient_name).filter( + db.Recipe.recipe_id.in_(indx.keys())).join(db.RecIngred).join(db.Ingredient).join(db.IngredTrunk).all()] + + ingredDict = getIngredDict(ingred) + ignored = findUnrecognized(dbSession, inputArr) + inputArr += stemInput(defaultArr) + outDict = calcOverlay(inputArr, ingredDict) + ingreds = resolvRecipe(dbSession, outDict) + outDict = {} outDict["ignored"] = ignored - outDict["ingred"] = outDict2 + outDict["ingred"] = ingreds return outDict - -def stem(l1): - snowball = nltk.SnowballStemmer(language='german') - stopset = set(stopwords.words('german')) - stopset |= set("(),") - l1 = [snowball.stem(l) for l in l1] - return l1 - - def calcOverlay2(l1, l2): '''Calculates overlay and returns missing ingredients, [score (float), missing([])]''' counter = 0 diff --git a/app/run.py b/app/run.py index 3ee7efe..1b4832a 100644 --- a/app/run.py +++ b/app/run.py @@ -1,10 +1,9 @@ from application import app -from application.search import defaultArr, stem +from application.search import defaultArr, stemInput import nltk nltk.download('stopwords') nltk.download('punkt') -defaultArr = stem(defaultArr) app.run(host="0.0.0.0", port='5001', debug=False, threaded=True)