slight refactor of the ranking function

This commit is contained in:
Askill 2020-07-07 14:06:13 +02:00
parent 7067130d36
commit a30e99b7ae
2 changed files with 54 additions and 50 deletions

View File

@ -21,64 +21,55 @@ def search2(inputArr):
return(indx)
def stemInput(inputArr):
def stemInput(l1):
''' returns array of stemmed input '''
inputArr2 = []
snowball = nltk.SnowballStemmer(language='german')
stopset = set(stopwords.words('german'))
for word in inputArr:
if word in stopset:
continue
inputArr2.append(snowball.stem(word))
return inputArr2
stopset |= set("(),")
l1 = [snowball.stem(l) for l in l1]
return l1
# TODO: split into more functions
def getRecDict2(indx, inputArr):
dbSession = db.Session()
outDict = {}
# 2d to 1d
indx = sum(indx.values(), [])
k = Counter(indx)
indx = k.most_common(1000)
indx = dict(indx)
def findUnrecognized(dbSession, inputArr):
''' check if any input is not in db.Trunk'''
ignored = []
for x in inputArr:
if not dbSession.query(exists().where(db.Trunk.name == x)).scalar():
ignored.append(inputArr.index(x))
return ignored
def getIngredDict(ingred):
''' RezeptID, stemmed Ingred, full ingred Name
Dict spiegelt DB wieder, key, full ingred, stemmed
this structure makes calcOverlay() more efficient '''
ingred = [x for x in dbSession.query(db.Recipe.recipe_id, db.IngredTrunk.trunk_name, db.IngredTrunk.ingredient_name).filter(
db.Recipe.recipe_id.in_(indx.keys())).join(db.RecIngred).join(db.Ingredient).join(db.IngredTrunk).all()]
ingredDict = {}
# RezeptID, stemmed Ingred, full ingred Name
# Dict spiegelt DB wieder, key, full ingred, stemmed
# this structure makes calcOverlay() more efficient
for k, v, i in ingred:
if k not in ingredDict:
ingredDict[k] = {}
if i not in ingredDict[k]:
ingredDict[k][i] = []
ingredDict[k][i].append(v)
return ingredDict
# check if any input is not in db.Trunk
ignored = []
for x in inputArr:
if not dbSession.query(exists().where(db.Trunk.name == x)).scalar():
ignored.append(inputArr.index(x))
def calcOverlay(inputArr, ingredDict):
'''checks overlay per recipeID
iterate over ingreds and checks per stemmed ingred
returns accurate percentage of overlay
since overlay scare is the key of dict it is reduced by insignificant number to preserve all values'''
inputArr += defaultArr
# checks overlay per recipeID
# iterate over ingreds and checks per stemmed ingred
# returns accurate percentage of overlay
# since overlay scare is the key of dict it is reduced by insignificant number to preserve all values
outDict = {}
for key, value in ingredDict.items():
overlay, missing = calcOverlay2(inputArr, value)
while overlay in outDict.keys():
overlay -= 0.0001
outDict[overlay] = (int(key), missing)
return outDict
def resolvRecipe(dbSession, outDict):
''' return Dict with 20 highest value keys
creates dict which is returned'''
# return Dict with 20 highest value keys
# creates dict which is returned
outDict2 = {}
for key in heapq.nlargest(20, outDict.keys()):
key2 = outDict[key][0]
@ -87,20 +78,34 @@ def getRecDict2(indx, inputArr):
db.Recipe.recipe_id == key2).first()
outDict2[key] = (key2, rec.name, rec.url, [r[0] + ": " + r[1] for r in dbSession.query(db.Ingredient.name,
db.RecIngred.ingredient_amount).join(db.RecIngred).join(db.Recipe).filter(db.Recipe.recipe_id == key2).all()], missing)
return outDict2
def getRecDict2(indx, inputArr):
'''returns dict with percentage of overlay as keys and recipes as values'''
dbSession = db.Session()
# 2d to 1d
indx = sum(indx.values(), [])
k = Counter(indx)
# keep 1000 most relevant to have consistent query time
indx = k.most_common(1000)
indx = dict(indx)
# get ingredients for all recipes
ingred = [x for x in dbSession.query(db.Recipe.recipe_id, db.IngredTrunk.trunk_name, db.IngredTrunk.ingredient_name).filter(
db.Recipe.recipe_id.in_(indx.keys())).join(db.RecIngred).join(db.Ingredient).join(db.IngredTrunk).all()]
ingredDict = getIngredDict(ingred)
ignored = findUnrecognized(dbSession, inputArr)
inputArr += stemInput(defaultArr)
outDict = calcOverlay(inputArr, ingredDict)
ingreds = resolvRecipe(dbSession, outDict)
outDict = {}
outDict["ignored"] = ignored
outDict["ingred"] = outDict2
outDict["ingred"] = ingreds
return outDict
def stem(l1):
snowball = nltk.SnowballStemmer(language='german')
stopset = set(stopwords.words('german'))
stopset |= set("(),")
l1 = [snowball.stem(l) for l in l1]
return l1
def calcOverlay2(l1, l2):
'''Calculates overlay and returns missing ingredients, [score (float), missing([])]'''
counter = 0

View File

@ -1,10 +1,9 @@
from application import app
from application.search import defaultArr, stem
from application.search import defaultArr, stemInput
import nltk
nltk.download('stopwords')
nltk.download('punkt')
defaultArr = stem(defaultArr)
app.run(host="0.0.0.0", port='5001', debug=False, threaded=True)