slight refactor of the ranking function
This commit is contained in:
parent
7067130d36
commit
a30e99b7ae
|
|
@ -21,64 +21,55 @@ def search2(inputArr):
|
||||||
|
|
||||||
return(indx)
|
return(indx)
|
||||||
|
|
||||||
|
def stemInput(l1):
|
||||||
def stemInput(inputArr):
|
|
||||||
''' returns array of stemmed input '''
|
''' returns array of stemmed input '''
|
||||||
inputArr2 = []
|
|
||||||
|
|
||||||
snowball = nltk.SnowballStemmer(language='german')
|
snowball = nltk.SnowballStemmer(language='german')
|
||||||
stopset = set(stopwords.words('german'))
|
stopset = set(stopwords.words('german'))
|
||||||
for word in inputArr:
|
stopset |= set("(),")
|
||||||
if word in stopset:
|
l1 = [snowball.stem(l) for l in l1]
|
||||||
continue
|
return l1
|
||||||
inputArr2.append(snowball.stem(word))
|
|
||||||
return inputArr2
|
|
||||||
|
|
||||||
# TODO: split into more functions
|
|
||||||
def getRecDict2(indx, inputArr):
|
|
||||||
dbSession = db.Session()
|
|
||||||
|
|
||||||
outDict = {}
|
def findUnrecognized(dbSession, inputArr):
|
||||||
# 2d to 1d
|
''' check if any input is not in db.Trunk'''
|
||||||
indx = sum(indx.values(), [])
|
ignored = []
|
||||||
k = Counter(indx)
|
for x in inputArr:
|
||||||
indx = k.most_common(1000)
|
if not dbSession.query(exists().where(db.Trunk.name == x)).scalar():
|
||||||
indx = dict(indx)
|
ignored.append(inputArr.index(x))
|
||||||
|
return ignored
|
||||||
|
|
||||||
ingred = [x for x in dbSession.query(db.Recipe.recipe_id, db.IngredTrunk.trunk_name, db.IngredTrunk.ingredient_name).filter(
|
def getIngredDict(ingred):
|
||||||
db.Recipe.recipe_id.in_(indx.keys())).join(db.RecIngred).join(db.Ingredient).join(db.IngredTrunk).all()]
|
''' input rows: recipe ID, stemmed ingredient, full ingredient name
|
||||||
|
the dict mirrors the DB: recipe ID -> full ingredient name -> list of stemmed names
|
||||||
|
this structure makes calcOverlay() more efficient '''
|
||||||
|
|
||||||
ingredDict = {}
|
ingredDict = {}
|
||||||
|
|
||||||
# input rows: recipe ID, stemmed ingredient, full ingredient name
|
|
||||||
# the dict mirrors the DB: recipe ID -> full ingredient name -> list of stemmed names
|
|
||||||
# this structure makes calcOverlay() more efficient
|
|
||||||
for k, v, i in ingred:
|
for k, v, i in ingred:
|
||||||
if k not in ingredDict:
|
if k not in ingredDict:
|
||||||
ingredDict[k] = {}
|
ingredDict[k] = {}
|
||||||
if i not in ingredDict[k]:
|
if i not in ingredDict[k]:
|
||||||
ingredDict[k][i] = []
|
ingredDict[k][i] = []
|
||||||
ingredDict[k][i].append(v)
|
ingredDict[k][i].append(v)
|
||||||
|
return ingredDict
|
||||||
|
|
||||||
# check if any input is not in db.Trunk
|
def calcOverlay(inputArr, ingredDict):
|
||||||
ignored = []
|
'''checks overlay per recipeID
|
||||||
for x in inputArr:
|
iterate over ingreds and checks per stemmed ingred
|
||||||
if not dbSession.query(exists().where(db.Trunk.name == x)).scalar():
|
returns accurate percentage of overlay
|
||||||
ignored.append(inputArr.index(x))
|
since the overlay score is the key of the dict, it is reduced by an insignificant amount on collision to preserve all values'''
|
||||||
|
|
||||||
inputArr += defaultArr
|
outDict = {}
|
||||||
|
|
||||||
# checks overlay per recipeID
|
|
||||||
# iterate over ingreds and checks per stemmed ingred
|
|
||||||
# returns accurate percentage of overlay
|
|
||||||
# since the overlay score is the key of the dict, it is reduced by an insignificant amount on collision to preserve all values
|
|
||||||
for key, value in ingredDict.items():
|
for key, value in ingredDict.items():
|
||||||
overlay, missing = calcOverlay2(inputArr, value)
|
overlay, missing = calcOverlay2(inputArr, value)
|
||||||
while overlay in outDict.keys():
|
while overlay in outDict.keys():
|
||||||
overlay -= 0.0001
|
overlay -= 0.0001
|
||||||
outDict[overlay] = (int(key), missing)
|
outDict[overlay] = (int(key), missing)
|
||||||
|
return outDict
|
||||||
# return Dict with 20 highest value keys
|
|
||||||
# creates dict which is returned
|
def resolvRecipe(dbSession, outDict):
|
||||||
|
''' return Dict with 20 highest value keys
|
||||||
|
creates dict which is returned'''
|
||||||
|
|
||||||
outDict2 = {}
|
outDict2 = {}
|
||||||
for key in heapq.nlargest(20, outDict.keys()):
|
for key in heapq.nlargest(20, outDict.keys()):
|
||||||
key2 = outDict[key][0]
|
key2 = outDict[key][0]
|
||||||
|
|
@ -87,20 +78,34 @@ def getRecDict2(indx, inputArr):
|
||||||
db.Recipe.recipe_id == key2).first()
|
db.Recipe.recipe_id == key2).first()
|
||||||
outDict2[key] = (key2, rec.name, rec.url, [r[0] + ": " + r[1] for r in dbSession.query(db.Ingredient.name,
|
outDict2[key] = (key2, rec.name, rec.url, [r[0] + ": " + r[1] for r in dbSession.query(db.Ingredient.name,
|
||||||
db.RecIngred.ingredient_amount).join(db.RecIngred).join(db.Recipe).filter(db.Recipe.recipe_id == key2).all()], missing)
|
db.RecIngred.ingredient_amount).join(db.RecIngred).join(db.Recipe).filter(db.Recipe.recipe_id == key2).all()], missing)
|
||||||
|
return outDict2
|
||||||
|
|
||||||
|
def getRecDict2(indx, inputArr):
|
||||||
|
'''returns dict with two keys: "ignored" (indices of inputs not found in db.Trunk) and "ingred" (overlay score -> recipe data for the top matches)'''
|
||||||
|
dbSession = db.Session()
|
||||||
|
|
||||||
|
# 2d to 1d
|
||||||
|
indx = sum(indx.values(), [])
|
||||||
|
k = Counter(indx)
|
||||||
|
# keep 1000 most relevant to have consistent query time
|
||||||
|
indx = k.most_common(1000)
|
||||||
|
indx = dict(indx)
|
||||||
|
|
||||||
|
# get ingredients for all recipes
|
||||||
|
ingred = [x for x in dbSession.query(db.Recipe.recipe_id, db.IngredTrunk.trunk_name, db.IngredTrunk.ingredient_name).filter(
|
||||||
|
db.Recipe.recipe_id.in_(indx.keys())).join(db.RecIngred).join(db.Ingredient).join(db.IngredTrunk).all()]
|
||||||
|
|
||||||
|
ingredDict = getIngredDict(ingred)
|
||||||
|
ignored = findUnrecognized(dbSession, inputArr)
|
||||||
|
inputArr += stemInput(defaultArr)
|
||||||
|
outDict = calcOverlay(inputArr, ingredDict)
|
||||||
|
ingreds = resolvRecipe(dbSession, outDict)
|
||||||
|
|
||||||
outDict = {}
|
outDict = {}
|
||||||
outDict["ignored"] = ignored
|
outDict["ignored"] = ignored
|
||||||
outDict["ingred"] = outDict2
|
outDict["ingred"] = ingreds
|
||||||
return outDict
|
return outDict
|
||||||
|
|
||||||
|
|
||||||
def stem(l1):
|
|
||||||
snowball = nltk.SnowballStemmer(language='german')
|
|
||||||
stopset = set(stopwords.words('german'))
|
|
||||||
stopset |= set("(),")
|
|
||||||
l1 = [snowball.stem(l) for l in l1]
|
|
||||||
return l1
|
|
||||||
|
|
||||||
|
|
||||||
def calcOverlay2(l1, l2):
|
def calcOverlay2(l1, l2):
|
||||||
'''Calculates overlay and returns missing ingredients, [score (float), missing([])]'''
|
'''Calculates overlay and returns missing ingredients, [score (float), missing([])]'''
|
||||||
counter = 0
|
counter = 0
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,9 @@
|
||||||
from application import app
|
from application import app
|
||||||
from application.search import defaultArr, stem
|
from application.search import defaultArr, stemInput
|
||||||
import nltk
|
import nltk
|
||||||
|
|
||||||
nltk.download('stopwords')
|
nltk.download('stopwords')
|
||||||
nltk.download('punkt')
|
nltk.download('punkt')
|
||||||
defaultArr = stem(defaultArr)
|
|
||||||
app.run(host="0.0.0.0", port='5001', debug=False, threaded=True)
|
app.run(host="0.0.0.0", port='5001', debug=False, threaded=True)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue