From a30e99b7aea982999ff79532a9136cfcb6a2fe14 Mon Sep 17 00:00:00 2001
From: Askill <ofjokg@gmail.com>
Date: Tue, 7 Jul 2020 14:06:13 +0200
Subject: [PATCH] slight refactor of the ranking function

---
 app/application/search.py | 101 ++++++++++++++++++++------------------
 app/run.py                |   3 +-
 2 files changed, 54 insertions(+), 50 deletions(-)

diff --git a/app/application/search.py b/app/application/search.py
index c3c913a..6f4827a 100644
--- a/app/application/search.py
+++ b/app/application/search.py
@@ -21,64 +21,55 @@ def search2(inputArr):
 
     return(indx)
 
-
-def stemInput(inputArr):
+def stemInput(l1):
     ''' returns array of stemmed input '''
-    inputArr2 = []
-
     snowball = nltk.SnowballStemmer(language='german')
     stopset = set(stopwords.words('german'))
-    for word in inputArr:
-        if word in stopset:
-            continue
-        inputArr2.append(snowball.stem(word))
-    return inputArr2
+    stopset |= set("(),")
+    l1 = [snowball.stem(l) for l in l1]
+    return l1
 
-# TODO: split into more functions
-def getRecDict2(indx, inputArr):
-    dbSession = db.Session()
 
-    outDict = {}
-    # 2d to 1d
-    indx = sum(indx.values(), [])
-    k = Counter(indx)
-    indx = k.most_common(1000)
-    indx = dict(indx)
+def findUnrecognized(dbSession, inputArr):
+    ''' returns the indices (within inputArr) of all inputs that are not in db.Trunk'''
+    ignored = []
+    for x in inputArr:
+        if not dbSession.query(exists().where(db.Trunk.name == x)).scalar():
+            ignored.append(inputArr.index(x))
+    return ignored
 
-    ingred = [x for x in dbSession.query(db.Recipe.recipe_id, db.IngredTrunk.trunk_name, db.IngredTrunk.ingredient_name).filter(
-        db.Recipe.recipe_id.in_(indx.keys())).join(db.RecIngred).join(db.Ingredient).join(db.IngredTrunk).all()]
+def getIngredDict(ingred):
+    ''' input rows: recipe ID, stemmed ingredient, full ingredient name
+     the dict mirrors the DB: recipe ID -> full ingredient name -> list of stemmed names
+     this structure makes calcOverlay() more efficient '''
+    
     ingredDict = {}
-
-    # RezeptID, stemmed Ingred, full ingred Name
-    # Dict spiegelt DB wieder, key, full ingred, stemmed
-    # this structure makes calcOverlay() more efficient
     for k, v, i in ingred:
         if k not in ingredDict:
             ingredDict[k] = {}
         if i not in ingredDict[k]:
             ingredDict[k][i] = []
         ingredDict[k][i].append(v)
+    return ingredDict
 
-    # check if any input is not in db.Trunk
-    ignored = []
-    for x in inputArr:
-        if not dbSession.query(exists().where(db.Trunk.name == x)).scalar():
-            ignored.append(inputArr.index(x))
+def calcOverlay(inputArr, ingredDict):
+    '''checks overlay per recipeID
+    iterates over ingredients and checks per stemmed ingredient
+    returns accurate percentage of overlay
+    since the overlay score is the dict key, it is reduced by an insignificant amount on collision to preserve all values'''
 
-    inputArr += defaultArr
-
-    # checks overlay per recipeID 
-    # iterate over ingreds and checks per stemmed ingred
-    # returns accurate percentage of overlay
-    # since overlay scare is the key of dict it is reduced by insignificant number to preserve all values  
+    outDict = {}
     for key, value in ingredDict.items():
         overlay, missing = calcOverlay2(inputArr, value)
         while overlay in outDict.keys():
             overlay -= 0.0001
         outDict[overlay] = (int(key), missing)
-    
-    # return Dict with 20 highest value keys
-    # creates dict which is returned
+    return outDict
+
+def resolvRecipe(dbSession, outDict):
+    ''' returns a dict with the 20 highest keys (overlay scores) of outDict,
+     each mapped to (recipe id, name, url, ingredient list, missing)'''
+
     outDict2 = {}
     for key in heapq.nlargest(20, outDict.keys()):
         key2 = outDict[key][0]
@@ -87,20 +78,34 @@ def getRecDict2(indx, inputArr):
             db.Recipe.recipe_id == key2).first()
         outDict2[key] = (key2, rec.name, rec.url,  [r[0] + ": " + r[1] for r in dbSession.query(db.Ingredient.name,
                                                                                                 db.RecIngred.ingredient_amount).join(db.RecIngred).join(db.Recipe).filter(db.Recipe.recipe_id == key2).all()], missing)
+    return outDict2
+
+def getRecDict2(indx, inputArr):
+    '''returns dict with percentage of overlay as keys and recipes as values'''
+    dbSession = db.Session()
+   
+    # 2d to 1d
+    indx = sum(indx.values(), [])
+    k = Counter(indx)
+    # keep 1000 most relevant to have consistent query time
+    indx = k.most_common(1000)
+    indx = dict(indx)
+
+    # get ingredients for all recipes
+    ingred = [x for x in dbSession.query(db.Recipe.recipe_id, db.IngredTrunk.trunk_name, db.IngredTrunk.ingredient_name).filter(
+        db.Recipe.recipe_id.in_(indx.keys())).join(db.RecIngred).join(db.Ingredient).join(db.IngredTrunk).all()]
+    
+    ingredDict = getIngredDict(ingred)
+    ignored = findUnrecognized(dbSession, inputArr)
+    inputArr += stemInput(defaultArr)
+    outDict = calcOverlay(inputArr, ingredDict)
+    ingreds = resolvRecipe(dbSession, outDict)
+
     outDict = {}
     outDict["ignored"] = ignored
-    outDict["ingred"] = outDict2
+    outDict["ingred"] = ingreds
     return outDict
 
-
-def stem(l1):
-    snowball = nltk.SnowballStemmer(language='german')
-    stopset = set(stopwords.words('german'))
-    stopset |= set("(),")
-    l1 = [snowball.stem(l) for l in l1]
-    return l1
-
-
 def calcOverlay2(l1, l2):
     '''Calculates overlay and returns missing ingredients, [score (float), missing([])]'''
     counter = 0
diff --git a/app/run.py b/app/run.py
index 3ee7efe..1b4832a 100644
--- a/app/run.py
+++ b/app/run.py
@@ -1,10 +1,9 @@
 from application import app
-from application.search import defaultArr, stem
+from application.search import defaultArr, stemInput
 import nltk
 
 nltk.download('stopwords')
 nltk.download('punkt')
-defaultArr = stem(defaultArr)
 app.run(host="0.0.0.0", port='5001', debug=False, threaded=True)