Inverse-Rezeptsuche/app/application/search.py


import application.db2 as db
from flask import g
import nltk as nltk
from nltk.corpus import stopwords
import time
import heapq
from collections import Counter
import background.migrate
from sqlalchemy import exists

def search2(inputArr):
    ''' returns inputs with array of recipeID which use them  '''
    indx = {}
    dbSession = db.Session()
    for inpu in inputArr:
        x = dbSession.query(db.Trunk.name, db.Recipe.recipe_id).filter(db.Trunk.name == inpu).join(db.IngredTrunk).join(
            db.Ingredient).join(db.RecIngred).join(db.Recipe).all()

        indx[inpu] = [str(y[1]) for y in x]

    return(indx)

def stemInput(l1):
    ''' returns array of stemmed input '''
    snowball = nltk.SnowballStemmer(language='german')
    stopset = set(stopwords.words('german'))
    stopset |= set("(),")
    l1 = [snowball.stem(l) for l in l1]
    return l1


def findUnrecognized(dbSession, inputArr):
    ''' check if any input is not in db.Trunk'''
    ignored = []
    for x in inputArr:
        if not dbSession.query(exists().where(db.Trunk.name == x)).scalar():
            ignored.append(inputArr.index(x))
    return ignored

def getIngredDict(ingred):
    ''' RezeptID, stemmed Ingred, full ingred Name
     Dict spiegelt DB wieder, key, full ingred, stemmed
     this structure makes calcOverlay() more efficient '''

    ingredDict = {}
    for k, v, i in ingred:
        if k not in ingredDict:
            ingredDict[k] = {}
        if i not in ingredDict[k]:
            ingredDict[k][i] = []
        ingredDict[k][i].append(v)
    return ingredDict

def calcOverlay(inputArr, ingredDict):
    '''checks overlay per recipeID
    iterate over ingreds and checks per stemmed ingred
    returns accurate percentage of overlay
    since overlay scare is the key of dict it is reduced by insignificant number to preserve all values'''

    outDict = {}
    for key, value in ingredDict.items():
        overlay, missing = calcOverlay2(inputArr, value)
        while overlay in outDict.keys():
            overlay -= 0.0001
        outDict[overlay] = (int(key), missing)
    return outDict

def resolvRecipe(dbSession, outDict):
    ''' return Dict with 20 highest value keys
     creates dict which is returned'''

    outDict2 = {}
    for key in heapq.nlargest(20, outDict.keys()):
        key2 = outDict[key][0]
        missing = outDict[key][1]
        rec = dbSession.query(db.Recipe).filter(
            db.Recipe.recipe_id == key2).first()
        outDict2[key] = (key2, rec.name, rec.url,  [r[0] + ": " + r[1] for r in dbSession.query(db.Ingredient.name,
                                                                                                db.RecIngred.ingredient_amount).join(db.RecIngred).join(db.Recipe).filter(db.Recipe.recipe_id == key2).all()], missing)
    return outDict2

def getRecDict2(indx, inputArr):
    '''returns dict with percentage of overlay as keys and recipes as values'''
    dbSession = db.Session()

    # 2d to 1d
    indx = sum(indx.values(), [])
    k = Counter(indx)
    # keep 1000 most relevant to have consistent query time
    indx = k.most_common(1000)
    indx = dict(indx)

    # get ingredients for all recipes
    ingred = [x for x in dbSession.query(db.Recipe.recipe_id, db.IngredTrunk.trunk_name, db.IngredTrunk.ingredient_name).filter(
        db.Recipe.recipe_id.in_(indx.keys())).join(db.RecIngred).join(db.Ingredient).join(db.IngredTrunk).all()]

    ingredDict = getIngredDict(ingred)
    ignored = findUnrecognized(dbSession, inputArr)
    inputArr += stemInput(defaultArr)
    outDict = calcOverlay(inputArr, ingredDict)
    ingreds = resolvRecipe(dbSession, outDict)

    outDict = {}
    outDict["ignored"] = ignored
    outDict["ingred"] = ingreds
    return outDict

def calcOverlay2(l1, l2):
    '''Calculates overlay and returns missing ingredients, [score (float), missing([])]'''
    counter = 0
    notIn = []
    for key, ll in l2.items():
        missing = True
        for l in ll:
            if l in l1:
                counter += 1
                missing = False
                break
        if missing:
            notIn.append(key)

    counter = counter / len(l2)
    return counter, notIn


# it is assumed that everyone has this
defaultArr = ["wasser", "salz", "pfeffer"]