Sorting by overlap works; much simpler DB model

2020-04-15 14:16:28 +02:00 · 2020-04-15 14:16:28 +02:00 · 853b749fde
parent ffc41d617b
commit 853b749fde
5 changed files with 90 additions and 77 deletions
--- a/application/__pycache__/db.cpython-37.pyc
+++ b/application/__pycache__/db.cpython-37.pyc
--- a/application/db.py
+++ b/application/db.py
@@ -13,18 +13,7 @@ Base = declarative_base()
 Session = sessionmaker(bind=engine)
 # https://docs.sqlalchemy.org/en/13/orm/basic_relationships.html#association-object
 class Link(Base):
    __tablename__ = 'link'
    recipe_id = Column(Integer, ForeignKey('recipe.recipe_id'), primary_key=True)
    ingredient_id = Column(Integer, ForeignKey('ingredient.ingredient_id'), primary_key=True)
    ingredient_amount = Column('ingredient_amount', Text)
    ingredient_amount_mu = Column('ingredient_amount_mu', Text)    # measurement unit
    recipe = relationship("Recipe", back_populates="ingredient")
    ingredient = relationship("Ingredient", back_populates="recipe")
    def ingredients(self):
        return self.ingredient.name
 class Recipe(Base):
    __tablename__ = "recipe"
@@ -33,7 +22,8 @@ class Recipe(Base):
    instructions = Column('instructions', Text)
    url = Column('url', Text)
    img = Column('img', LargeBinary)
-    ingredient = relationship("Link", back_populates="recipe")
+    ingredient = relationship("Ingredient", backref="recipe")
    trunk = relationship("Trunk", backref="recipe")
    def ingredients(self):
        l = []
@@ -69,14 +59,18 @@ class Ingredient(Base):
    __tablename__ = "ingredient"
    ingredient_id = Column('ingredient_id', Integer,  primary_key=True, autoincrement=True)
    name = Column('name', Text)
-    recipe = relationship("Link", back_populates="ingredient")
+    ingredient_amount = Column('ingredient_amount', Text)
-    trunks = relationship("Trunk")
+    ingredient_amount_mu = Column('ingredient_amount_mu', Text)    # measurement unit
    recipe_id = Column(Integer, ForeignKey('recipe.recipe_id'))
 class Trunk(Base):
    __tablename__ = "trunk"
    trunk_id = Column('trunk_id', Integer,  primary_key=True, autoincrement=True)
    name = Column('name', Text)
-    ingredient_id = Column(Integer, ForeignKey('ingredient.ingredient_id'))
+
    recipe_id = Column(Integer, ForeignKey('recipe.recipe_id'))
 Base.metadata.create_all(engine)
--- a/migrate.py
+++ b/migrate.py
@@ -1,7 +1,21 @@
 import json
 import cv2
 import base64
-from application.db import Session, Recipe, Ingredient, Link, Trunk
+import nltk as nltk
 from nltk.corpus import stopwords
 from application.db import Session, Recipe, Ingredient, Trunk
 def stemWord(word):
    arr = []
    stopset = set(stopwords.words('german'))
    stopset |= set("(),")
    snowball = nltk.SnowballStemmer(language='german')
    for token in nltk.word_tokenize(word): 
        if token in stopset or len(token) < 4:
            continue
        stemmed = snowball.stem(token)
        arr.append(stemmed)
    return arr
 def migrate(path):
    recs = ""
@@ -9,6 +23,8 @@ def migrate(path):
        recs = json.load(file)
    dbSession = Session()
    counter = 0
    leng = len(recs)
    for key, value in recs.items():
        name=key
        resString=value[0]
@@ -16,11 +32,16 @@ def migrate(path):
        img=value[3].encode()
        r = Recipe(name=name, instructions=resString, url=link, img=img)
        for x, y in value[1].items():
-            a = Link(ingredient_amount=y)
+            a = Ingredient(name=x, ingredient_amount=y)
            a.ingredient = Ingredient(name=x)
            r.ingredient.append(a)
            for x in stemWord(a.name):
                t = Trunk(name=x)
                r.trunk.append(t)
        dbSession.add(r)
        dbSession.commit()
        counter+=1
        print(counter/leng)
 migrate('./data/recs.json')
--- a/mine.py
+++ b/mine.py
@@ -8,7 +8,7 @@ import random
 import traceback
 import cv2
 import base64
-from application.db import Session, Recipe, Ingredient, Link, Trunk
+from application.db import Session, Recipe, Ingredient, Trunk
 import nltk as nltk
 from nltk.corpus import stopwords
@@ -126,6 +126,8 @@ def getRecipe(links):
            sleep(random.randint(0, 5))
    return recs
 def stemIngred():
    dbSession = Session()
    stopset = set(stopwords.words('german'))
@@ -135,7 +137,7 @@ def stemIngred():
    for x in dbSession.query(Ingredient).all():
        snowball = nltk.SnowballStemmer(language='german')
        for token in nltk.word_tokenize(x.name): 
-            if token in stopset or len(token) < 3:
+            if token in stopset or len(token) < 4:
                continue
            stemmed = snowball.stem(token)
@@ -153,7 +155,7 @@ with open('./data/links.json') as file:
 #recs = getRecipe(links)
-stemIngred()
+#stemIngred()
 #with open('./data/recs.json', 'w', encoding="utf-8") as file:
 #    json.dump(recs, file, ensure_ascii=False)
--- a/test.py
+++ b/test.py
@@ -1,12 +1,12 @@
-from application.db import Session, Recipe, Ingredient, Link, Trunk
+from application.db import Session, Recipe, Ingredient, Trunk
 import nltk as nltk
 from nltk.corpus import stopwords
 import time
 dbSession = Session()
-inputArr = ["butter", "milch", "eier", "käse"] 
+inputArr = ["butter", "milch", "eier", "mehl", "zucker"] 
-maxMissing = 4
+maxMissing = 10
 def slow():
    recipes = dbSession.query(Recipe).all()
@@ -36,32 +36,36 @@ def slow():
 #        for xx in x:
 #            print(xx)
-def faster():
+def faster(inputArr):
    indx = {}
    for inpu in inputArr:
        ids = [] 
-        for x in dbSession.query(Ingredient).filter(Ingredient.name.contains(inpu)).all():
+        for x in dbSession.query(Trunk.recipe_id).filter(Trunk.name.contains(inpu)).all():
            if str(x[0]) not in indx:
                indx[str(x[0])] = 0
-            for y in x.recipe:
+            indx[str(x[0])] += 1
                if dbSession.query(Link).filter(Link.recipe_id==y.recipe_id).count() > len(inputArr) + maxMissing:
                    continue   
                if str(y.recipe_id) not in indx:
                    indx[str(y.recipe_id)] = 0
                indx[str(y.recipe_id)] += 1
-    outDict = {}
+    return(indx)
    for key, value in indx.items():
        ingred = dbSession.query(Recipe).filter(Recipe.recipe_id==key).first().ingredients()
        outDict[calcOverlay(inputArr, ingred)] = (dbSession.query(Recipe).filter(Recipe.recipe_id==key).first().name, key, ingred)
    print(outDict)
-
+def fastes(inputArr):
 def fastes():
    indx = {}
    for inpu in inputArr:
        ids = [] 
        for recipe_id in dbSession.query(Trunk.recipe_id).filter(Trunk.name == inpu).all():
            if str(recipe_id[0]) not in indx:
                indx[str(recipe_id[0])] = 0
            indx[str(recipe_id[0])] += 1
    return(indx) 
 def stemInput(inputArr):
    inputArr2 = []
    snowball = nltk.SnowballStemmer(language='german')
@@ -72,26 +76,18 @@ def fastes():
             continue
        inputArr2.append(snowball.stem(word))
-    for inpu in inputArr2:
+    return inputArr2
        ids = [] 
        for xx in dbSession.query(Trunk).filter(Trunk.name == inpu).all():
            for x in dbSession.query(Ingredient).filter(xx.ingredient_id == Ingredient.ingredient_id).all():
                for y in x.recipe:
                    if dbSession.query(Link).filter(Link.recipe_id==y.recipe_id).count() > len(inputArr) + maxMissing:
                        continue   
                    if str(y.recipe_id) not in indx:
                        indx[str(y.recipe_id)] = 0
                    indx[str(y.recipe_id)] += 1
    outDict = {}
    for key, value in indx.items():
        ingred = dbSession.query(Recipe).filter(Recipe.recipe_id==key).first().ingredients()
        outDict[calcOverlay(inputArr, ingred)] = (dbSession.query(Recipe).filter(Recipe.recipe_id==key).first().name, key, ingred)
    print(outDict)
 #
 def printDict(indx):
    outDict = {}
    for key, value in sorted(indx.items()):
        ingred = dbSession.query(Trunk.name).filter(Trunk.recipe_id==int(key)).all()
        outDict[calcOverlay(inputArr, ingred)] = (dbSession.query(Recipe).filter(Recipe.recipe_id==key).first().name, key, dbSession.query(Ingredient.name).filter(Ingredient.recipe_id==key).all())
    for key, value in outDict.items():
        if key >= 0.5:
            print(key, value)
 def calcOverlay(l1, l2):
    snowball = nltk.SnowballStemmer(language='german')
@@ -102,31 +98,31 @@ def calcOverlay(l1, l2):
    counter = 0
    for x in l2:
-        for token in nltk.word_tokenize(x): 
+        for l in l1:
-            if token in stopset:
+            if l == x[0]:
-                continue
+                #print(l)
-            stemmed = snowball.stem(token)
+                counter +=1
-            for l in l1:
+    counter = counter / len(l2)                 
                if l == stemmed:
                    counter +=1
    return counter
 #
 start = time.time()
-slow()
+#slow()
 end = time.time()
 print("\n", end - start, "\n")  
 stemmed = stemInput(inputArr)
 start = time.time()
 indx = faster(stemmed)  
 end = time.time()
 printDict(indx)
 print("\n", end - start, "\n")  
 start = time.time()
-faster()  
+indx = fastes(stemmed)
 end = time.time()
 print("\n", end - start, "\n")  
 start = time.time()
 fastes()
 end = time.time()
 printDict(indx)
 print("\n", end - start, "\n")