sorted by overlap works, much simpler db model

2020-04-15 14:16:28 +02:00 · 2020-04-15 14:16:28 +02:00 · 853b749fde
parent ffc41d617b
commit 853b749fde
5 changed files with 90 additions and 77 deletions
--- a/application/pycache/db.cpython-37.pyc
+++ b/application/pycache/db.cpython-37.pyc
--- a/application/db.py
+++ b/application/db.py
@ -13,18 +13,7 @@ Base = declarative_base()
 Session = sessionmaker(bind=engine)

 # https://docs.sqlalchemy.org/en/13/orm/basic_relationships.html#association-object
-class Link(Base):
-    __tablename__ = 'link'
-    recipe_id = Column(Integer, ForeignKey('recipe.recipe_id'), primary_key=True)
-    ingredient_id = Column(Integer, ForeignKey('ingredient.ingredient_id'), primary_key=True)
-    ingredient_amount = Column('ingredient_amount', Text)
-    ingredient_amount_mu = Column('ingredient_amount_mu', Text)    # measurement unit

-    recipe = relationship("Recipe", back_populates="ingredient")
-    ingredient = relationship("Ingredient", back_populates="recipe")
-
-    def ingredients(self):
-        return self.ingredient.name

 class Recipe(Base):
    __tablename__ = "recipe"
@ -33,7 +22,8 @@ class Recipe(Base):
    instructions = Column('instructions', Text)
    url = Column('url', Text)
    img = Column('img', LargeBinary)
-    ingredient = relationship("Link", back_populates="recipe")
+    ingredient = relationship("Ingredient", backref="recipe")
+    trunk = relationship("Trunk", backref="recipe")

    def ingredients(self):
        l = []
@ -69,14 +59,18 @@ class Ingredient(Base):
    __tablename__ = "ingredient"
    ingredient_id = Column('ingredient_id', Integer,  primary_key=True, autoincrement=True)
    name = Column('name', Text)
-    recipe = relationship("Link", back_populates="ingredient")
-    trunks = relationship("Trunk")
+    ingredient_amount = Column('ingredient_amount', Text)
+    ingredient_amount_mu = Column('ingredient_amount_mu', Text)    # measurement unit
+
+    recipe_id = Column(Integer, ForeignKey('recipe.recipe_id'))
+    

 class Trunk(Base):
    __tablename__ = "trunk"
    trunk_id = Column('trunk_id', Integer,  primary_key=True, autoincrement=True)
    name = Column('name', Text)
-    ingredient_id = Column(Integer, ForeignKey('ingredient.ingredient_id'))
+
+    recipe_id = Column(Integer, ForeignKey('recipe.recipe_id'))


 Base.metadata.create_all(engine)
--- a/migrate.py
+++ b/migrate.py
@ -1,7 +1,21 @@
 import json
 import cv2
 import base64
-from application.db import Session, Recipe, Ingredient, Link, Trunk
+import nltk as nltk
+from nltk.corpus import stopwords
+from application.db import Session, Recipe, Ingredient, Trunk
+
+def stemWord(word):
+    """Tokenize word and return the German Snowball stems of its significant tokens.
+
+    Tokens that are German stopwords, one of the characters "(", ")" or ",",
+    or shorter than four characters are dropped.
+    """
+    ignored = set(stopwords.words('german')) | set("(),")
+    stemmer = nltk.SnowballStemmer(language='german')
+    # same filter as an explicit skip loop: drop stopwords/punctuation and short tokens
+    tokens = nltk.word_tokenize(word)
+    return [stemmer.stem(tok) for tok in tokens if tok not in ignored and len(tok) >= 4]

 def migrate(path):
    recs = ""
@ -9,6 +23,8 @@ def migrate(path):
        recs = json.load(file)
    
    dbSession = Session()
+    counter = 0
+    leng = len(recs)
    for key, value in recs.items():
        name=key
        resString=value[0]
@ -16,11 +32,16 @@ def migrate(path):
        img=value[3].encode()

        r = Recipe(name=name, instructions=resString, url=link, img=img)
+
        for x, y in value[1].items():
-            a = Link(ingredient_amount=y)
-            a.ingredient = Ingredient(name=x)
+            a = Ingredient(name=x, ingredient_amount=y)
            r.ingredient.append(a)
+            for x in stemWord(a.name):
+                t = Trunk(name=x)
+                r.trunk.append(t)
        dbSession.add(r)
        dbSession.commit()
+        counter+=1
+        print(counter/leng)

 migrate('./data/recs.json')
--- a/mine.py
+++ b/mine.py
@ -8,7 +8,7 @@ import random
 import traceback
 import cv2
 import base64
-from application.db import Session, Recipe, Ingredient, Link, Trunk
+from application.db import Session, Recipe, Ingredient, Trunk
 import nltk as nltk
 from nltk.corpus import stopwords

@ -126,6 +126,8 @@ def getRecipe(links):
            sleep(random.randint(0, 5))
    return recs

+
+
 def stemIngred():
    dbSession = Session()
    stopset = set(stopwords.words('german'))
@ -135,7 +137,7 @@ def stemIngred():
    for x in dbSession.query(Ingredient).all():
        snowball = nltk.SnowballStemmer(language='german')
        for token in nltk.word_tokenize(x.name): 
-            if token in stopset or len(token) < 3:
+            if token in stopset or len(token) < 4:
                continue
            stemmed = snowball.stem(token)

@ -153,7 +155,7 @@ with open('./data/links.json') as file:
    

 #recs = getRecipe(links)
-stemIngred()
+#stemIngred()

 #with open('./data/recs.json', 'w', encoding="utf-8") as file:
 #    json.dump(recs, file, ensure_ascii=False)
--- a/test.py
+++ b/test.py
@ -1,12 +1,12 @@

-from application.db import Session, Recipe, Ingredient, Link, Trunk
+from application.db import Session, Recipe, Ingredient, Trunk
 import nltk as nltk
 from nltk.corpus import stopwords
 import time

 dbSession = Session()
-inputArr = ["butter", "milch", "eier", "käse"] 
-maxMissing = 4
+inputArr = ["butter", "milch", "eier", "mehl", "zucker"] 
+maxMissing = 10

 def slow():
    recipes = dbSession.query(Recipe).all()
@ -36,32 +36,36 @@ def slow():
 #        for xx in x:
 #            print(xx)

-def faster():
+def faster(inputArr):
    indx = {}
+ 
+
    for inpu in inputArr:
        ids = [] 
-        for x in dbSession.query(Ingredient).filter(Ingredient.name.contains(inpu)).all():
+        for x in dbSession.query(Trunk.recipe_id).filter(Trunk.name.contains(inpu)).all():
+            if str(x[0]) not in indx:
+                indx[str(x[0])] = 0

-            for y in x.recipe:
-                
-                if dbSession.query(Link).filter(Link.recipe_id==y.recipe_id).count() > len(inputArr) + maxMissing:
-                    continue   
-                if str(y.recipe_id) not in indx:
-                    indx[str(y.recipe_id)] = 0
-
-                indx[str(y.recipe_id)] += 1
+            indx[str(x[0])] += 1
        

-    outDict = {}
-    for key, value in indx.items():
-        ingred = dbSession.query(Recipe).filter(Recipe.recipe_id==key).first().ingredients()
-        outDict[calcOverlay(inputArr, ingred)] = (dbSession.query(Recipe).filter(Recipe.recipe_id==key).first().name, key, ingred)
-    
-    print(outDict)
+    return(indx)

-
-def fastes():
+def fastes(inputArr):
    indx = {}
+
+    for inpu in inputArr:
+        ids = [] 
+        for recipe_id in dbSession.query(Trunk.recipe_id).filter(Trunk.name == inpu).all():
+                
+
+            if str(recipe_id[0]) not in indx:
+                indx[str(recipe_id[0])] = 0
+
+            indx[str(recipe_id[0])] += 1
+    return(indx) 
+    
+def stemInput(inputArr):
    inputArr2 = []

    snowball = nltk.SnowballStemmer(language='german')
@ -72,26 +76,18 @@ def fastes():
             continue
        inputArr2.append(snowball.stem(word))

-    for inpu in inputArr2:
-        ids = [] 
-        for xx in dbSession.query(Trunk).filter(Trunk.name == inpu).all():
-            for x in dbSession.query(Ingredient).filter(xx.ingredient_id == Ingredient.ingredient_id).all():
-                for y in x.recipe:
-                    
-                    if dbSession.query(Link).filter(Link.recipe_id==y.recipe_id).count() > len(inputArr) + maxMissing:
-                        continue   
-                    if str(y.recipe_id) not in indx:
-                        indx[str(y.recipe_id)] = 0
-
-                    indx[str(y.recipe_id)] += 1
-        
-    outDict = {}
-    for key, value in indx.items():
-        ingred = dbSession.query(Recipe).filter(Recipe.recipe_id==key).first().ingredients()
-        outDict[calcOverlay(inputArr, ingred)] = (dbSession.query(Recipe).filter(Recipe.recipe_id==key).first().name, key, ingred)
-    
-    print(outDict)
+    return inputArr2
 #
+def printDict(indx):
+    """Print every recipe in indx whose trunk overlap with the global inputArr is >= 0.5, best match first."""
+    rows = []  # a list, not a dict keyed by score: recipes with an equal overlap must not overwrite each other
+    for key in sorted(indx):
+        ingred = dbSession.query(Trunk.name).filter(Trunk.recipe_id==int(key)).all()
+        rows.append((calcOverlay(inputArr, ingred), (dbSession.query(Recipe).filter(Recipe.recipe_id==key).first().name, key, dbSession.query(Ingredient.name).filter(Ingredient.recipe_id==key).all())))
+    # sorted() is stable, so recipes with the same score stay in recipe-key order
+    for score, value in sorted(rows, key=lambda row: row[0], reverse=True):
+        if score >= 0.5:
+            print(score, value)

 def calcOverlay(l1, l2):
    snowball = nltk.SnowballStemmer(language='german')
@ -102,31 +98,31 @@ def calcOverlay(l1, l2):
    counter = 0

    for x in l2:
-        for token in nltk.word_tokenize(x): 
-            if token in stopset:
-                continue
-            stemmed = snowball.stem(token)
-            for l in l1:
-                if l == stemmed:
-                    counter +=1
-                     
+        for l in l1:
+            if l == x[0]:
+                #print(l)
+                counter +=1
+    counter = counter / len(l2)                 
    return counter
 #


 start = time.time()
-slow()
+#slow()
 end = time.time()
 print("\n", end - start, "\n")  

+stemmed = stemInput(inputArr)
+
+start = time.time()
+indx = faster(stemmed)  
+end = time.time()
+printDict(indx)
+print("\n", end - start, "\n")  
+

 start = time.time()
-faster()  
-end = time.time()
-print("\n", end - start, "\n")  
-
-
-start = time.time()
-fastes()
+indx = fastes(stemmed)
 end = time.time()
+printDict(indx)
 print("\n", end - start, "\n")