sorted by overlap works, much simpler db model

This commit is contained in:
Askill 2020-04-15 14:16:28 +02:00
parent ffc41d617b
commit 853b749fde
5 changed files with 90 additions and 77 deletions

View File

@ -13,18 +13,7 @@ Base = declarative_base()
Session = sessionmaker(bind=engine)
# https://docs.sqlalchemy.org/en/13/orm/basic_relationships.html#association-object
class Link(Base):
    """Association object joining one recipe row to one ingredient row.

    The two foreign keys together form the primary key, and the row also
    carries the ingredient amount and its measurement unit as text.
    """

    __tablename__ = 'link'

    # Composite primary key: one row per (recipe, ingredient) pair.
    recipe_id = Column(
        Integer, ForeignKey('recipe.recipe_id'), primary_key=True
    )
    ingredient_id = Column(
        Integer, ForeignKey('ingredient.ingredient_id'), primary_key=True
    )

    ingredient_amount = Column('ingredient_amount', Text)
    # Measurement unit that belongs to ingredient_amount.
    ingredient_amount_mu = Column('ingredient_amount_mu', Text)

    # back_populates names the attribute on the target class that mirrors
    # this relationship.
    recipe = relationship("Recipe", back_populates="ingredient")
    ingredient = relationship("Ingredient", back_populates="recipe")

    def ingredients(self):
        """Return the name of the ingredient this link refers to."""
        linked = self.ingredient
        return linked.name
class Recipe(Base):
__tablename__ = "recipe"
@ -33,7 +22,8 @@ class Recipe(Base):
instructions = Column('instructions', Text)
url = Column('url', Text)
img = Column('img', LargeBinary)
ingredient = relationship("Link", back_populates="recipe")
ingredient = relationship("Ingredient", backref="recipe")
trunk = relationship("Trunk", backref="recipe")
def ingredients(self):
l = []
@ -69,14 +59,18 @@ class Ingredient(Base):
__tablename__ = "ingredient"
ingredient_id = Column('ingredient_id', Integer, primary_key=True, autoincrement=True)
name = Column('name', Text)
recipe = relationship("Link", back_populates="ingredient")
trunks = relationship("Trunk")
ingredient_amount = Column('ingredient_amount', Text)
ingredient_amount_mu = Column('ingredient_amount_mu', Text) # measurement unit
recipe_id = Column(Integer, ForeignKey('recipe.recipe_id'))
class Trunk(Base):
    """Row of the ``trunk`` table: a name linked to an ingredient and a recipe.

    NOTE(review): ``name`` presumably holds a stemmed ingredient token --
    confirm against the code that inserts Trunk rows.
    """

    __tablename__ = "trunk"

    trunk_id = Column(
        'trunk_id', Integer, primary_key=True, autoincrement=True
    )
    name = Column('name', Text)

    # Foreign keys to the ingredient and recipe rows this trunk belongs to.
    ingredient_id = Column(Integer, ForeignKey('ingredient.ingredient_id'))
    recipe_id = Column(Integer, ForeignKey('recipe.recipe_id'))
Base.metadata.create_all(engine)

View File

@ -1,7 +1,21 @@
import json
import cv2
import base64
from application.db import Session, Recipe, Ingredient, Link, Trunk
import nltk as nltk
from nltk.corpus import stopwords
from application.db import Session, Recipe, Ingredient, Trunk
def stemWord(word):
    """Return the German Snowball stems of the tokens found in ``word``.

    ``word`` is tokenized with NLTK. German stop words, the characters
    ``(``, ``)`` and ``,``, and every token shorter than four characters
    are dropped; the remaining tokens are stemmed and returned in their
    original order.
    """
    # set("(),") yields the three single characters, not one string.
    ignored = set(stopwords.words('german')) | set("(),")
    stemmer = nltk.SnowballStemmer(language='german')
    return [
        stemmer.stem(tok)
        for tok in nltk.word_tokenize(word)
        if tok not in ignored and len(tok) >= 4
    ]
def migrate(path):
recs = ""
@ -9,6 +23,8 @@ def migrate(path):
recs = json.load(file)
dbSession = Session()
counter = 0
leng = len(recs)
for key, value in recs.items():
name=key
resString=value[0]
@ -16,11 +32,16 @@ def migrate(path):
img=value[3].encode()
r = Recipe(name=name, instructions=resString, url=link, img=img)
for x, y in value[1].items():
a = Link(ingredient_amount=y)
a.ingredient = Ingredient(name=x)
a = Ingredient(name=x, ingredient_amount=y)
r.ingredient.append(a)
for x in stemWord(a.name):
t = Trunk(name=x)
r.trunk.append(t)
dbSession.add(r)
dbSession.commit()
counter+=1
print(counter/leng)
migrate('./data/recs.json')

View File

@ -8,7 +8,7 @@ import random
import traceback
import cv2
import base64
from application.db import Session, Recipe, Ingredient, Link, Trunk
from application.db import Session, Recipe, Ingredient, Trunk
import nltk as nltk
from nltk.corpus import stopwords
@ -126,6 +126,8 @@ def getRecipe(links):
sleep(random.randint(0, 5))
return recs
def stemIngred():
dbSession = Session()
stopset = set(stopwords.words('german'))
@ -135,7 +137,7 @@ def stemIngred():
for x in dbSession.query(Ingredient).all():
snowball = nltk.SnowballStemmer(language='german')
for token in nltk.word_tokenize(x.name):
if token in stopset or len(token) < 3:
if token in stopset or len(token) < 4:
continue
stemmed = snowball.stem(token)
@ -153,7 +155,7 @@ with open('./data/links.json') as file:
#recs = getRecipe(links)
stemIngred()
#stemIngred()
#with open('./data/recs.json', 'w', encoding="utf-8") as file:
# json.dump(recs, file, ensure_ascii=False)

108
test.py
View File

@ -1,12 +1,12 @@
from application.db import Session, Recipe, Ingredient, Link, Trunk
from application.db import Session, Recipe, Ingredient, Trunk
import nltk as nltk
from nltk.corpus import stopwords
import time
dbSession = Session()
inputArr = ["butter", "milch", "eier", "käse"]
maxMissing = 4
inputArr = ["butter", "milch", "eier", "mehl", "zucker"]
maxMissing = 10
def slow():
recipes = dbSession.query(Recipe).all()
@ -36,32 +36,36 @@ def slow():
# for xx in x:
# print(xx)
def faster():
def faster(inputArr):
indx = {}
for inpu in inputArr:
ids = []
for x in dbSession.query(Ingredient).filter(Ingredient.name.contains(inpu)).all():
for x in dbSession.query(Trunk.recipe_id).filter(Trunk.name.contains(inpu)).all():
if str(x[0]) not in indx:
indx[str(x[0])] = 0
for y in x.recipe:
if dbSession.query(Link).filter(Link.recipe_id==y.recipe_id).count() > len(inputArr) + maxMissing:
continue
if str(y.recipe_id) not in indx:
indx[str(y.recipe_id)] = 0
indx[str(y.recipe_id)] += 1
indx[str(x[0])] += 1
outDict = {}
for key, value in indx.items():
ingred = dbSession.query(Recipe).filter(Recipe.recipe_id==key).first().ingredients()
outDict[calcOverlay(inputArr, ingred)] = (dbSession.query(Recipe).filter(Recipe.recipe_id==key).first().name, key, ingred)
print(outDict)
return(indx)
def fastes():
def fastes(inputArr):
    """Count, per recipe, the trunk rows whose name equals a given stem.

    Args:
        inputArr: Iterable of strings; each one is compared for equality
            against ``Trunk.name``.

    Returns:
        dict mapping ``str(recipe_id)`` to the number of matching trunk
        rows found across all entries of ``inputArr``.
    """
    indx = {}
    for stem in inputArr:
        matches = dbSession.query(Trunk.recipe_id).filter(Trunk.name == stem)
        for row in matches.all():
            # Only one column was selected, so the recipe id is row[0].
            key = str(row[0])
            indx[key] = indx.get(key, 0) + 1
    return indx
def stemInput(inputArr):
inputArr2 = []
snowball = nltk.SnowballStemmer(language='german')
@ -72,26 +76,18 @@ def fastes():
continue
inputArr2.append(snowball.stem(word))
for inpu in inputArr2:
ids = []
for xx in dbSession.query(Trunk).filter(Trunk.name == inpu).all():
for x in dbSession.query(Ingredient).filter(xx.ingredient_id == Ingredient.ingredient_id).all():
for y in x.recipe:
if dbSession.query(Link).filter(Link.recipe_id==y.recipe_id).count() > len(inputArr) + maxMissing:
continue
if str(y.recipe_id) not in indx:
indx[str(y.recipe_id)] = 0
indx[str(y.recipe_id)] += 1
outDict = {}
for key, value in indx.items():
ingred = dbSession.query(Recipe).filter(Recipe.recipe_id==key).first().ingredients()
outDict[calcOverlay(inputArr, ingred)] = (dbSession.query(Recipe).filter(Recipe.recipe_id==key).first().name, key, ingred)
print(outDict)
return inputArr2
#
def printDict(indx, min_overlap=0.5):
    """Print the matched recipes, highest ingredient overlap first.

    For every recipe id in ``indx`` the recipe's trunk names are scored
    against the module-level ``inputArr`` with ``calcOverlay``. Each recipe
    whose score is at least ``min_overlap`` is printed as
    ``score (recipe name, recipe id, ingredient names)``.

    Results are kept in a list rather than a dict keyed by score, so
    recipes that share a score are all printed instead of overwriting one
    another.

    Args:
        indx: Mapping of ``str(recipe_id)`` to a hit count. Only the keys
            are used.
        min_overlap: Lowest score that is still printed.
    """
    scored = []
    for key in sorted(indx):
        recipe_id = int(key)
        trunk_names = (
            dbSession.query(Trunk.name)
            .filter(Trunk.recipe_id == recipe_id)
            .all()
        )
        recipe = (
            dbSession.query(Recipe)
            .filter(Recipe.recipe_id == recipe_id)
            .first()
        )
        ingredient_names = (
            dbSession.query(Ingredient.name)
            .filter(Ingredient.recipe_id == recipe_id)
            .all()
        )
        score = calcOverlay(inputArr, trunk_names)
        scored.append((score, (recipe.name, key, ingredient_names)))

    # Sort on the score alone; the sort is stable, so recipes with equal
    # scores keep their recipe-id order.
    scored.sort(key=lambda entry: entry[0], reverse=True)
    for score, value in scored:
        if score >= min_overlap:
            print(score, value)
def calcOverlay(l1, l2):
snowball = nltk.SnowballStemmer(language='german')
@ -102,31 +98,31 @@ def calcOverlay(l1, l2):
counter = 0
for x in l2:
for token in nltk.word_tokenize(x):
if token in stopset:
continue
stemmed = snowball.stem(token)
for l in l1:
if l == stemmed:
counter +=1
for l in l1:
if l == x[0]:
#print(l)
counter +=1
counter = counter / len(l2)
return counter
#
start = time.time()
slow()
#slow()
end = time.time()
print("\n", end - start, "\n")
stemmed = stemInput(inputArr)
start = time.time()
indx = faster(stemmed)
end = time.time()
printDict(indx)
print("\n", end - start, "\n")
start = time.time()
faster()
end = time.time()
print("\n", end - start, "\n")
start = time.time()
fastes()
indx = fastes(stemmed)
end = time.time()
printDict(indx)
print("\n", end - start, "\n")