In [ ]:
import os
import collections
import smart_open
import random
import numpy as np
def load_ingredients(path):
    """Load an ingredient table from a tab-separated text file.

    Lines whose first character is '#' are skipped, and so are blank
    (whitespace-only) lines. On every other line the first tab-separated
    field is used as the id and the remaining fields are kept as a list.

    Args:
        path: Path of the tab-separated file to read.

    Returns:
        A dict mapping each id string to the list of the remaining field
        strings on its line. If an id appears on several lines, the last
        line wins.
    """
    ingredients = {}
    with open(path, 'r') as f:
        for line in f:
            # Trailing whitespace (including the newline) is dropped before
            # the line is split into fields.
            record = line.rstrip()
            # A blank line would otherwise be stored under the empty-string
            # id with an empty field list, which breaks callers that index
            # the first field of every record.
            if not record or record[0] == '#':
                continue
            fields = record.split('\t')
            ingredients[fields[0]] = fields[1:]
    return ingredients
In [ ]:
# Build the path <..>/data/scientific_report/ingr_info.tsv and load the table.
path_data = os.path.join('..', 'data')
path_ingr_info = os.path.join(path_data, 'scientific_report', 'ingr_info.tsv')
ingredients = load_ingredients(path_ingr_info)
In [ ]:
# Keep only the first stored field of every record (presumably the
# ingredient name -- confirm against the layout of ingr_info.tsv).
ingredients_list = [ingredients[key][0] for key in ingredients]
In [ ]:
# For each plant-part keyword, print "<base>\t<derived>" pairs where the
# derived ingredient contains the keyword as one of its '_'-separated tokens
# and the name with that token removed is itself a known ingredient
# (e.g. "orange_peel" -> "orange", if "orange" is in the list).
target_ingr_list = ["flower","root","tree","leaf","seed","peel","grass"]

# Built once so the membership test in the inner loop is O(1) instead of a
# linear scan over the whole list.
known_ingredients = set(ingredients_list)

for target_ingr in target_ingr_list:
    # Candidate ingredients: every name that contains the keyword as a substring.
    target_list = [ingr for ingr in ingredients_list if target_ingr in ingr]

    for target in target_list:
        target_split = target.split("_")
        # Names ending in "oil" are excluded from the pairs.
        if target_split[-1] == "oil":
            continue
        try:
            # Drops the first token that equals the keyword exactly.
            target_split.remove(target_ingr)
        except ValueError:
            # The keyword only matched as a substring of a longer token.
            continue
        target_combined = "_".join(target_split)
        if target_combined in known_ingredients:
            print("%s\t%s" % (target_combined, target))
In [ ]:
import os
import collections
import smart_open
import random
import numpy as np
from utils import DataLoader, GensimModels, DataPlotter
# Load the stored ingredient embedding file through the project's
# GensimModels helper.
gensimLoader = GensimModels.GensimModels()
path_results = os.path.join("..", "results")
path_embeddings_ingredients = os.path.join(
    path_results,
    'embeddings',
    "embeddings_ingredients_f-5_rs-True_ns-50_charemb-True_dim50.bin"
)
model = gensimLoader.load_word2vec(path=path_embeddings_ingredients)
# Example of an analogy query and of the kind of relation being probed:
# model.wv.most_similar(positive=['woman', 'king'], negative=['man'])
# orange + citrus_oil - citrus_flower_oil = orange_flower
In [62]:
# Build analogy rows by pairing every line of the analogy file with every
# other line: each row is the tab-separated fields of the first line followed
# by those of the second.
path_data = '..' + os.sep + 'data'
path_ingr_anol = path_data + os.sep + 'ingredient-anology-4.txt'
anology_dict = {}

# Read and split the file once, inside a context manager so the handle is
# closed (the file used to be reopened for every line and never closed).
anology_rows = []
with open(path_ingr_anol, 'r') as f_anol:
    for line in f_anol:
        # Lines starting with ':' are skipped.
        if line[0] == ':':
            continue
        stripped = line.rstrip()
        # Blank lines carry no fields and would produce malformed rows.
        if not stripped:
            continue
        anology_rows.append(stripped.split("\t"))

# Ordered pairs (origin, target), in file order for both loops; pairs whose
# field lists are equal are left out.
anology_list = []
for row_origin in anology_rows:
    for row_target in anology_rows:
        if row_origin == row_target:
            continue
        anology_list.append(row_origin + row_target)
In [63]:
# Run the analogy benchmark over the test file: once for each '#'-terminated
# section, then once more over all analogy lines together.
path_data = os.path.join('..', 'data')
path_ingr_anology_test = os.path.join(path_data, 'ingredient-anology-test.txt')
with open(path_ingr_anology_test, 'r') as f:
    anology_list = []      # lines buffered since the last '#' line
    anology_list_all = []  # every line starting with neither ':' nor '#'
    for line in f:
        line_split = line.split()
        anology_list.append(line_split)
        if line[0] not in (":", "#"):
            anology_list_all.append(line_split)
        if line[0] == '#':
            # A '#' line closes the section: show the first buffered line,
            # evaluate everything between it and the '#' line, then reset.
            print(anology_list[0])
            analogy_test(model, anology_list[1:-1])
            anology_list = []
analogy_test(model, anology_list_all)
In [52]:
def analogy_test(word_vectors, anology_list):
    """Print top-1/5/10/50 hit counts and percentages for analogy queries.

    Each analogy is a sequence ``[a, b, c, expected]``. The query sent to
    ``word_vectors.most_similar`` is ``b + c - a`` (``positive=[b, c]``,
    ``negative=[a]``, ``topn=50``), and the analogy counts as a hit at
    rank r when ``expected`` is the r-th returned word.

    Args:
        word_vectors: Object exposing ``most_similar(positive, negative,
            topn)`` that returns ranked entries whose first item is the
            word (presumably a gensim word-vector model -- confirm).
        anology_list: Sequence of 4-item analogies as described above.

    Returns:
        None. The summary is printed to standard output.

    Notes:
        Analogies for which ``most_similar`` raises ``KeyError`` are
        skipped but still counted in the total, so they lower the
        reported percentages. An empty ``anology_list`` reports 0% for
        every cutoff instead of dividing by zero.
    """
    cutoffs = (1, 5, 10, 50)
    hits = dict.fromkeys(cutoffs, 0)

    for anology in anology_list:
        word1 = anology[0]    # subtracted term (negative)
        word1_1 = anology[1]  # added term (positive)
        word2 = anology[2]    # added term (positive)
        word2_1 = anology[3]  # expected answer
        try:
            list_most_similar = word_vectors.most_similar(
                positive=[word1_1, word2], negative=[word1], topn=50)
        except KeyError:
            # The query could not be answered; it stays in the denominator.
            continue

        for rank, entry in enumerate(list_most_similar, 1):
            if entry[0] == word2_1:
                for cutoff in cutoffs:
                    if rank <= cutoff:
                        hits[cutoff] += 1
                # An analogy is counted at most once.
                break

    num_test_sets = len(anology_list)

    def _percent(count):
        # Guard the empty case, which used to raise ZeroDivisionError.
        if not num_test_sets:
            return 0.0
        return count / float(num_test_sets) * 100

    # Single-argument print calls behave the same on Python 2 and 3.
    print("\nTotal Number of Test Sets: %d" % num_test_sets)
    print("Top1 Accuracy: %d, %f%%" % (hits[1], _percent(hits[1])))
    print("Top5 Accuracy: %d, %f%%" % (hits[5], _percent(hits[5])))
    print("Top10 Accuracy: %d, %f%%" % (hits[10], _percent(hits[10])))
    print("Top50 Accuracy: %d, %f%%\n\n" % (hits[50], _percent(hits[50])))
In [ ]: