Word Analogy Preprocessing


In [ ]:
import os
import collections
import smart_open
import random
import numpy as np

def load_ingredients(path):
    """Load the ingredient metadata TSV into a dict: ingredient id -> remaining columns."""
    ingredients = {}
    with open(path, 'r') as f:
        for line in f:
            # skip comment/header lines
            if line.startswith('#'):
                continue
            line_split = line.rstrip().split('\t')
            ingredient_id = line_split[0]
            ingredients[ingredient_id] = line_split[1:]
    return ingredients

In [ ]:
path_data = '..' + os.sep + 'data'
path_ingr_info = path_data + os.sep + 'scientific_report' + os.sep + 'ingr_info.tsv'
ingredients = load_ingredients(path_ingr_info)
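
A quick check of what load_ingredients returns (a dict mapping each ingredient id to its remaining TSV columns). The column layout mentioned in the comment is an assumption, not taken from the actual file.

In [ ]:
# Hedged sanity check: assumes ingr_info.tsv rows look like "<id>\t<name>\t<category>",
# e.g. "0\tmagnolia_tripetala\tflower" (illustrative, not read from the real file).
sample_id = next(iter(ingredients))
print("ingredient id:", sample_id)
print("columns:", ingredients[sample_id])
print("total ingredients:", len(ingredients))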

In [ ]:
# collect the ingredient names (the first column after the id)
ingredients_list = []
for ingr_id in ingredients:
    ingredients_list.append(ingredients[ingr_id][0])

In [ ]:
target_ingr_list = ["flower", "root", "tree", "leaf", "seed", "peel", "grass"]


for target_ingr in target_ingr_list:
    # collect every ingredient name containing the target part keyword
    target_list = []
    for ingr in ingredients_list:
        if target_ingr in ingr:
            target_list.append(ingr)

    # for each match, strip the keyword and check whether the base ingredient
    # also exists, e.g. "orange_flower" -> "orange"
    for target in target_list:
        target_split = target.split("_")

        # skip derived oils such as "orange_flower_oil"
        if target_split[-1] == "oil":
            continue
        try:
            target_split.remove(target_ingr)
        except ValueError:  # keyword matched only as a substring, not a whole token
            continue

        target_combined = "_".join(target_split)

        if target_combined in ingredients_list:
            print("%s\t%s" % (target_combined, target))
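
If these base/derived pairs are meant to seed the analogy files used below, a minimal sketch of collecting and writing them (shown for "flower" only) could look like the following. The ':' header line and the output filename are assumptions, not the project's actual format.

In [ ]:
# Hedged sketch: write (base, derived) pairs as "base<TAB>derived" lines,
# preceded by a ':' section header. Filename and header text are illustrative.
pairs = []
for ingr in ingredients_list:
    if "flower" in ingr and not ingr.endswith("_oil"):
        tokens = ingr.split("_")
        if "flower" in tokens:
            tokens.remove("flower")
            base = "_".join(tokens)
            if base in ingredients_list:
                pairs.append((base, ingr))

with open("ingredient-anology-flower-example.txt", "w") as f_out:
    f_out.write(": partof-flower\n")
    for base, derived in pairs:
        f_out.write("%s\t%s\n" % (base, derived))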

Analogy Test


In [ ]:
import os
import collections
import smart_open
import random
import numpy as np

from utils import DataLoader, GensimModels, DataPlotter
gensimLoader = GensimModels.GensimModels()
path_results = ".." + os.sep + "results"
path_embeddings_ingredients = path_results + os.sep + 'embeddings' + os.sep + "embeddings_ingredients_f-5_rs-True_ns-50_charemb-True_dim50.bin"
model = gensimLoader.load_word2vec(path=path_embeddings_ingredients)

# model.wv.most_similar(positive=['woman', 'king'], negative=['man'])
# orange + citrus_oil - citrus_flower_oil = orange_flower
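
Before running the full test sets, a single analogy query can be tried directly against the loaded vectors. The ingredient tokens below are illustrative assumptions and may be missing from this model's vocabulary, hence the KeyError guard.

In [ ]:
# Hedged single-query example: "orange" is to "orange_flower" as "citrus" is to ...?
try:
    for word, score in model.most_similar(positive=['orange_flower', 'citrus'],
                                          negative=['orange'], topn=5):
        print("%-30s %.4f" % (word, score))
except KeyError as e:
    print("query word not in vocabulary:", e)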

In [62]:
path_data = '..' + os.sep + 'data'
path_ingr_anol = path_data + os.sep + 'ingredient-anology-4.txt'

anology_dict = {}
anology_list = []

# read every non-header line once (lines starting with ':' are section headers)
with open(path_ingr_anol, 'r') as f:
    pairs = [line.rstrip().split("\t") for line in f if not line.startswith(':')]

# combine every ordered pair of distinct lines into one analogy entry
for pair_origin in pairs:
    for pair_target in pairs:
        if pair_origin == pair_target:
            continue
        anology_list.append(pair_origin + pair_target)
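
For reference, the cell below peeks at one generated entry, assuming each data line holds a tab-separated (base, derived) pair so that every entry has four words; the example values in the comment are hypothetical.

In [ ]:
# Illustrative peek at one analogy entry, e.g.
# ['citrus', 'citrus_flower', 'orange', 'orange_flower'] (hypothetical values).
print("number of analogy entries:", len(anology_list))
if anology_list and len(anology_list[0]) >= 4:
    a, b, c, d = anology_list[0][:4]
    print("query: %s - %s + %s, expected answer: %s" % (b, a, c, d))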

In [63]:
path_data = '..' + os.sep + 'data'
path_ingr_anology_test = path_data + os.sep + 'ingredient-anology-test.txt'

with open(path_ingr_anology_test, 'r') as f:
    anology_list = []      # entries of the current section (first element is its ':' header)
    anology_list_all = []  # entries of every section, for the overall score
    for line in f:
        line_split = line.split()
        anology_list.append(line_split)

        if line[0] != ":" and line[0] != "#":
            anology_list_all.append(line_split)

        # a '#' line closes a section: print the ':' header and evaluate that section
        if line[0] == '#':
            print(anology_list[0])

            analogy_test(model, anology_list[1:-1])

            anology_list = []

    # finally, evaluate all sections combined
    analogy_test(model, anology_list_all)


[':', 'cooked-ingredients', '"smoked","boiled","grilled","dried","raw"']

Total Number of Test Sets: 1406
Top1 Accuracy: 484, 34.423898%
Top5 Accuracy: 919, 65.362731%
Top10 Accuracy: 1095, 77.880512%
Top50 Accuracy: 1294, 92.034139%


[':', 'partof-ingredients', '"flower","root","tree","leaf","seed","peel","grass"']

Total Number of Test Sets: 2256
Top1 Accuracy: 2, 0.088652%
Top5 Accuracy: 13, 0.576241%
Top10 Accuracy: 19, 0.842199%
Top50 Accuracy: 55, 2.437943%


[':', 'extracted-ingredients', '"oil","juice"']

Total Number of Test Sets: 35910
Top1 Accuracy: 712, 1.982735%
Top5 Accuracy: 956, 2.662211%
Top10 Accuracy: 1021, 2.843219%
Top50 Accuracy: 1160, 3.230298%


[':', 'local-ingredients', '"japanese_,"chinese_,"california_,"thai_,"spanish_,"asian_,"ethiopian_,"american_"']

Total Number of Test Sets: 1056
Top1 Accuracy: 37, 3.503788%
Top5 Accuracy: 101, 9.564394%
Top10 Accuracy: 119, 11.268939%
Top50 Accuracy: 136, 12.878788%



Total Number of Test Sets: 40628
Top1 Accuracy: 1235, 3.039776%
Top5 Accuracy: 1989, 4.895638%
Top10 Accuracy: 2254, 5.547898%
Top50 Accuracy: 2645, 6.510288%



In [52]:
def analogy_test(word_vectors, anology_list):
    """Evaluate a:b :: c:d analogies: query b - a + c and record the rank of d."""
    top1 = 0
    top5 = 0
    top10 = 0
    top50 = 0

    for anology in anology_list:
        word1 = anology[0]    # a, used as the negative term
        word1_1 = anology[1]  # b, positive term
        word2 = anology[2]    # c, positive term
        word2_1 = anology[3]  # d, the expected answer

        try:
            # b - a + c: take the top-50 nearest neighbours of the analogy query
            list_most_similar = word_vectors.most_similar(positive=[word1_1, word2], negative=[word1], topn=50)
            #list_most_similar_cosmul = word_vectors.most_similar_cosmul(positive=[word1, word2_1], negative=[word2], topn = 30)

        except KeyError:
            # out-of-vocabulary word: skip the query, but the entry still counts
            # towards num_test_sets below
            continue

        for ix, (word, score) in enumerate(list_most_similar):
            if word2_1 == word:
                rank = ix + 1
                if rank == 1:
                    top1 += 1
                if rank <= 5:
                    top5 += 1
                if rank <= 10:
                    top10 += 1
                if rank <= 50:
                    top50 += 1

    num_test_sets = len(anology_list)
    print("\nTotal Number of Test Sets:", num_test_sets)
    print("Top1 Accuracy: %d, %f%%" % (top1, top1 / float(num_test_sets) * 100))
    print("Top5 Accuracy: %d, %f%%" % (top5, top5 / float(num_test_sets) * 100))
    print("Top10 Accuracy: %d, %f%%" % (top10, top10 / float(num_test_sets) * 100))
    print("Top50 Accuracy: %d, %f%%\n\n" % (top50, top50 / float(num_test_sets) * 100))
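
A small usage sketch of analogy_test on a hand-written list. The ingredient tokens are illustrative assumptions and may be out of vocabulary, in which case they are skipped inside the function but still counted in the total.

In [ ]:
# analogy_test expects a list of [a, b, c, d] word lists (a:b :: c:d).
toy_analogies = [
    ['orange', 'orange_flower', 'citrus', 'citrus_flower'],
    ['grape', 'grape_juice', 'lemon', 'lemon_juice'],
]
analogy_test(model, toy_analogies)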

In [ ]: