In [26]:
from utils import GensimModels
from nltk.stem import WordNetLemmatizer
import Config
import numpy as np
import random
import csv

train = Config.path_culture
feat_dim = 10

gensimLoader = GensimModels.GensimModels()
model_loaded = gensimLoader.load_word2vec(path=Config.path_embeddings_ingredients)

cult2id = {}
id2cult = []
comp2id = {'Nan':0}
id2comp = ['Nan']
comp2cnt = {'Nan':0}

train_cult = []
train_comp = []
train_comp_len = []

comp_thr = 5 
max_comp_cnt = 0 
filtred_comp = 0 

train_f = open(train, 'r')
lines = train_f.readlines()[4:]
random.shuffle(lines)
train_thr = int(len(lines) * 0.8)

In [27]:
print "Build composer dictionary..."
for i, line in enumerate(lines):

    tokens = line.strip().split(',')
    culture = tokens[0]
    composers = tokens[1:]

    if cult2id.get(culture) is None:
        cult2id[culture] = len(cult2id)
        id2cult.append(culture)

    if comp_thr > len(composers):
        filtred_comp += 1
        continue

    #if max_comp_cnt < len(composers):
    #    max_comp_cnt = len(composers)

    for composer in composers:
        if comp2id.get(composer) is None:
            comp2id[composer] = len(comp2id)
            id2comp.append(composer)
            comp2cnt[composer] = 0.
        comp2cnt[composer] += 1

    train_cult.append(cult2id.get(culture))
    train_comp.append([comp2id.get(composer) for composer in composers])


Build composer dictionary...

In [28]:
for comp in train_comp:
    train_comp_len.append(len(comp))
    if len(comp) < max_comp_cnt:
        comp += [0]*(max_comp_cnt - len(comp))
        
f = open('ingr_engine.csv', 'wb')
wr = csv.writer(f)

wnl = WordNetLemmatizer()

wv = model_loaded.wv
w = model_loaded.index2word

In [ ]:
#print [model_loaded[idx] for idx in w]
for i, idx in enumerate(w):
    if idx not in id2comp:
        wr.writerow([i, idx, False])
    else:
        wr.writerow([i, idx, True])

In [29]:
total_comp = 0.
for cnt in comp2cnt.values():
    total_comp += cnt
print total_comp


861271.0

In [30]:
mu, sigma = 0, 1
compid2vec = []
unk_cnt = 0
add_cnt = 0
call_cnt = 0.
for idx, comp in enumerate(id2comp):
    if comp in wv:
        wr.writerow([idx, comp, comp2cnt[comp]/total_comp*100, True])
        compid2vec.append(model_loaded[comp])
        call_cnt += comp2cnt[comp]
    elif wnl.lemmatize(comp) in wv:
        wr.writerow([idx, comp, comp2cnt[comp]/total_comp*100, 'Modified'])
        compid2vec.append(model_loaded[wnl.lemmatize(comp)])
        call_cnt += comp2cnt[comp]
        add_cnt += 1
    elif comp.rstrip().split('_')[-1] in wv:
        wr.writerow([idx, comp, comp2cnt[comp]/total_comp*100, 'Modified'])
        compid2vec.append(model_loaded[comp.rstrip().split('_')[-1]])
        call_cnt += comp2cnt[comp]
        add_cnt += 1
    elif wnl.lemmatize(comp.rstrip().split('_')[-1]) in wv:
        wr.writerow([idx, comp, comp2cnt[comp]/total_comp*100, 'Modified'])
        compid2vec.append(model_loaded[wnl.lemmatize(comp.rstrip().split('_')[-1])])
        call_cnt += comp2cnt[comp]
        add_cnt += 1    
    else:
        wr.writerow([idx, comp, comp2cnt[comp]/total_comp*100, False])
        compid2vec.append(np.random.normal(mu, sigma, feat_dim))
        unk_cnt += 1
        
f.close()
print "added cnt :", add_cnt
print "unk cnt :", unk_cnt, "in", len(id2comp)
print "call cnt :", call_cnt, "in", total_comp
print "filtered composer count is", filtred_comp


added cnt : 547
unk cnt : 3098 in 3945
call cnt : 628219.0 in 861271.0
filtered composer count is 9498

In [ ]: