In [26]:
from utils import GensimModels
from nltk.stem import WordNetLemmatizer
import Config
import numpy as np
import random
import csv
# Paths / hyper-parameters.
train = Config.path_culture
feat_dim = 10   # dimension of the fallback random vectors
                # NOTE(review): presumably equals the embedding dim -- confirm

# Pre-trained ingredient embeddings.
gensimLoader = GensimModels.GensimModels()
model_loaded = gensimLoader.load_word2vec(path=Config.path_embeddings_ingredients)

# Vocabulary maps; id 0 is reserved for the 'Nan' placeholder composer.
cult2id = {}
id2cult = []
comp2id = {'Nan':0}
id2comp = ['Nan']
comp2cnt = {'Nan':0}

# Per-line training data, filled in by the next cell.
train_cult = []
train_comp = []
train_comp_len = []

comp_thr = 5       # lines with fewer composers than this are dropped
max_comp_cnt = 0
filtred_comp = 0   # count of dropped lines

# Read the data file (skipping the 4-line header) and close the handle
# promptly instead of leaking it for the rest of the session.
with open(train, 'r') as train_f:
    lines = train_f.readlines()[4:]

# NOTE(review): shuffle is unseeded, so the 80/20 split boundary below is
# not reproducible across runs -- consider random.seed(...) if that matters.
random.shuffle(lines)
train_thr = int(len(lines) * 0.8)
In [27]:
print "Build composer dictionary..."
for i, line in enumerate(lines):
tokens = line.strip().split(',')
culture = tokens[0]
composers = tokens[1:]
if cult2id.get(culture) is None:
cult2id[culture] = len(cult2id)
id2cult.append(culture)
if comp_thr > len(composers):
filtred_comp += 1
continue
#if max_comp_cnt < len(composers):
# max_comp_cnt = len(composers)
for composer in composers:
if comp2id.get(composer) is None:
comp2id[composer] = len(comp2id)
id2comp.append(composer)
comp2cnt[composer] = 0.
comp2cnt[composer] += 1
train_cult.append(cult2id.get(culture))
train_comp.append([comp2id.get(composer) for composer in composers])
In [28]:
# Record each sample's composer count and right-pad with id 0 ('Nan')
# up to max_comp_cnt.
# NOTE(review): max_comp_cnt is never updated (its update is commented out
# in the previous cell), so it is still 0 here and the padding branch never
# fires -- confirm whether that is intentional.
for comp_ids in train_comp:
    length = len(comp_ids)
    train_comp_len.append(length)
    if length < max_comp_cnt:
        comp_ids.extend([0] * (max_comp_cnt - length))

# CSV report of embedding coverage ('wb' is the Python 2 csv convention).
f = open('ingr_engine.csv', 'wb')
wr = csv.writer(f)
wnl = WordNetLemmatizer()
wv = model_loaded.wv           # word -> vector lookup
w = model_loaded.index2word    # vocabulary words in index order
In [ ]:
# For every embedding-vocabulary word, log whether it also appears in the
# composer list.  (id2comp is a list, so each membership test scans it.)
for i, word in enumerate(w):
    wr.writerow([i, word, word in id2comp])
In [29]:
total_comp = 0.
for cnt in comp2cnt.values():
total_comp += cnt
print total_comp
In [30]:
mu, sigma = 0, 1
compid2vec = []
unk_cnt = 0
add_cnt = 0
call_cnt = 0.
for idx, comp in enumerate(id2comp):
if comp in wv:
wr.writerow([idx, comp, comp2cnt[comp]/total_comp*100, True])
compid2vec.append(model_loaded[comp])
call_cnt += comp2cnt[comp]
elif wnl.lemmatize(comp) in wv:
wr.writerow([idx, comp, comp2cnt[comp]/total_comp*100, 'Modified'])
compid2vec.append(model_loaded[wnl.lemmatize(comp)])
call_cnt += comp2cnt[comp]
add_cnt += 1
elif comp.rstrip().split('_')[-1] in wv:
wr.writerow([idx, comp, comp2cnt[comp]/total_comp*100, 'Modified'])
compid2vec.append(model_loaded[comp.rstrip().split('_')[-1]])
call_cnt += comp2cnt[comp]
add_cnt += 1
elif wnl.lemmatize(comp.rstrip().split('_')[-1]) in wv:
wr.writerow([idx, comp, comp2cnt[comp]/total_comp*100, 'Modified'])
compid2vec.append(model_loaded[wnl.lemmatize(comp.rstrip().split('_')[-1])])
call_cnt += comp2cnt[comp]
add_cnt += 1
else:
wr.writerow([idx, comp, comp2cnt[comp]/total_comp*100, False])
compid2vec.append(np.random.normal(mu, sigma, feat_dim))
unk_cnt += 1
f.close()
print "added cnt :", add_cnt
print "unk cnt :", unk_cnt, "in", len(id2comp)
print "call cnt :", call_cnt, "in", total_comp
print "filtered composer count is", filtred_comp
In [ ]: