In [1]:
#looking at gensim
import gensim, logging
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt
import numpy as np
import sys
sys.path.append("../../rnn_disf_detection")
from data.load import switchboardfold
%matplotlib inline
In [2]:
# Load a previously saved gensim Word2Vec model from the file 'bnc_swbd_clean_50'
# (a relative path, so it is resolved against the current working directory).
new_model = gensim.models.Word2Vec.load('bnc_swbd_clean_50')
In [3]:
# Sanity check: display the words the loaded model ranks as most similar to "you".
new_model.most_similar("you")
Out[3]:
In [4]:
# Sanity check: display the words the loaded model ranks as most similar to "sad".
new_model.most_similar("sad")
Out[4]:
In [5]:
# Show the model's layer1_size attribute (presumably the embedding
# dimensionality -- confirm against the gensim version in use).
# Parenthesised form prints the same on Python 2 and is valid on Python 3.
print(new_model.layer1_size)
In [6]:
def plot_word_spread(model, vocab, special_vocab, save_path='embeddings2.png'):
    """Project word vectors to 2-D with t-SNE, plot them with labels, save a PNG.

    Parameters
    ----------
    model : mapping-like
        Any object supporting ``model[word]`` that returns the vector for
        ``word`` (this notebook passes a plain dict of word -> vector).
    vocab : iterable of str
        Words to plot; each one is looked up in ``model``.
    special_vocab : container of str
        Words whose text labels are drawn in red; all others are blue.
    save_path : str, optional
        File the figure is written to as PNG. Defaults to
        ``'embeddings2.png'``, the name that was previously hard-coded.

    Returns
    -------
    None
        The figure is written to ``save_path`` and then displayed.
    """
    vocab = list(vocab)  # materialise once: we need a stable order and indices
    X = np.array([model[word] for word in vocab], dtype="float")

    # Fixed random_state so the 2-D layout is repeatable between runs.
    X_2d = TSNE(random_state=0).fit_transform(X)

    fig = plt.figure(figsize=(15, 10))
    plt.scatter(X_2d[:, 0], X_2d[:, 1])
    for no, word in enumerate(vocab):
        colour = "red" if word in special_vocab else "blue"
        # Offset each label by a little Gaussian noise (presumably so labels
        # of neighbouring points do not sit exactly on top of each other).
        plt.text(X_2d[no, 0] + np.random.normal(0, 1),
                 X_2d[no, 1] + np.random.normal(0, 1), word, color=colour)

    # Save BEFORE showing: with the inline backend the figure is closed once
    # plt.show() returns, so a later plt.savefig() would write an empty image.
    # Saving through the figure handle also avoids depending on pyplot's
    # "current figure" state.
    fig.savefig(save_path, format="png")
    plt.show()
In [7]:
# Get the vocabulary dict and the data splits from SWBD via switchboardfold().
train_set, valid_set, test_set, valid_set_alltags, test_set_alltags, dic, train_dict = switchboardfold()
# len() of the mapping itself avoids building a throwaway list of items, and the
# parenthesised print works on both Python 2 and Python 3 with identical output.
print(str(len(dic['words2idx'])) + " words in vocab")
In [8]:
# Load the embedding matrix stored as a .npy file at the path below.
# NOTE(review): this is an absolute path on one developer's machine, so the cell
# cannot run anywhere else as-is -- consider a path relative to a configurable
# data directory.
emb_path1 = "/Users/julianhough/Bender/disf-elman-forward-10/epoch_13/embeddings.npy"
# Other cells index emb with the integer values of dic['words2idx'], so
# presumably row i is the vector of the word with index i -- TODO confirm.
emb = np.load(emb_path1)
In [9]:
# Build a simple dict, usable like the gensim model (full_vocab[word] -> vector):
# each word maps to its row of emb, for the first `lim` words in ascending
# order of their index in dic['words2idx'].
fillers = ['uh', 'um', 'oh']
lim = 300
full_vocab = dict()
my_vocab = []
# Slice to exactly `lim` entries. The earlier counter/break version only stopped
# once count exceeded lim (checked after the increment), so it collected
# lim + 1 words.
for key, val in sorted(dic['words2idx'].items(), key=lambda x: x[1])[:lim]:
    my_vocab.append(key)
    full_vocab[key] = emb[val]
count = len(my_vocab)  # still defined in case a later cell reads `count`
In [10]:
# Make sure every filler word has a vector in full_vocab and appears in my_vocab.
for f in fillers:
    full_vocab[f] = emb[dic['words2idx'][f]]
    # Append only when missing: a filler may already be among the selected
    # words, and this cell may be re-run. A duplicate entry in my_vocab would be
    # plotted and labelled twice.
    if f not in my_vocab:
        my_vocab.append(f)
In [11]:
# Plot the selected words using the dict of embedding rows as the "model";
# the filler words are passed as special_vocab, so their labels are drawn in red.
plot_word_spread(full_vocab, my_vocab, fillers)
In [ ]: