In [1]:
from gensim.models import Word2Vec
import numpy as np
from sklearn.manifold import TSNE
import pandas as pd
from bokeh.charts import Scatter, show, output_file
from bokeh.models import LabelSet, ColumnDataSource
In [2]:
# Print numpy floats in plain decimal form rather than scientific notation.
np.set_printoptions(suppress=True)

# Load the previously trained word2vec model from disk.
model = Word2Vec.load(
    "300features_40minwords_10context_AnnouncementTitle"
)

# t-SNE reducer used to lower the dimension of each word vector;
# random_state is pinned to a fixed seed.
tsne = TSNE(random_state=0)
In [3]:
# Display the shape of the model's syn0 array
# (presumably (vocabulary size, vector dimension) -- TODO confirm).
model.syn0.shape
Out[3]:
In [3]:
# Vocabulary words, in the order used for both the vector matrix and the
# plot labels built later.
dic = list(model.vocab.keys())

# Stack one vector per word into a single 2-D float array in one call.
# The row width is taken from the vectors themselves rather than a
# hard-coded 100, so the cell works for any embedding size (the model file
# name suggests 300 features), and the matrix is not re-copied per word.
vec = np.array([model[word] for word in dic], dtype=float)

# Reduce the word vectors with the t-SNE reducer configured earlier.
low_dim = tsne.fit_transform(vec)
In [ ]:
## Create the word graph
# One row per vocabulary word: reduced coordinates plus the word itself.
df = pd.DataFrame(low_dim, columns=['x', 'y']).assign(vocab=dic)

# Data source backing the text labels (same coordinates, same words).
source = ColumnDataSource(
    data=dict(
        x=df['x'],
        y=df['y'],
        vocab=dic,
    )
)

# Scatter plot of the reduced word vectors.
p = Scatter(df, x='x', y='y', title="vocab in hypterdimention")

# Label each point with its word, offset slightly from the marker.
labels = LabelSet(
    x='x',
    y='y',
    text='vocab',
    level='glyph',
    x_offset=5,
    y_offset=5,
    source=source,
    render_mode='canvas',
)
p.add_layout(labels)

# output_file("word2vec.html", title="word2vec")
show(p)