In [1]:
import gensim

# Load pre-trained Polish word vectors (this notebook uses the pre-4.0 gensim API).
# Alternative models trained on Polish Wikipedia, Wikibooks and Wiktionary dumps:
# model = gensim.models.KeyedVectors.load('data/w2v-186891476-50000-300-5-5-plwiki-20170820.bin')
# model = gensim.models.KeyedVectors.load('data/w2v-2155073-50000-300-5-5-plwikibooks-20170820.bin')
# model = gensim.models.KeyedVectors.load('data/w2v-2427886-50000-300-5-5-plwiktionary-20170820.bin')
model = gensim.models.KeyedVectors.load('data/w2v-773752559-1000000-300-5-5-OpenSubtitles2016.bin')
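
The filename appears to encode the training setup: corpus token count, vocabulary size, vector dimensionality, window, and min-count (so a 1,000,000-word vocabulary of 300-dimensional vectors here); that reading is inferred from the name, not documented. A quick sanity check on what was loaded, using the same pre-4.0 gensim attributes as the rest of this notebook:

In [ ]:
print(len(model.wv.index2word))  # vocabulary size; expected 1000000
print(model.wv.vector_size)      # vector dimensionality; expected 300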
In [2]:
# Analogy (additive 3CosAdd): woman + king - man; expected top hit 'królowa' (queen).
model.most_similar(positive=['kobieta', 'król'], negative=['mężczyzna'])
Out[2]:
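
most_similar implements the additive 3CosAdd analogy: each input vector is unit-normalized, the positives and negatives are combined with signs +1/-1, and every other word is ranked by cosine similarity to the result. The same arithmetic by hand (most_similar additionally excludes the input words from the returned list):

In [ ]:
import numpy as np

def unit(w):
    v = model.wv.word_vec(w)
    return v / np.linalg.norm(v)

query = unit('kobieta') + unit('król') - unit('mężczyzna')
query /= np.linalg.norm(query)
print(np.dot(query, unit('królowa')))  # cosine similarity to the expected answer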
In [3]:
# Analogy (multiplicative 3CosMul): woman + brother - man; expected 'siostra' (sister).
model.most_similar_cosmul(positive=['kobieta', 'brat'], negative=['mężczyzna'])
Out[3]:
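
most_similar_cosmul instead uses the multiplicative 3CosMul objective of Levy and Goldberg (2014): a candidate d is scored as cos(d, kobieta) * cos(d, brat) / (cos(d, mężczyzna) + eps), with each cosine shifted into [0, 1], which keeps a single large term from dominating the way it can in the additive sum. A manual sketch for the expected answer 'siostra' (sister), reusing unit() from the cell above; the exact eps is gensim's internal constant, so the value below is an assumption:

In [ ]:
def shifted_cos(w1, w2):
    # cosine shifted from [-1, 1] into [0, 1], as in Levy & Goldberg (2014)
    return (1 + np.dot(unit(w1), unit(w2))) / 2

score = (shifted_cos('siostra', 'kobieta') * shifted_cos('siostra', 'brat')
         / (shifted_cos('siostra', 'mężczyzna') + 1e-6))
print(score)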
In [24]:
# Neighbours of 'żonaty' (married, of a man) with the 'kobieta' (woman) direction removed.
model.most_similar(positive=['żonaty'], negative=['kobieta'])
Out[24]:
In [4]:
# Similarity of the near-synonyms 'włączyć' (to switch on) and 'uruchomić' (to start up).
model.wv.similarity('włączyć', 'uruchomić')
Out[4]:
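
similarity is plain cosine similarity between the two word vectors; the same value computed by hand:

In [ ]:
a = model.wv.word_vec('włączyć')
b = model.wv.word_vec('uruchomić')
print(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))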
In [5]:
# Nearest neighbours of 'żółty' (yellow); other colour terms are expected.
model.most_similar(positive=['żółty'])
Out[5]:
In [6]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE
# Take the 50 words at frequency ranks 1000-1049 and extend the selection with
# the top-8 3CosMul neighbours of each word. np.append already returns a flat
# 1-D array; set() removes duplicate words across the neighbour lists.
top_n = 50
top_words = model.wv.index2word[1000:1000 + top_n]
top_words = np.append(np.array(top_words),
                      np.array([np.array(model.most_similar_cosmul(w, topn=8))[:, 0] for w in top_words]))
top_words = set(top_words)
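
Since neighbour lists of related words overlap heavily, the set holds at most 50 + 50 * 8 = 450 entries; a quick count of what survived deduplication:

In [ ]:
print(len(top_words))  # <= 450 after removing duplicate neighbours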
In [7]:
# Fix an explicit word order first so that vectors and labels stay aligned.
word_indexes = list(top_words)
word_weights = [model.wv.word_vec(w, use_norm=False) for w in word_indexes]
In [8]:
viz_words = len(word_weights)
# Project the 300-d vectors down to 2-D with t-SNE (scikit-learn defaults).
tsne = TSNE()
# embed_tsne = tsne.fit_transform(model.wv.syn0[:viz_words, :])
embed_tsne = tsne.fit_transform(word_weights)
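
TSNE() runs with scikit-learn defaults (2 components, perplexity 30, random initialization), so the layout changes between runs. A reproducible variant, with an arbitrary random_state rather than the original run's setting:

In [ ]:
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
embed_tsne = tsne.fit_transform(np.array(word_weights))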
In [9]:
# Scatter each word at its 2-D t-SNE position and label the point with the word itself.
fig, ax = plt.subplots(figsize=(14, 14))
for idx in range(viz_words):
    plt.scatter(*embed_tsne[idx, :], color='steelblue')
    plt.annotate(word_indexes[idx], (embed_tsne[idx, 0], embed_tsne[idx, 1]), alpha=0.7)
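
With up to 450 labels on one axis the figure gets dense; saving it at a higher DPI keeps the annotations legible (the filename is an arbitrary choice):

In [ ]:
fig.savefig('tsne_opensubtitles_words.png', dpi=200, bbox_inches='tight')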