word2vec is a technique for encoding words (or other tokens in a sequence) into high-dimensional vectors. These vectors can be used for similarity lookups and arithmetic operations. The gensim library provides an implementation of the word2vec algorithm.
In [ ]:
from gensim.models import Word2Vec
Instead of spending days training our own model on lots of text, we can load a pre-trained model. This one was trained by Google on news articles and covers three million unique words and phrases, each embedded in a 300-dimensional space. The file is 3.6GB (1.6GB compressed) and can take almost two minutes to load from disk.
In [ ]:
model = Word2Vec.load_word2vec_format('models/GoogleNews-vectors-negative300.bin.gz', binary=True)
We can look up the vector for a single word like this:
In [ ]:
vector = model['Germany']
print(vector.shape)
In [ ]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=[30, 10])
plt.plot(vector)
plt.plot(model['China'])
plt.show()
In [ ]:
from textblob import TextBlob
def word_similar_cosmul_p(word, topn=10, do_print=True, words_only=False):
    # print and/or return the topn most similar words to `word` using the cosmul measure
    if do_print: print "word:", word
    similarities = model.most_similar_cosmul([word], [], topn)
    if do_print: print_similarities(similarities)
    if words_only:
        similarities = map(lambda sim: sim[0], similarities)
    return similarities

def print_similarities(similarity_list):
    for similarity in similarity_list:
        print similarity[0].ljust(18), similarity[1]

def tag(word):
    # part-of-speech tag of a single word, e.g. ('win', 'NN')
    return TextBlob(word).tags[0]

# print tag('win')
If we sort the dimensions by one vector, we can see the similar words follow it, and the dissimilar one does not.
In [ ]:
plt.figure(figsize=[30, 3])
plt.plot(sorted(zip(model['Tuesday'], model['Wednesday'], model['Thursday'], model['teapot'])))
plt.show()
Using these vectors we can make metaphors, such as "What is the Berlin of Japan (instead of Germany)?", also written:
Berlin + Japan - Germany
In [ ]:
model.most_similar(positive=['Berlin', 'Japan'], negative=['Germany'])
Or, "What is a queen as a man (instead of a woman)?" Notice that the words are case sensitive.
queen + man - woman
In [ ]:
model.most_similar(positive=['queen', 'man'], negative=['woman'])
For each of the 1,000 most common words, we can look up its most similar word within that same set.
In [ ]:
limit = 1000
limited = model.index2word[:limit]
matches = [(model.most_similar(positive=[word], topn=1, restrict_vocab=limit)[0], word) for word in limited]
sorted([(x[1],x[0],y) for x,y in matches], reverse=True)[:200]
gensim also provides an interface for measuring how similar two words are.
In [ ]:
model.similarity('computer', 'calculator')
In [ ]:
model.similarity('computer', 'rain')
Or for finding which word doesn't belong, as in "which one of these things is not like the other".
In [ ]:
plt.figure(figsize=[30, 8])
words = ['hammer','shoe','handsaw','pliers','king','space','astronaut']
for word in words:
    plt.plot(model[word])
plt.show()
not_watch = model.doesnt_match(words) # try "saw" instead
An idea for sentence generation: take the non-stop words in a sentence and replace each one with a matching word (same part-of-speech tag) that is similar in meaning, carries a different sentiment, or shifts the sentence into another domain.
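A minimal sketch of that idea, assuming the model and the tag() helper defined above; the stop-word list, the 0.6 similarity threshold, and the example sentence are arbitrary choices for illustration.
In [ ]:
# Sketch of the sentence-generation idea above: swap each non-stop word for a
# nearby word2vec neighbour that carries the same part-of-speech tag.
# Assumes the model and the tag() helper defined earlier; STOP_WORDS and the
# 0.6 similarity threshold are arbitrary choices for illustration.
STOP_WORDS = set(['the', 'a', 'an', 'of', 'to', 'is', 'was', 'in', 'on', 'and'])

def paraphrase(sentence, topn=10):
    out = []
    for word in sentence.split():
        if word.lower() in STOP_WORDS or word not in model.vocab:
            out.append(word)
            continue
        pos = tag(word)[1]
        replacement = word
        for candidate, score in model.most_similar([word], topn=topn):
            # keep only reasonably close neighbours with the same POS tag
            if score > 0.6 and candidate != word and tag(candidate)[1] == pos:
                replacement = candidate
                break
        out.append(replacement)
    return ' '.join(out)

print(paraphrase('the king wore a golden crown'))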
In [ ]:
not_watch
In [ ]:
model.doesnt_match("sweet sour salty wet".split()) # add umami
As you work with word2vec you will notice that opposites are more similar to each other than they are to very different things:
In [ ]:
model.similarity('hot', 'cold')
In [ ]:
model.similarity('hot', 'laptop')
For a set of words we can plot their pairwise similarities as a matrix. With groups of related words (like the commented-out triples below), each group of three is similar within itself but not to the other groups.
In [ ]:
import numpy as np
# words = [str(i) for i in range(9)]
# words = ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh', 'eighth', 'ninth', 'tenth', 'eleventh', 'twelfth']
# words = ['January','February','March','April','May','June','July','August','September','October','November','December']
words = ['Monday', 'Britain', 'Cool', 'Barcelona', 'Tango', 'Art', 'Friday']
# words = ['breakfast', 'lunch', 'dinner','computer', 'desktop', 'laptop','sun', 'moon', 'stars']
vectors = [model[word] for word in words]
similar = [np.dot(vectors, vector) for vector in vectors]
plt.figure(figsize=(8, 6))
plt.pcolor(np.array(similar), cmap='viridis')
plt.colorbar()
plt.yticks(0.5 + np.arange(len(words)), words)
plt.xticks(0.5 + np.arange(len(words)), [''] * len(words))
plt.show()
Another idea is to find a path from one word to another. Instead of doing a linear interpolation from one vector to another, we need to do a high-dimensional spherical interpolation. This example uses code from Transorthogonal Linguistics. They do some extra filtering to get better results, and you can test it here.
In [ ]:
from gensim import matutils
def slerp_points(x0, x1, slerp_n):
    theta = np.arccos(x0.dot(x1))
    st = np.sin(theta)
    T = np.linspace(0, 1, slerp_n)
    L1 = np.sin((1 - T) * theta) / st
    L2 = np.sin(T * theta) / st
    SL = np.outer(L1, x0) + np.outer(L2, x1)
    return (SL.T / np.linalg.norm(SL, axis=1)).T

def print_path(start, end, steps=10, limit=10000, topn=10):
    x0 = model[start]
    x1 = model[end]
    all_words = [start, end]
    between = slerp_points(x0, x1, steps)
    limited = model.syn0norm if limit is None else model.syn0norm[:limit]
    for x in between:
        sims = np.dot(limited, matutils.unitvec(x))
        best = matutils.argsort(sims, topn=topn, reverse=True)
        print ' '.join([model.index2word[i] for i in best if not model.index2word[i] in all_words])
In [ ]:
print_path('man', 'woman')
If you have a set of words, you can sort them so that the path from one word to the next always stays between similar words. This can be done with a traveling salesperson solver, like this one in Python or this one from Google. Here's a list of moods sorted by a traveling salesperson solver using word2vec vector distances: ...fearless courageous brave daring bold framed blank fake phony inflated manipulated...
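As a rough sketch of that sorting step, a greedy nearest-neighbour pass can stand in for a real traveling salesperson solver; the moods list below is just a small sample for illustration.
In [ ]:
# Greedy nearest-neighbour ordering as a cheap stand-in for a real traveling
# salesperson solver: repeatedly append the unvisited word that is most
# similar to the last word on the path. The moods list is a small sample.
moods = ['fearless', 'courageous', 'brave', 'bold', 'blank', 'fake', 'phony']

def greedy_word_path(word_list):
    remaining = list(word_list)
    path = [remaining.pop(0)]
    while remaining:
        nearest = max(remaining, key=lambda w: model.similarity(path[-1], w))
        remaining.remove(nearest)
        path.append(nearest)
    return path

print(greedy_word_path(moods))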
We can also use PCA or t-SNE to plot a collection of word2vec vectors in 2D. Let's see if there is a principal axis along which the "capital city" relationship lies.
In [ ]:
plt.figure(figsize=(20, 5))
plt.plot(model['China']-model['Beijing'])
plt.plot(model['Japan']-model['Tokyo'])
# plt.plot(model['man']-model['woman'])
plt.show()
In [ ]:
'''
pairs = [
'China', 'Beijing',
'Japan', 'Tokyo',
'Russia', 'Moscow',
'South_Korea', 'Seoul',
'Indonesia', 'Jakarta',
'United_Kingdom', 'London',
'Peru', 'Lima',
'Thailand', 'Bangkok',
'Iran', 'Tehran',
'Egypt', 'Cairo',
'Germany','Berlin',
'Barcelona', 'Berlin']
'''
pairs = [
'good','better',
'fast','faster',
'big','bigger',
'small',model.most_similar(positive=['better', 'small'], negative=['good'])[0][0]]
# pairs = [
# 'eyes', 'glasses',
# 'hand', 'pencil',
# 'head', 'hat',
# 'feet', 'shoes',
# 'legs', 'pants']
pairs_vectors = [model[pair] for pair in pairs]
In [ ]:
from sklearn import decomposition
pca = decomposition.PCA(n_components=2)
pairs_pca = pca.fit_transform(pairs_vectors)
In [ ]:
plt.figure(figsize=(8,8))
plt.plot(pairs_pca[:,0], pairs_pca[:,1], '.')
pair_pts = zip(pairs_pca[::2], pairs_pca[1::2])
pair_names = zip(pairs[::2], pairs[1::2])
for pt_pair, name_pair in zip(pair_pts, pair_names):
    pt0, pt1 = pt_pair
    plt.arrow(pt0[0], pt0[1], pt1[0] - pt0[0], pt1[1] - pt0[1])
    name0, name1 = name_pair
    plt.annotate(name0, pt0, va='top')
    plt.annotate(name1, pt1, va='top')
plt.show()
If you look at the relationships between a bunch of countries, you'll see the most related countries end up next to each other. Note: some countries are missing because they're not in the word2vec vocabulary. For more word lists, check out corpora.
In [ ]:
words = []
with open('data/animals.txt') as f:
    words = [line.strip() for line in f]
print(len(words))
print(words)
In [ ]:
from textblob import Word
words_vectors = []
# grow the word list with close, same-POS neighbours until it roughly doubles in size
max_words = len(words) * 2
for word in words:
    try:
        word_tag = tag(word)
        token = word.replace(' ', '_')
        words_vectors.append(model[token])
        similar_words = word_similar_cosmul_p(word, do_print=False)
        for sim in similar_words:
            single = Word(sim[0]).singularize()
            if tag(single)[1] == word_tag[1] and single != word and sim[1] > 0.7:
                words_vectors.append(model[single])
                words.append(single)
            if max_words <= len(words):
                break
        if max_words <= len(words):
            break
    except KeyError:
        print 'Ignoring "' + word + '"'
print len(words), words
In [ ]:
from random import randint
# measure the similarity of two randomly chosen vocabulary words
max_index = len(model.vocab) - 1
print max_index
wordVocab = [k for (k, v) in model.vocab.iteritems()]
print model.similarity(wordVocab[randint(0, max_index)], wordVocab[randint(0, max_index)])
# peek at the first few vocabulary entries and collect their vectors
words_vectors2 = []
for i, (k, v) in enumerate(model.vocab.iteritems()):
    if i < 10:
        print k
        words_vectors2.append(model[k])
print words_vectors2[3]
def export_vocab(model, filename):
    word_dict = {}
    for (k, v) in model.vocab.iteritems():
        word_dict[k] = v
    return word_dict
#word_dict = export_vocab(model,'model')
#len(word_dict)
In [ ]:
from sklearn import manifold
tsne = manifold.TSNE(n_components=2, perplexity=10, learning_rate=100, verbose=2)
#%time countries_tsne = tsne.fit_transform(countries_vectors)
%time countries_tsne = tsne.fit_transform(words_vectors)  # vectors for the (expanded) animal word list
In [ ]:
plt.figure(figsize=(30,30))
plt.plot(countries_tsne[:,0], countries_tsne[:,1], '.')
for pt, name in zip(countries_tsne, words):
    plt.annotate(name, pt)
plt.axis('off')
plt.savefig("test.svg", format="svg")
plt.show()
In [ ]:
import rasterfairy
#print countries_tsne
num = len(countries_tsne)
arrangements = rasterfairy.getArrangements(num)
masks = rasterfairy.arrangementListToRasterMasks(arrangements)
c = 0
for mask in masks:
    print c, "Type:", mask['type'], "\tProportion:", mask['width'], "x", mask['height'], "\tHexagonal:", mask['hex']
    c += 1
rasterMask = masks[0]
grid_xy, (width,height) = rasterfairy.transformPointCloud2D(countries_tsne,target=rasterMask)
In [ ]:
plt.figure(figsize=(80,30))
plt.plot(grid_xy[:,0], grid_xy[:,1], '.')
for pt, name in zip(grid_xy, words):
    plt.annotate(name, pt)
plt.axis('off')
plt.savefig("test.svg", format="svg")
plt.show()
Idea: connect words and show the k nearest sentences; constellations.
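One possible starting point for that note, as a sketch: connect each word in a small seed list to its k nearest neighbours in the model, giving a little constellation graph. The seed list and k are arbitrary, and showing nearby sentences would additionally require a sentence corpus, which this notebook doesn't load.
In [ ]:
# Sketch of the "constellations" idea: link each seed word to its k nearest
# neighbours in the embedding space. The seed list and k=3 are arbitrary;
# extending this to nearest sentences would need a sentence corpus.
seed_words = ['Berlin', 'Tokyo', 'jazz', 'tango', 'astronaut']
k = 3

edges = []
for word in seed_words:
    for neighbour, score in model.most_similar([word], topn=k):
        edges.append((word, neighbour, score))

for word, neighbour, score in edges:
    print('%s -- %s (%.2f)' % (word, neighbour, score))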