In this notebook we'll demonstrate that the lda2vec word vectors keep their word2vec-like properties. You can download the vectors, follow along at home, and make your own queries if you'd like.
Sums:

- silicon valley ~ california + technology
- uber ~ taxis + company
- baidu ~ china + search engine

Analogies:

- Mark Zuckerberg - Facebook + Amazon = Jeff Bezos
- Hacker News - story + article = StackOverflow
- VIM - terminal + graphics = Photoshop

And slightly more whimsically:

- vegetables - eat + drink = tea
- scala - features + simple = haskell
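In other words, X ~ A + B means X sits at (or near) the top of the vocabulary when ranked by cosine similarity against the sum of the vectors for A and B, and the analogies use the familiar A - B + C arithmetic. Roughly, with $v_w$ denoting a word's unit-normalized vector (this is a restatement of the examples above, not anything from the original notebook):

$$
\text{baidu} \;\approx\; \operatorname*{arg\,max}_{w}\ \cos\!\big(v_w,\ v_{\text{china}} + v_{\text{search engine}}\big),
\qquad
\text{Jeff Bezos} \;\approx\; \operatorname*{arg\,max}_{w}\ \cos\!\big(v_w,\ v_{\text{Mark Zuckerberg}} - v_{\text{Facebook}} + v_{\text{Amazon}}\big)
$$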
In [37]:
!wget https://zenodo.org/record/49903/files/vocab.npy
In [36]:
!wget https://zenodo.org/record/49903/files/word_vectors.npy
You don't need to run the commented-out code below unless you've trained your own model. Otherwise, just download the word vectors from the URLs above.
In [32]:
#from lda2vec_model import LDA2Vec
#from chainer import serializers
#import numpy as np
#import pandas as pd
#import pickle
#
#features = pd.read_pickle("../data/features.pd")
#vocab = np.load("../data/vocab")
#npz = np.load(open('topics.story.pyldavis.npz', 'r'))
#dat = {k: v for (k, v) in npz.iteritems()}
#vocab = dat['vocab'].tolist()
#dat = np.load("../data/data.npz")
#n_stories = features.story_id_codes.max() + 1
#n_units = 256
#n_vocab = dat['flattened'].max() + 1
#model = LDA2Vec(n_stories=n_stories, n_story_topics=40,
# n_authors=5664, n_author_topics=20,
# n_units=n_units, n_vocab=n_vocab, counts=np.zeros(n_vocab),
# n_samples=15)
#serializers.load_hdf5("/home/chris/lda2vec-12/examples/hacker_news/lda2vec/lda2vec.hdf5", model)
#np.save("word_vectors", model.sampler.W.data)
#np.save("vocab", vocab)
In [2]:
import numpy as np
word_vectors_raw = np.load("word_vectors.npy")
vocab = np.load("vocab.npy").tolist()
L2-normalize the word vectors, so that a dot product against them is a cosine similarity.
In [15]:
word_vectors = word_vectors_raw / np.linalg.norm(word_vectors_raw, axis=-1)[:, None]
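As a quick sanity check (an addition, not part of the original notebook, and assuming the raw matrix has no all-zero rows), you can confirm that every row now has unit norm, so the plain dot products used below really are cosine similarities:

import numpy as np

# Each row of word_vectors should have (approximately) unit L2 norm after normalization.
norms = np.linalg.norm(word_vectors, axis=-1)
assert np.allclose(norms, 1.0), "word vectors are not unit-normalized"
print("rows: %d, dims: %d, max |norm - 1|: %.2e"
      % (word_vectors.shape[0], word_vectors.shape[1], np.abs(norms - 1).max()))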
In [16]:
def get_vector(token):
    index = vocab.index(token)
    return word_vectors[index, :].copy()

def most_similar(token, n=20):
    word_vector = get_vector(token)
    similarities = np.dot(word_vectors, word_vector)
    top = np.argsort(similarities)[::-1][:n]
    return [vocab[i] for i in top]

# This is Levy & Goldberg's 3CosMul metric
# Based on the Gensim implementation: https://github.com/piskvorky/gensim/blob/master/gensim/models/word2vec.py
def cosmul(positives, negatives, topn=20):
    positive = [get_vector(p) for p in positives]
    negative = [get_vector(n) for n in negatives]
    pos_dists = [((1 + np.dot(word_vectors, term)) / 2.) for term in positive]
    neg_dists = [((1 + np.dot(word_vectors, term)) / 2.) for term in negative]
    dists = np.prod(pos_dists, axis=0) / (np.prod(neg_dists, axis=0) + 1e-6)
    idxs = np.argsort(dists)[::-1][:topn]
    return [vocab[i] for i in idxs if (vocab[i] not in positives) and (vocab[i] not in negatives)]

def most_similar_posneg(positives, negatives, topn=20):
    positive = np.sum([get_vector(p) for p in positives], axis=0)
    negative = np.sum([get_vector(n) for n in negatives], axis=0)
    vector = positive - negative
    dists = np.dot(word_vectors, vector)
    idxs = np.argsort(dists)[::-1][:topn]
    return [vocab[i] for i in idxs if (vocab[i] not in positives) and (vocab[i] not in negatives)]
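For reference, the quantity cosmul ranks by is Levy & Goldberg's 3CosMul objective, with each cosine shifted into [0, 1] and a small $\varepsilon$ in the denominator to avoid division by zero; this just restates the code above:

$$
\operatorname*{arg\,max}_{w}\ \frac{\prod_{a \in \text{positives}} \tfrac{1}{2}\big(1 + \cos(v_w, v_a)\big)}{\prod_{b \in \text{negatives}} \tfrac{1}{2}\big(1 + \cos(v_w, v_b)\big) + \varepsilon},
\qquad \varepsilon = 10^{-6}
$$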
In [17]:
most_similar('san francisco')
Out[17]:
In [18]:
cosmul(['california', 'technology'], [], topn=20)
Out[18]:
In [19]:
cosmul(['digital', 'currency'], [], topn=20)
Out[19]:
In [20]:
cosmul(['text editor', 'terminal'], [], topn=20)
Out[20]:
In [35]:
cosmul(['china'], [], topn=20)
Out[35]:
In [21]:
cosmul(['china', 'search engine'], [], topn=20)
Out[21]:
In [22]:
cosmul(['microsoft'], [], topn=20)
Out[22]:
In [23]:
cosmul(['microsoft', 'cloud'], [], topn=20)
Out[23]:
Queen is several rankings down, so not exactly the same as out-of-the-box word2vec!
In [24]:
cosmul(['king', 'woman'], ['man'], topn=20)
Out[24]:
In [25]:
print 'Most similar'
print '\n'.join(most_similar('mark zuckerberg'))
print '\nCosmul'
pos = ['mark zuckerberg', 'amazon']
neg = ['facebook']
print '\n'.join(cosmul(pos, neg, topn=20))
print '\nTraditional Similarity'
print '\n'.join(most_similar_posneg(pos, neg, topn=20))
In [26]:
pos = ['hacker news', 'question']
neg = ['story']
print 'Most similar'
print '\n'.join(most_similar(pos[0]))
print '\nCosmul'
print '\n'.join(cosmul(pos, neg, topn=20))
print '\nTraditional Similarity'
print '\n'.join(most_similar_posneg(pos, neg, topn=20))
In [27]:
pos = ['san francisco']
neg = []
print 'Most similar'
print '\n'.join(most_similar(pos[0]))
print '\nCosmul'
print '\n'.join(cosmul(pos, neg, topn=20))
print '\nTraditional Similarity'
print '\n'.join(most_similar_posneg(pos, neg, topn=20))
In [28]:
pos = ['nlp', 'image']
neg = ['text']
print 'Most similar'
print '\n'.join(most_similar(pos[0]))
print '\nCosmul'
print '\n'.join(cosmul(pos, neg, topn=20))
print '\nTraditional Similarity'
print '\n'.join(most_similar_posneg(pos, neg, topn=20))
In [29]:
pos = ['vim', 'graphics']
neg = ['terminal']
print 'Most similar'
print '\n'.join(most_similar(pos[0]))
print '\nCosmul'
print '\n'.join(cosmul(pos, neg, topn=20))
print '\nTraditional Similarity'
print '\n'.join(most_similar_posneg(pos, neg, topn=20))
In [30]:
pos = ['vegetables', 'drink']
neg = ['eat']
print 'Most similar'
print '\n'.join(most_similar(pos[0]))
print '\nCosmul'
print '\n'.join(cosmul(pos, neg, topn=20))
print '\nTraditional Similarity'
print '\n'.join(most_similar_posneg(pos, neg, topn=20))
In [31]:
pos = ['lda']
neg = []
print 'Most similar'
print '\n'.join(most_similar(pos[0]))