In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
In [2]:
from pylab import rcParams
import mpld3
import sys
import numpy as np
import random
import math
import tensorflow as tf
import matplotlib.pyplot as plt
np.random.seed(0)
sys.path.append("../Utils/")
rcParams['figure.figsize'] = 8, 8
mpld3.enable_notebook()
In [3]:
from readWikiData import *
In [4]:
def build_co_occur_matrix(sentences, vocab_size, window_size, cc_matrix=None):
    # Weighted co-occurrence counts: every context word within `window_size`
    # positions of the centre word contributes 1/distance to the pair's count.
    X = np.zeros((vocab_size, vocab_size))
    for sentence in sentences:
        n = len(sentence)
        for i in xrange(n):
            wi = sentence[i]
            start = max(0, i - window_size)
            end = min(n, i + window_size + 1)  # +1 so the right context also spans window_size words
            # window runs past the start of the sentence: credit the overflow
            # to index 0, which is assumed to be the sentence-boundary token
            if i - window_size < 0:
                points = 1.0/(1 + i)
                X[wi, 0] += points
                X[0, wi] += points
            # window runs past the end of the sentence
            if i + window_size > n:
                points = 1.0/(n - i)
                X[wi, 0] += points
                X[0, wi] += points
            # context words to the left of position i
            for j in xrange(start, i):
                points = 1.0/(i - j)
                wj = sentence[j]
                X[wi, wj] += points
                X[wj, wi] += points
            # context words to the right of position i
            for j in xrange(i + 1, end):
                points = 1.0/(j - i)
                wj = sentence[j]
                X[wi, wj] += points
                X[wj, wi] += points
    if cc_matrix is not None:
        np.save(cc_matrix, X)  # optionally cache the matrix to disk
    return X
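As a quick sanity check (this cell is not in the original notebook), the function can be run on a single toy sentence with arbitrary integer word ids: a pair of words at distance d is credited 1/d from each of the two centre positions, so adjacent words end up with a count of 2.0 and words two positions apart with 1.0.
In [ ]:
toy_X = build_co_occur_matrix(sentences=[[2, 3, 4, 5]], vocab_size=6, window_size=3)
print toy_X[3, 4]   # adjacent pair -> 2.0
print toy_X[3, 5]   # two positions apart -> 1.0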
The GloVe model down-weights very frequent co-occurrences: each raw count $X_{ij}$ is passed through the weighting function

$$f(X_{ij}) = \begin{cases}\left(\dfrac{X_{ij}}{x_{max}}\right)^{\alpha} & \text{if } X_{ij} < x_{max} \\ 1 & \text{otherwise}\end{cases}$$

with $x_{max} = 100$ and $\alpha = 0.75$ used below.
In [5]:
def getfX(X, xmax, alpha):
    # GloVe weighting function: (X/xmax)^alpha below the cap, 1.0 at or above it
    fX = np.zeros(X.shape)
    fX[X < xmax] = (X[X < xmax]/float(xmax))**alpha
    fX[X >= xmax] = 1.0
    return fX
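As a concrete illustration (this cell is not in the original notebook), a pair seen 20 times gets weight $(20/100)^{0.75} \approx 0.30$, while anything at or above the cap gets the full weight of 1:
In [ ]:
print getfX(X=np.array([[20.0, 150.0]]), xmax=100, alpha=0.75)   # ~[[0.30, 1.0]]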
The GloVe cost function learns a low-dimensional factorization of the log-transformed co-occurrence matrix by minimizing

$$J = \sum_{i,j} f(X_{ij})\,\left(w_i^{\top} u_j + b_i + c_j + \mu - \log(1 + X_{ij})\right)^2$$

where $w_i$ and $u_j$ are the word and context embedding vectors, $b$ and $c$ are bias terms, and $\mu$ is the global mean of the log counts (an L2 penalty on all parameters is added when the graph is built). The $\log(1 + X)$ transform is applied first so that zero counts stay finite:
In [6]:
def normalize(X):
    # log-transform the counts; the +1 keeps zero entries finite
    return np.log(X + 1)
In [8]:
sentences, word2idx, idx2word_dict, original_sentence = get_wikipedia_data(n_files=10, n_vocab=2000, by_paragraph=True)
In [9]:
X = build_co_occur_matrix(sentences=sentences, vocab_size=len(word2idx), window_size=3)
In [10]:
fX = getfX(X=X, xmax=100, alpha=0.75)
In [11]:
logX = normalize(X=X)
In [12]:
V = len(word2idx)
embedding_size = 50
learning_rate = 10e-5
epochs = 10001
reg=0.01
In [13]:
W = np.random.randn(V, embedding_size)/np.sqrt(V + embedding_size)
b = np.zeros(V)
U = np.random.randn(V, embedding_size)/np.sqrt(V + embedding_size)
c = np.zeros(V)
mu = normalize(X).mean()  # global mean of the log counts, used as a fixed offset in the model
In [14]:
tfW = tf.Variable(W.astype(np.float32))
tfb = tf.Variable(b.reshape(V, 1).astype(np.float32))  # word biases as a column vector
tfU = tf.Variable(U.astype(np.float32))
tfc = tf.Variable(c.reshape(1, V).astype(np.float32))  # context biases as a row vector
tfLogX = tf.placeholder(tf.float32, shape=(V, V))
tffX = tf.placeholder(tf.float32, shape=(V, V))
In [15]:
delta = tf.matmul(tfW, tf.transpose(tfU)) + tfb + tfc + mu - tfLogX
cost = tf.reduce_sum(tffX * delta * delta)
# L2 regularization on all parameters
for param in (tfW, tfb, tfU, tfc):
    cost += reg*tf.reduce_sum(param * param)
In [16]:
train_op = tf.train.MomentumOptimizer(learning_rate, momentum=0.9).minimize(cost)
init = tf.global_variables_initializer()
session = tf.InteractiveSession()
session.run(init)
In [17]:
costs = []
sentence_indexes = range(len(sentences))
for epoch in xrange(epochs):
    # track the (unregularized) cost with the current numpy copies of the parameters
    delta = W.dot(U.T) + b.reshape(V, 1) + c.reshape(1, V) + mu - logX
    cost = ( fX * delta * delta ).sum()
    costs.append(cost)
    if epoch % 1000 == 0:
        print "epoch:", epoch, "cost:", cost
    # one full-batch gradient step, then pull the updated parameters back into numpy
    session.run(train_op, feed_dict={tfLogX: logX, tffX: fX})
    W, b, U, c = session.run([tfW, tfb, tfU, tfc])
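Once training has finished it is worth persisting the learned parameters so the embeddings can be reused without retraining; a minimal sketch (this cell is not in the original notebook and the file name is arbitrary):
In [ ]:
np.savez("glove_wiki_50d.npz", W=W, U=U, b=b, c=c)   # file name is arbitrary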
In [18]:
plt.plot(costs)
plt.show()
In [19]:
word2vec = np.mean([W, U], axis=0)  # final embeddings: average of the word and context matrices (the GloVe paper uses their sum, which differs only by a scale factor)
In [20]:
idx2word = {v:k for k, v in word2idx.items()}
In [21]:
from sklearn.manifold import TSNE
model = TSNE()
Z = model.fit_transform(word2vec)
In [22]:
plt.scatter(Z[:,0], Z[:,1])
plt.title("glove word2vec")
rcParams['figure.figsize'] = 12, 12
for i in xrange(len(idx2word)):
    try:
        plt.annotate(s=idx2word[i].encode("utf8"), xy=(Z[i,0], Z[i,1]))
    except:
        # skip any word that cannot be encoded/annotated
        print "bad string:", idx2word[i]
plt.show()
In [23]:
def find_analogies(w1, w2, w3, We, word2idx):
    # solve the analogy w1 - w2 + w3 ~= ?  (e.g. king - man + woman ~= queen)
    king = We[word2idx[w1]]
    man = We[word2idx[w2]]
    woman = We[word2idx[w3]]
    v0 = king - man + woman

    def dist1(a, b):
        return np.linalg.norm(a - b)
    def dist2(a, b):
        return 1 - a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))

    for dist, name in [(dist1, 'Euclidean'), (dist2, 'cosine')]:
        # nearest word to v0 under each distance, excluding the query words
        min_dist = float('inf')
        best_word = ''
        for word, idx in word2idx.iteritems():
            if word not in (w1, w2, w3):
                v1 = We[idx]
                d = dist(v0, v1)
                if d < min_dist:
                    min_dist = d
                    best_word = word
        print "closest match by", name, "distance:", best_word
        print w1, "-", w2, "=", best_word, "-", w3
In [24]:
we = word2vec
w2i = word2idx
In [25]:
find_analogies(w1='1', w2='3', w3='2', We=we, word2idx=w2i)
find_analogies(w1='england', w2='london', w3='france', We=we, word2idx=w2i)
find_analogies(w1='english', w2='french', w3='german', We=we, word2idx=w2i)
find_analogies(w1='east', w2='west', w3='north', We=we, word2idx=w2i)
find_analogies(w1='lower', w2='higher', w3='low', We=we, word2idx=w2i)
find_analogies(w1='two', w2='three', w3='four', We=we, word2idx=w2i)
find_analogies(w1='december', w2='november', w3='january', We=we, word2idx=w2i)
find_analogies(w1='december', w2='november', w3='july', We=we, word2idx=w2i)
find_analogies(w1='two', w2='three', w3='five', We=we, word2idx=w2i)
find_analogies(w1='eastern', w2='western', w3='northern', We=we, word2idx=w2i)
find_analogies(w1='king', w2='queen', w3='women', We=we, word2idx=w2i)
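A complementary check (this cell is not in the original notebook) is to list a word's nearest neighbours directly; the helper below is a minimal sketch using cosine distance over the same embedding matrix and vocabulary.
In [ ]:
def nearest_neighbors(word, We, word2idx, idx2word, k=5):
    # rank every word by cosine distance to the query word's vector
    v = We[word2idx[word]]
    dists = 1 - We.dot(v) / (np.linalg.norm(We, axis=1) * np.linalg.norm(v))
    best = np.argsort(dists)[1:k+1]   # index 0 is the query word itself
    return [idx2word[i] for i in best]

print nearest_neighbors('london', we, w2i, idx2word)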
In [26]:
words = ['east', 'eastern', 'west', 'western', 'north', 'northern', 'south', 'southern']
idx = [word2idx[w] for w in words]
tsne = TSNE()
Z = tsne.fit_transform(we)
Z = Z[idx]
plt.scatter(Z[:,0], Z[:,1])
for i in xrange(len(words)):
    plt.annotate(s=words[i], xy=(Z[i,0], Z[i,1]))
plt.show()
In [27]:
words = ['japan', 'japanese', 'england', 'english', 'australia', 'australian', 'china', \
'chinese', 'italy', 'italian', 'french', 'france', 'spain', 'spanish']
idx = [word2idx[w] for w in words]
tsne = TSNE()
Z = tsne.fit_transform(we)
Z = Z[idx]
plt.scatter(Z[:,0], Z[:,1])
for i in xrange(len(words)):
    plt.annotate(s=words[i], xy=(Z[i,0], Z[i,1]))
plt.show()
In [28]:
words = ['large', 'larger', 'largest',
'great', 'greater', 'greatest',
'high', 'higher', 'highest']
idx = [word2idx[w] for w in words]
tsne = TSNE()
Z = tsne.fit_transform(we)
Z = Z[idx]
plt.scatter(Z[:,0], Z[:,1])
for i in xrange(len(words)):
    plt.annotate(s=words[i], xy=(Z[i,0], Z[i,1]))
plt.show()