Very simple word2vec example @ nlintz's tutoral

In [21]:
import collections
import numpy as np
import tensorflow as tf
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline  

print ("Packages loaded.")

Packages loaded.

In [22]:
# Configuration
batch_size     = 20
embedding_size = 2     # This is just for visualization
num_sampled    = 15    # Number of negative examples to sample.

In [23]:
# Sample sentences
sentences = ["the quick brown fox jumped over the lazy dog",
            "I love cats and dogs",
            "we all love cats and dogs",
            "cats and dogs are great",
            "sung likes cats",
            "she loves dogs",
            "cats can be very independent",
            "cats are great companions when they want to be",
            "cats are playful",
            "cats are natural hunters",
            "It's raining cats and dogs",
            "dogs and cats love sung"]
# 'sentences' is 'list' 
print ("'sentences' is %s and length is %d." 
       % (type(sentences), len(sentences)))

'sentences' is <class 'list'> and length is 12.

In [24]:
words = " ".join(sentences).split() 
print ("'words' is %s and length is %d." % (type(words), len(words)))
print (words)

'words' is <class 'list'> and length is 62.
['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog', 'I', 'love', 'cats', 'and', 'dogs', 'we', 'all', 'love', 'cats', 'and', 'dogs', 'cats', 'and', 'dogs', 'are', 'great', 'sung', 'likes', 'cats', 'she', 'loves', 'dogs', 'cats', 'can', 'be', 'very', 'independent', 'cats', 'are', 'great', 'companions', 'when', 'they', 'want', 'to', 'be', 'cats', 'are', 'playful', 'cats', 'are', 'natural', 'hunters', "It's", 'raining', 'cats', 'and', 'dogs', 'dogs', 'and', 'cats', 'love', 'sung']

In [25]:
count = collections.Counter(words).most_common() 
print ("'count' is %s and length is %d." % (type(count), len(count)))
print (("Word count of top five is %s") % (count[:5]))
print (count)

'count' is <class 'list'> and length is 35.
Word count of top five is [('cats', 10), ('dogs', 6), ('and', 5), ('are', 4), ('love', 3)]
[('cats', 10), ('dogs', 6), ('and', 5), ('are', 4), ('love', 3), ('the', 2), ('be', 2), ('great', 2), ('sung', 2), ('jumped', 1), ('independent', 1), ('dog', 1), ('when', 1), ('natural', 1), ('very', 1), ('playful', 1), ('quick', 1), ('lazy', 1), ('to', 1), ('companions', 1), ('hunters', 1), ('likes', 1), ('brown', 1), ('they', 1), ('she', 1), ('want', 1), ('we', 1), ('fox', 1), ('I', 1), ("It's", 1), ('over', 1), ('all', 1), ('can', 1), ('raining', 1), ('loves', 1)]

In [26]:
print (words[0:5])
print (count[0:3])

['the', 'quick', 'brown', 'fox', 'jumped']
[('cats', 10), ('dogs', 6), ('and', 5)]

In [27]:
rdic = [i[0] for i in count] #reverse dic, idx -> word
dic = {w: i for i, w in enumerate(rdic)} #dic, word -> id
voc_size = len(dic) # Number of vocabulary 
print ("'rdic' is %s and length is %d." % (type(rdic), len(rdic)))
print ("'dic' is %s and length is %d." % (type(dic), len(dic)))

'rdic' is <class 'list'> and length is 35.
'dic' is <class 'dict'> and length is 35.

In [28]:
print (rdic)

['cats', 'dogs', 'and', 'are', 'love', 'the', 'be', 'great', 'sung', 'jumped', 'independent', 'dog', 'when', 'natural', 'very', 'playful', 'quick', 'lazy', 'to', 'companions', 'hunters', 'likes', 'brown', 'they', 'she', 'want', 'we', 'fox', 'I', "It's", 'over', 'all', 'can', 'raining', 'loves']

In [29]:
print (dic)
revierse_dic = {v: k for k, v in dic.items()}

{'the': 5, 'jumped': 9, 'independent': 10, 'great': 7, 'when': 12, 'natural': 13, 'very': 14, 'be': 6, 'playful': 15, 'quick': 16, 'lazy': 17, 'to': 18, 'companions': 19, 'hunters': 20, 'all': 31, 'cats': 0, 'dog': 11, 'likes': 21, 'brown': 22, 'they': 23, 'she': 24, 'want': 25, 'dogs': 1, 'we': 26, 'fox': 27, 'are': 3, 'sung': 8, 'I': 28, "It's": 29, 'over': 30, 'love': 4, 'and': 2, 'can': 32, 'raining': 33, 'loves': 34}
{0: 'cats', 1: 'dogs', 2: 'and', 3: 'are', 4: 'love', 5: 'the', 6: 'be', 7: 'great', 8: 'sung', 9: 'jumped', 10: 'independent', 11: 'dog', 12: 'when', 13: 'natural', 14: 'very', 15: 'playful', 16: 'quick', 17: 'lazy', 18: 'to', 19: 'companions', 20: 'hunters', 21: 'likes', 22: 'brown', 23: 'they', 24: 'she', 25: 'want', 26: 'we', 27: 'fox', 28: 'I', 29: "It's", 30: 'over', 31: 'all', 32: 'can', 33: 'raining', 34: 'loves'}

In [30]:
print (rdic[0])
print (dic['cats'])


In [31]:
data = [dic[word] for word in words]
print ("'data' is %s and length is %d." % (type(data), len(data)))
print('Sample data: numbers: %s / words: %s'% (data[:10], [rdic[t] for t in data[:10]]))

'data' is <class 'list'> and length is 62.
Sample data: numbers: [5, 16, 22, 27, 9, 30, 5, 17, 11, 28] / words: ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog', 'I']

CBOW and SKIP Gram


In [32]:
cbow_pairs = [];
for i in range(1, len(data)-1) :
    cbow_pairs.append([[data[i-1], data[i+1]], data[i]]);
print('Context pairs: %s' % (cbow_pairs[:10]))

# the quick brown fox jumped over the lazy dog
revierse_dic[7], revierse_dic[26], revierse_dic[28]

Context pairs: [[[5, 22], 16], [[16, 27], 22], [[22, 9], 27], [[27, 30], 9], [[9, 5], 30], [[30, 17], 5], [[5, 11], 17], [[17, 28], 11], [[11, 4], 28], [[28, 0], 4]]
('great', 'we', 'I')

In [33]:
# (quick, the), (quick, brown), (brown, quick), (brown, fox), ...
# the quick brown fox jumped over the lazy dog

skip_gram_pairs = [];
for c in cbow_pairs:
    skip_gram_pairs.append([c[1], c[0][0]])
    skip_gram_pairs.append([c[1], c[0][1]])
print ("'skip_gram_pairs' is %s and length is %d."
       % (type(skip_gram_pairs), len(skip_gram_pairs)))
print('skip-gram pairs', skip_gram_pairs[:5])

# the quick brown fox jumped over the lazy dog
print(revierse_dic[28], revierse_dic[7])
print(revierse_dic[28], revierse_dic[26])

'skip_gram_pairs' is <class 'list'> and length is 120.
skip-gram pairs [[16, 5], [16, 22], [22, 16], [22, 27], [27, 22]]
I great
I we

In [34]:
def generate_batch(size):
    assert size < len(skip_gram_pairs)
    y_data = []
    r = np.random.choice(range(len(skip_gram_pairs)), size, replace=False)
    for i in r:
        x_data.append(skip_gram_pairs[i][0])  # n dim
        y_data.append([skip_gram_pairs[i][1]])  # n, 1 dim
    return x_data, y_data

# generate_batch test
x, y = generate_batch(3)
print ('Batches (x, y)', x,y)

print(revierse_dic[x[0]], revierse_dic[y[0][0]])
print(revierse_dic[x[1]], revierse_dic[y[1][0]])
print(revierse_dic[x[2]], revierse_dic[y[2][0]])

Batches (x, y) [29, 12, 5] [[33], [23], [17]]
It's raining
when they
the lazy

In [37]:
# Input data
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
# need to shape [batch_size, 1] for nn.nce_loss
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
# Ops and variables pinned to the CPU because of missing GPU implementation
with tf.device('/cpu:0'):
    # Look up embeddings for inputs.
    embeddings = tf.Variable(
        tf.random_uniform([voc_size, embedding_size], -1.0, 1.0))
    embed = tf.nn.embedding_lookup(embeddings, train_inputs) # lookup table

# Construct the variables for the NCE loss
nce_weights = tf.Variable(
    tf.random_uniform([voc_size, embedding_size],-1.0, 1.0))
nce_biases = tf.Variable(tf.zeros([voc_size]))

# Compute the average NCE loss for the batch.
# This does the magic:
#   tf.nn.nce_loss(weights, biases, inputs, labels, num_sampled, num_classes ...)
# It automatically draws negative samples when we evaluate the loss.
loss = tf.reduce_mean(
  tf.nn.nce_loss(nce_weights, nce_biases, embed, train_labels,
                 num_sampled, voc_size))

# Use the adam optimizer
train_op = tf.train.AdamOptimizer(1e-1).minimize(loss)

In [38]:
# Launch the graph in a session
with tf.Session() as sess:
    # Initializing all variables

    for step in range(100):
        batch_inputs, batch_labels = generate_batch(batch_size)
        _, loss_val =[train_op, loss],
                feed_dict={train_inputs: batch_inputs, train_labels: batch_labels})
        if step % 10 == 0:
          print("Loss at ", step, loss_val) # Report the loss

    # Final embeddings are ready for you to use. Need to normalize for practical use
    trained_embeddings = embeddings.eval()

# Show word2vec if dim is 2
if trained_embeddings.shape[1] == 2:
    labels = rdic[:10] # Show top 10 words
    for i, label in enumerate(labels):
        x, y = trained_embeddings[i,:]
        plt.scatter(x, y)
        plt.annotate(label, xy=(x, y), xytext=(5, 2),
            textcoords='offset points', ha='right', va='bottom')

WARNING:tensorflow:From <ipython-input-38-c84d511d3853>:5 in <module>.: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.global_variables_initializer` instead.
Instructions for updating:
Use `tf.global_variables_initializer` instead.
