Very simple word2vec example @ nlintz's tutoral


In [21]:
import collections
import numpy as np
import tensorflow as tf
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline  

print ("Packages loaded.")


Packages loaded.

In [22]:
# Configuration
batch_size     = 20
embedding_size = 2     # This is just for visualization
num_sampled    = 15    # Number of negative examples to sample.

In [23]:
# Sample sentences
sentences = ["the quick brown fox jumped over the lazy dog",
            "I love cats and dogs",
            "we all love cats and dogs",
            "cats and dogs are great",
            "sung likes cats",
            "she loves dogs",
            "cats can be very independent",
            "cats are great companions when they want to be",
            "cats are playful",
            "cats are natural hunters",
            "It's raining cats and dogs",
            "dogs and cats love sung"]
# 'sentences' is 'list' 
print ("'sentences' is %s and length is %d." 
       % (type(sentences), len(sentences)))


'sentences' is <class 'list'> and length is 12.

In [24]:
words = " ".join(sentences).split() 
print ("'words' is %s and length is %d." % (type(words), len(words)))
print (words)


'words' is <class 'list'> and length is 62.
['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog', 'I', 'love', 'cats', 'and', 'dogs', 'we', 'all', 'love', 'cats', 'and', 'dogs', 'cats', 'and', 'dogs', 'are', 'great', 'sung', 'likes', 'cats', 'she', 'loves', 'dogs', 'cats', 'can', 'be', 'very', 'independent', 'cats', 'are', 'great', 'companions', 'when', 'they', 'want', 'to', 'be', 'cats', 'are', 'playful', 'cats', 'are', 'natural', 'hunters', "It's", 'raining', 'cats', 'and', 'dogs', 'dogs', 'and', 'cats', 'love', 'sung']

In [25]:
count = collections.Counter(words).most_common() 
print ("'count' is %s and length is %d." % (type(count), len(count)))
print (("Word count of top five is %s") % (count[:5]))
print (count)


'count' is <class 'list'> and length is 35.
Word count of top five is [('cats', 10), ('dogs', 6), ('and', 5), ('are', 4), ('love', 3)]
[('cats', 10), ('dogs', 6), ('and', 5), ('are', 4), ('love', 3), ('the', 2), ('be', 2), ('great', 2), ('sung', 2), ('jumped', 1), ('independent', 1), ('dog', 1), ('when', 1), ('natural', 1), ('very', 1), ('playful', 1), ('quick', 1), ('lazy', 1), ('to', 1), ('companions', 1), ('hunters', 1), ('likes', 1), ('brown', 1), ('they', 1), ('she', 1), ('want', 1), ('we', 1), ('fox', 1), ('I', 1), ("It's", 1), ('over', 1), ('all', 1), ('can', 1), ('raining', 1), ('loves', 1)]

In [26]:
print (words[0:5])
print (count[0:3])


['the', 'quick', 'brown', 'fox', 'jumped']
[('cats', 10), ('dogs', 6), ('and', 5)]

In [27]:
rdic = [i[0] for i in count] #reverse dic, idx -> word
dic = {w: i for i, w in enumerate(rdic)} #dic, word -> id
voc_size = len(dic) # Number of vocabulary 
print ("'rdic' is %s and length is %d." % (type(rdic), len(rdic)))
print ("'dic' is %s and length is %d." % (type(dic), len(dic)))


'rdic' is <class 'list'> and length is 35.
'dic' is <class 'dict'> and length is 35.

In [28]:
print (rdic)


['cats', 'dogs', 'and', 'are', 'love', 'the', 'be', 'great', 'sung', 'jumped', 'independent', 'dog', 'when', 'natural', 'very', 'playful', 'quick', 'lazy', 'to', 'companions', 'hunters', 'likes', 'brown', 'they', 'she', 'want', 'we', 'fox', 'I', "It's", 'over', 'all', 'can', 'raining', 'loves']

In [29]:
print (dic)
revierse_dic = {v: k for k, v in dic.items()}
print(revierse_dic)


{'the': 5, 'jumped': 9, 'independent': 10, 'great': 7, 'when': 12, 'natural': 13, 'very': 14, 'be': 6, 'playful': 15, 'quick': 16, 'lazy': 17, 'to': 18, 'companions': 19, 'hunters': 20, 'all': 31, 'cats': 0, 'dog': 11, 'likes': 21, 'brown': 22, 'they': 23, 'she': 24, 'want': 25, 'dogs': 1, 'we': 26, 'fox': 27, 'are': 3, 'sung': 8, 'I': 28, "It's": 29, 'over': 30, 'love': 4, 'and': 2, 'can': 32, 'raining': 33, 'loves': 34}
{0: 'cats', 1: 'dogs', 2: 'and', 3: 'are', 4: 'love', 5: 'the', 6: 'be', 7: 'great', 8: 'sung', 9: 'jumped', 10: 'independent', 11: 'dog', 12: 'when', 13: 'natural', 14: 'very', 15: 'playful', 16: 'quick', 17: 'lazy', 18: 'to', 19: 'companions', 20: 'hunters', 21: 'likes', 22: 'brown', 23: 'they', 24: 'she', 25: 'want', 26: 'we', 27: 'fox', 28: 'I', 29: "It's", 30: 'over', 31: 'all', 32: 'can', 33: 'raining', 34: 'loves'}

In [30]:
print (rdic[0])
print (dic['cats'])


cats
0

In [31]:
data = [dic[word] for word in words]
print ("'data' is %s and length is %d." % (type(data), len(data)))
print('Sample data: numbers: %s / words: %s'% (data[:10], [rdic[t] for t in data[:10]]))


'data' is <class 'list'> and length is 62.
Sample data: numbers: [5, 16, 22, 27, 9, 30, 5, 17, 11, 28] / words: ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog', 'I']

CBOW and SKIP Gram

REFERENCE


In [32]:
cbow_pairs = [];
for i in range(1, len(data)-1) :
    cbow_pairs.append([[data[i-1], data[i+1]], data[i]]);
print('Context pairs: %s' % (cbow_pairs[:10]))

data[28]
# the quick brown fox jumped over the lazy dog
revierse_dic[7], revierse_dic[26], revierse_dic[28]


Context pairs: [[[5, 22], 16], [[16, 27], 22], [[22, 9], 27], [[27, 30], 9], [[9, 5], 30], [[30, 17], 5], [[5, 11], 17], [[17, 28], 11], [[11, 4], 28], [[28, 0], 4]]
Out[32]:
('great', 'we', 'I')

In [33]:
# (quick, the), (quick, brown), (brown, quick), (brown, fox), ...
# the quick brown fox jumped over the lazy dog


skip_gram_pairs = [];
for c in cbow_pairs:
    skip_gram_pairs.append([c[1], c[0][0]])
    skip_gram_pairs.append([c[1], c[0][1]])
    
print ("'skip_gram_pairs' is %s and length is %d."
       % (type(skip_gram_pairs), len(skip_gram_pairs)))
print('skip-gram pairs', skip_gram_pairs[:5])

# the quick brown fox jumped over the lazy dog
print(revierse_dic[28], revierse_dic[7])
print(revierse_dic[28], revierse_dic[26])


'skip_gram_pairs' is <class 'list'> and length is 120.
skip-gram pairs [[16, 5], [16, 22], [22, 16], [22, 27], [27, 22]]
I great
I we

In [34]:
def generate_batch(size):
    assert size < len(skip_gram_pairs)
    x_data=[]
    y_data = []
    r = np.random.choice(range(len(skip_gram_pairs)), size, replace=False)
    for i in r:
        x_data.append(skip_gram_pairs[i][0])  # n dim
        y_data.append([skip_gram_pairs[i][1]])  # n, 1 dim
    return x_data, y_data

# generate_batch test
x, y = generate_batch(3)
print ('Batches (x, y)', x,y)

print(revierse_dic[x[0]], revierse_dic[y[0][0]])
print(revierse_dic[x[1]], revierse_dic[y[1][0]])
print(revierse_dic[x[2]], revierse_dic[y[2][0]])


Batches (x, y) [29, 12, 5] [[33], [23], [17]]
It's raining
when they
the lazy

In [37]:
# Input data
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
# need to shape [batch_size, 1] for nn.nce_loss
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
# Ops and variables pinned to the CPU because of missing GPU implementation
with tf.device('/cpu:0'):
    # Look up embeddings for inputs.
    embeddings = tf.Variable(
        tf.random_uniform([voc_size, embedding_size], -1.0, 1.0))
    embed = tf.nn.embedding_lookup(embeddings, train_inputs) # lookup table

# Construct the variables for the NCE loss
nce_weights = tf.Variable(
    tf.random_uniform([voc_size, embedding_size],-1.0, 1.0))
nce_biases = tf.Variable(tf.zeros([voc_size]))

# Compute the average NCE loss for the batch.
# This does the magic:
#   tf.nn.nce_loss(weights, biases, inputs, labels, num_sampled, num_classes ...)
# It automatically draws negative samples when we evaluate the loss.
loss = tf.reduce_mean(
  tf.nn.nce_loss(nce_weights, nce_biases, embed, train_labels,
                 num_sampled, voc_size))

# Use the adam optimizer
train_op = tf.train.AdamOptimizer(1e-1).minimize(loss)

In [38]:
# Launch the graph in a session
with tf.Session() as sess:
    # Initializing all variables
    tf.initialize_all_variables().run()

    for step in range(100):
        batch_inputs, batch_labels = generate_batch(batch_size)
        _, loss_val = sess.run([train_op, loss],
                feed_dict={train_inputs: batch_inputs, train_labels: batch_labels})
        if step % 10 == 0:
          print("Loss at ", step, loss_val) # Report the loss

    # Final embeddings are ready for you to use. Need to normalize for practical use
    trained_embeddings = embeddings.eval()

# Show word2vec if dim is 2
if trained_embeddings.shape[1] == 2:
    labels = rdic[:10] # Show top 10 words
    for i, label in enumerate(labels):
        x, y = trained_embeddings[i,:]
        plt.scatter(x, y)
        plt.annotate(label, xy=(x, y), xytext=(5, 2),
            textcoords='offset points', ha='right', va='bottom')
    plt.savefig("word2vec.png")


WARNING:tensorflow:From <ipython-input-38-c84d511d3853>:5 in <module>.: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.global_variables_initializer` instead.
---------------------------------------------------------------------------
InternalError                             Traceback (most recent call last)
/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
   1020     try:
-> 1021       return fn(*args)
   1022     except errors.OpError as e:

/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/tensorflow/python/client/session.py in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
   1002                                  feed_dict, fetch_list, target_list,
-> 1003                                  status, run_metadata)
   1004 

/home/teamlab/miniconda3/envs/nrf/lib/python3.5/contextlib.py in __exit__(self, type, value, traceback)
     65             try:
---> 66                 next(self.gen)
     67             except StopIteration:

/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/tensorflow/python/framework/errors_impl.py in raise_exception_on_not_ok_status()
    468           compat.as_text(pywrap_tensorflow.TF_Message(status)),
--> 469           pywrap_tensorflow.TF_GetCode(status))
    470   finally:

InternalError: Blas SGEMM launch failed : a.shape=(20, 2), b.shape=(15, 2), m=20, n=15, k=2
	 [[Node: nce_loss_3/MatMul_1 = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=true, _device="/job:localhost/replica:0/task:0/gpu:0"](embedding_lookup_3/_41, nce_loss_3/Slice_2)]]
	 [[Node: Adam_3/update_Variable_10/group_deps/_50 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_334_Adam_3/update_Variable_10/group_deps", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]

During handling of the above exception, another exception occurred:

InternalError                             Traceback (most recent call last)
<ipython-input-38-c84d511d3853> in <module>()
      8         batch_inputs, batch_labels = generate_batch(batch_size)
      9         _, loss_val = sess.run([train_op, loss],
---> 10                 feed_dict={train_inputs: batch_inputs, train_labels: batch_labels})
     11         if step % 10 == 0:
     12           print("Loss at ", step, loss_val) # Report the loss

/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
    764     try:
    765       result = self._run(None, fetches, feed_dict, options_ptr,
--> 766                          run_metadata_ptr)
    767       if run_metadata:
    768         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
    962     if final_fetches or final_targets:
    963       results = self._do_run(handle, final_targets, final_fetches,
--> 964                              feed_dict_string, options, run_metadata)
    965     else:
    966       results = []

/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
   1012     if handle is None:
   1013       return self._do_call(_run_fn, self._session, feed_dict, fetch_list,
-> 1014                            target_list, options, run_metadata)
   1015     else:
   1016       return self._do_call(_prun_fn, self._session, handle, feed_dict,

/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
   1032         except KeyError:
   1033           pass
-> 1034       raise type(e)(node_def, op, message)
   1035 
   1036   def _extend_graph(self):

InternalError: Blas SGEMM launch failed : a.shape=(20, 2), b.shape=(15, 2), m=20, n=15, k=2
	 [[Node: nce_loss_3/MatMul_1 = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=true, _device="/job:localhost/replica:0/task:0/gpu:0"](embedding_lookup_3/_41, nce_loss_3/Slice_2)]]
	 [[Node: Adam_3/update_Variable_10/group_deps/_50 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_334_Adam_3/update_Variable_10/group_deps", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]

Caused by op 'nce_loss_3/MatMul_1', defined at:
  File "/home/teamlab/miniconda3/envs/nrf/lib/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/teamlab/miniconda3/envs/nrf/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 474, in start
    ioloop.IOLoop.instance().start()
  File "/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/zmq/eventloop/ioloop.py", line 162, in start
    super(ZMQIOLoop, self).start()
  File "/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/tornado/ioloop.py", line 887, in start
    handler_func(fd_obj, events)
  File "/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 390, in execute_request
    user_expressions, allow_stdin)
  File "/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 501, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-37-819b0f25fd47>", line 23, in <module>
    num_sampled, voc_size))
  File "/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/tensorflow/python/ops/nn.py", line 1336, in nce_loss
    name=name)
  File "/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/tensorflow/python/ops/nn.py", line 1219, in _compute_sampled_logits
    inputs, sampled_w, transpose_b=True) + sampled_b
  File "/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/tensorflow/python/ops/math_ops.py", line 1729, in matmul
    a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
  File "/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/tensorflow/python/ops/gen_math_ops.py", line 1442, in _mat_mul
    transpose_b=transpose_b, name=name)
  File "/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 759, in apply_op
    op_def=op_def)
  File "/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 2240, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/home/teamlab/miniconda3/envs/nrf/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1128, in __init__
    self._traceback = _extract_stack()

InternalError (see above for traceback): Blas SGEMM launch failed : a.shape=(20, 2), b.shape=(15, 2), m=20, n=15, k=2
	 [[Node: nce_loss_3/MatMul_1 = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=true, _device="/job:localhost/replica:0/task:0/gpu:0"](embedding_lookup_3/_41, nce_loss_3/Slice_2)]]
	 [[Node: Adam_3/update_Variable_10/group_deps/_50 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_334_Adam_3/update_Variable_10/group_deps", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]