In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import nltk
import csv
import itertools
import time

In [2]:
reddit_file = '/home/huizhu/git_test/rnn-tutorial-rnnlm/data/reddit-comments-2015-08.csv'
df_reddit = pd.read_csv(reddit_file)
df_reddit.head()


Out[2]:
body
0 I joined a new league this year and they have ...
1 In your scenario, a person could just not run ...
2 They don't get paid for how much time you spen...
3 I dunno, back before the August update in an A...
4 No, but Toriyama sometimes would draw himself ...

In [3]:
vocabulary_size = 8000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"
 
# Read the data and append SENTENCE_START and SENTENCE_END tokens
print "Reading CSV file..."
with open(reddit_file, 'rb') as f:
    reader = csv.reader(f, skipinitialspace=True)
    reader.next()
    # Split full comments into sentences
    sentences = itertools.chain(*[nltk.sent_tokenize(x[0].decode('utf-8').lower()) for x in reader])
    # Append SENTENCE_START and SENTENCE_END
    sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
print "Parsed %d sentences." % (len(sentences))
     
# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]
 
# Count the word frequencies
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
print "Found %d unique words tokens." % len(word_freq.items())
 
# Get the most common words and build index_to_word and word_to_index vectors
vocab = word_freq.most_common(vocabulary_size-1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])
 
print "Using vocabulary size %d." % vocabulary_size
print "The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1])
 
# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]
 
print "\nExample sentence: '%s'" % sentences[0]
print "\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0]
 
# Create the training data
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])


Reading CSV file...
Parsed 79170 sentences.
Found 65751 unique word tokens.
Using vocabulary size 8000.
The least frequent word in our vocabulary is 'devoted' and appeared 10 times.

Example sentence: 'SENTENCE_START i joined a new league this year and they have different scoring rules than i'm used to. SENTENCE_END'

Example sentence after Pre-processing: '[u'SENTENCE_START', u'i', u'joined', u'a', u'new', u'league', u'this', u'year', u'and', u'they', u'have', u'different', u'scoring', u'rules', u'than', u'i', u"'m", u'used', u'to', u'.', u'SENTENCE_END']'

In [4]:
X_train.shape, y_train.shape


Out[4]:
((79170,), (79170,))

In [5]:
print(X_train[0])
print(y_train[0])


[0, 6, 3513, 7, 155, 794, 25, 223, 8, 32, 20, 202, 5025, 350, 91, 6, 66, 207, 5, 2]
[6, 3513, 7, 155, 794, 25, 223, 8, 32, 20, 202, 5025, 350, 91, 6, 66, 207, 5, 2, 1]

From the above, we see the first sentence is

x = [0, 6, 3513, 7, 155, 794, 25, 223, 8, 32, 20, 202, 5025, 350, 91, 6, 66, 207, 5, 2]

where each number is the index of one word; we will turn these indices into vectors (i.e. $x_t$) later.

The target sequence

y = [6, 3513, 7, 155, 794, 25, 223, 8, 32, 20, 202, 5025, 350, 91, 6, 66, 207, 5, 2, 1]

is simply x shifted left by one position; that is, our model is trained to predict the next word conditioned on the previous words.

The formulas are:

$ \begin{aligned} s_t &= \tanh(Ux_t + Ws_{t-1}) \\ o_t &= \mathrm{softmax}(Vs_t) \end{aligned} $

Because each $x_t$ is a one-hot vector, $Ux_t$ simply selects column $x_t$ of $U$; the NumPy implementation below exploits this by indexing `self.U[:, x[t]]` rather than doing a full matrix-vector product.
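
To make this concrete, a tiny check (not from the original notebook; the sizes and the index are arbitrary):

word_dim, hidden_dim = 5, 3
U = np.random.uniform(-1, 1, size=(hidden_dim, word_dim))
idx = 2                           # a word index
x_onehot = np.eye(word_dim)[idx]  # the corresponding one-hot vector
assert np.allclose(np.dot(U, x_onehot), U[:, idx])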


In [6]:
def softmax2D(x):
    assert len(x.shape) == 2, 'expected a 2D array, got shape {0}.'.format(x.shape)
    x = np.asarray(x, dtype='float32')
    dim = x.shape[1]
    # Build all pairwise differences x[i, j] - x[i, k], so that
    # softmax(x)[i, k] = 1 / sum_j exp(x[i, j] - x[i, k]).
    x_tile = x[:, :, np.newaxis] # expand the last dim
    x_tile = np.tile(x_tile, (1, 1, dim)) # repeat the last dim
    
    x_sub = x[:, np.newaxis, :] # expand the second dim
    x_out = x_tile - x_sub
    
    # Note the sign: summing exp(-x_out) here would compute softmax(-x) instead.
    g = 1. / np.sum(np.exp(x_out), axis=1)
    return g

def softmax(x):
    assert isinstance(x, np.ndarray), 'expected an ndarray, got type {0}.'.format(type(x))
    
    if len(x.shape) == 1:
        # Treat a 1D vector as a batch of one row.
        x_new = np.reshape(x, (1, -1))
        g = softmax2D(x_new)
        g = g.ravel()
    elif len(x.shape) == 2:
        g = softmax2D(x)
    else:
        raise ValueError('Input array has shape {0}, which is not supported.'.format(x.shape))
    return g

class RNNNumpy(object):
    def __init__(self, word_dim, hidden_dim, bptt_truncate=4):
        t1 = time.time()
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        
        # Initialize each parameter matrix uniformly in [-1/sqrt(n), 1/sqrt(n)],
        # where n is the fan-in of that matrix.
        s_1 = np.sqrt(1./word_dim)
        s_2 = np.sqrt(1./hidden_dim)
        self.U = np.random.uniform(-s_1, s_1, size=(hidden_dim, word_dim))
        self.W = np.random.uniform(-s_2, s_2, size=(hidden_dim, hidden_dim))
        self.V = np.random.uniform(-s_2, s_2, size=(word_dim, hidden_dim))
        t2 = time.time()
        print('Initial time: {0}'.format(t2 - t1))
    
    def forward_prop(self, x):
        t1 = time.time()
        T = len(x)
        # s holds T + 1 rows: s[-1] stays all zeros and serves as the initial
        # hidden state, since s[t-1] wraps around to the last row when t == 0.
        s = np.zeros((T + 1, self.hidden_dim), dtype='float32')
        o = np.zeros((T, self.word_dim), dtype='float32')
        for t in range(T):
            # x[t] is a word index, so U[:, x[t]] equals U times the one-hot x_t.
            s[t] = np.tanh(self.U[:, x[t]] + np.dot(self.W, s[t-1]))
            o[t] = softmax(np.dot(self.V, s[t]))
        t2 = time.time()
        print('forward time: {0}'.format(t2 - t1))
        return o, s
    
    def predict(self, x):
        # Greedy prediction: take the highest-probability word at each step.
        o, s = self.forward_prop(x)
        y_pred = np.argmax(o, axis=1)
        return y_pred
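
As a quick sanity check (added here, not part of the original run), softmax2D should agree with the standard max-subtraction formulation:

z = np.random.randn(4, 10).astype('float32')
ref = np.exp(z - z.max(axis=1, keepdims=True))
ref = ref / ref.sum(axis=1, keepdims=True)
assert np.allclose(softmax(z), ref, atol=1e-5)

Note that softmax2D materializes a (batch, dim, dim) intermediate, which is why the forward pass below takes tens of seconds at dim = 8000; the max-subtraction form above avoids that cost.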

In [7]:
rnn = RNNNumpy(vocabulary_size, 100)
print(X_train[0])
o_sent, s_sent = rnn.forward_prop(X_train[0])
#print(o_sent.shape, s_sent.shape)


Initial time: 0.0908930301666
[0, 6, 3513, 7, 155, 794, 25, 223, 8, 32, 20, 202, 5025, 350, 91, 6, 66, 207, 5, 2]
forward time: 34.0505671501

In [13]:
print(o_sent.shape, s_sent.shape)


((20, 8000), (21, 100))
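
Each row of o_sent is a probability distribution over the 8000-word vocabulary, so all 20 rows should sum to 1; s_sent has one extra row (21 vs. 20) because the all-zero initial state is kept at s[-1]. A quick check (not part of the original run):

print(np.allclose(o_sent.sum(axis=1), 1.0, atol=1e-4))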

In [9]:
print([len(x) for x in X_train[:10]])
print([len(y) for y in y_train[:10]])


[20, 10, 28, 18, 24, 15, 27, 9, 25, 16]
[20, 10, 28, 18, 24, 15, 27, 9, 25, 16]

In [14]:
max_sent_len = 30
feat_dim = 100
dtype_INT = tf.int32
dtype_FLOAT = tf.float32
input_tensor = tf.placeholder(shape=(None, max_sent_len), dtype=dtype_INT)
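
The placeholder fixes the second dimension at max_sent_len = 30, while the sentences above have variable length, so each index list must be padded or truncated before being fed. A minimal sketch, where pad_sentences is a hypothetical helper; 0 is used as the pad value purely for illustration (in this vocabulary 0 is a real word index, so a dedicated padding token would be cleaner):

def pad_sentences(sents, max_len):
    # Hypothetical helper: left-align each sentence, zero-pad the remainder.
    batch = np.zeros((len(sents), max_len), dtype='int32')
    for i, sent in enumerate(sents):
        trunc = sent[:max_len]  # truncate sentences longer than max_len
        batch[i, :len(trunc)] = trunc
    return batch

X_batch = pad_sentences(X_train[:4], max_sent_len)  # shape (4, 30)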

In [19]:
class RNNTF(object):
    # Placeholder for the TensorFlow version of the model (to be filled in).
    def __init__(self, embedded_words):
        pass

    
def embedding(input_tensor, vocab_size, word_dim):
    """
    Parameters
    ----------
    input_tensor: 2D tensor with shape=(batch_size, max_sentence_len), indices of words
    vocab_size: integer, total number of unique words; max(input_tensor) == vocab_size - 1
    word_dim: integer, length of each word vector
    
    Returns
    -------
    embedded_words: 3D tensor with shape=(batch_size, max_sentence_len, word_dim)
    """
    shape = (vocab_size, word_dim)
    scale = np.sqrt(1./word_dim)
    initializer = tf.random_uniform_initializer(-scale, scale, dtype=dtype_FLOAT)
    
    with tf.variable_scope('embedding'):
        params = tf.get_variable('params', shape, initializer=initializer)
    embedded_words = tf.gather(params, input_tensor)
    
    return embedded_words
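
Since params is gathered along its first axis, the final line of embedding is equivalent to tf.nn.embedding_lookup, the more conventional API for embedding tables:

embedded_words = tf.nn.embedding_lookup(params, input_tensor)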

In [20]:
embedded_words = embedding(input_tensor, vocabulary_size, feat_dim)
embedded_words


Out[20]:
<tf.Tensor 'Gather_1:0' shape=(?, 30, 100) dtype=float32>

In [21]:
unstack_sent = tf.unstack(embedded_words)
unstack_sent


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-21-c29593b888e9> in <module>()
----> 1 unstack_sent = tf.unstack(embedded_words)
      2 unstack_sent

/home/huizhu/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/array_ops.pyc in unstack(value, num, axis, name)
    976       num = value_shape[axis].value
    977   if num is None:
--> 978     raise ValueError("Cannot infer num from shape %s" % value_shape)
    979   return gen_array_ops._unpack(value, num=num, axis=axis, name=name)
    980 

ValueError: Cannot infer num from shape (?, 30, 100)
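
The failure is expected: tf.unstack defaults to axis=0, the batch axis, and its size here is None, so the number of output tensors cannot be inferred from the static shape. A sketch of two fixes (not from the original run): unstack along the time axis, whose size of 30 is static, or pass num explicitly.

unstack_sent = tf.unstack(embedded_words, axis=1)   # list of 30 tensors, each of shape (?, 100)
# or: tf.unstack(embedded_words, num=max_sent_len, axis=1)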

In [ ]: