In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import nltk
import csv
import itertools
import time
In [2]:
reddit_file = '/home/huizhu/git_test/rnn-tutorial-rnnlm/data/reddit-comments-2015-08.csv'
df_reddit = pd.read_csv(reddit_file)
df_reddit.head()
Out[2]:
In [3]:
vocabulary_size = 8000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"
# Read the data and append SENTENCE_START and SENTENCE_END tokens
print "Reading CSV file..."
with open(reddit_file, 'rb') as f:
reader = csv.reader(f, skipinitialspace=True)
reader.next()
# Split full comments into sentences
sentences = itertools.chain(*[nltk.sent_tokenize(x[0].decode('utf-8').lower()) for x in reader])
# Append SENTENCE_START and SENTENCE_END
sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
print "Parsed %d sentences." % (len(sentences))
# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]
# Count the word frequencies
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
print "Found %d unique words tokens." % len(word_freq.items())
# Get the most common words and build index_to_word and word_to_index vectors
vocab = word_freq.most_common(vocabulary_size-1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])
print "Using vocabulary size %d." % vocabulary_size
print "The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1])
# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]
print "\nExample sentence: '%s'" % sentences[0]
print "\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0]
# Create the training data
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])
In [4]:
X_train.shape, y_train.shape
Out[4]:
In [5]:
print(X_train[0])
print(y_train[0])
From the above, we see the first sentence is
x = [0, 6, 3513, 7, 155, 794, 25, 223, 8, 32, 20, 202, 5025, 350, 91, 6, 66, 207, 5, 2]
where each number represents one word; we will vectorize these indices later (i.e. $x_t$).
And
y = [6, 3513, 7, 155, 794, 25, 223, 8, 32, 20, 202, 5025, 350, 91, 6, 66, 207, 5, 2, 1]
which is the same sequence shifted by one position; that is, the model learns to predict the next word conditioned on the previous words.
The formulas are:
$ \begin{aligned} s_t &= \tanh(Ux_t + Ws_{t-1}) \\ o_t &= \mathrm{softmax}(Vs_t) \end{aligned} $
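To make the dimensions concrete, here is a minimal numpy sketch of a single recurrence step (toy sizes of my own choosing; the RNNNumpy class below does the same thing over a whole sentence). Because $x_t$ is a one-hot vector, the product $Ux_t$ reduces to selecting column $x_t$ of $U$.

word_dim, hidden_dim = 8, 3                                # toy sizes; the real model uses 8000 and 100
U = np.random.uniform(-0.1, 0.1, (hidden_dim, word_dim))   # input -> hidden
W = np.random.uniform(-0.1, 0.1, (hidden_dim, hidden_dim)) # hidden -> hidden
V = np.random.uniform(-0.1, 0.1, (word_dim, hidden_dim))   # hidden -> output
s_prev = np.zeros(hidden_dim)                              # s_{t-1}; all zeros before the first word
x_t = 5                                                    # word index, so U.dot(one_hot(x_t)) == U[:, x_t]
s_t = np.tanh(U[:, x_t] + W.dot(s_prev))                   # hidden state update
o_t = np.exp(V.dot(s_t)) / np.sum(np.exp(V.dot(s_t)))      # softmax over the vocabulary
print(o_t.sum())                                           # 1.0: a distribution over the next word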
In [6]:
def softmax2D(x):
    assert len(x.shape) == 2, 'expected a 2D array, got shape {0}'.format(x.shape)
x = np.asarray(x, dtype='float32')
dim = x.shape[1]
x_tile = x[:, :, np.newaxis] # expand the last dim
x_tile = np.tile(x_tile, (1, 1, dim)) # repeat the last dim
x_sub = x[:, np.newaxis, :] # expand the second dim
x_out = x_tile - x_sub
    # softmax(x)[n, i] = 1 / sum_j exp(x[n, j] - x[n, i]), so sum over the last axis (j)
    g = 1. / np.sum(np.exp(-x_out), axis=2)
return g
def softmax(x):
    assert isinstance(x, np.ndarray), 'expected np.ndarray, got {0}'.format(type(x))
if len(x.shape) == 1:
x_new = np.reshape(x, (1, -1))
g = softmax2D(x_new)
g = g.ravel()
elif len(x.shape) == 2:
g = softmax2D(x)
else:
        raise ValueError('Input array has shape {0}, which is not supported.'.format(x.shape))
return g
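# Quick sanity check (my own addition, not in the original notebook): every row of the
# softmax output should sum to 1, and larger scores should receive larger probabilities.
check = softmax(np.array([[1., 2., 3.], [0., 0., 0.]]))
print(check)                 # roughly [[0.09, 0.24, 0.67], [0.33, 0.33, 0.33]]
print(check.sum(axis=1))     # expect [1., 1.]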
class RNNNumpy(object):
def __init__(self, word_dim, hidden_dim, bptt_truncate=4):
t1 = time.time()
self.word_dim = word_dim
self.hidden_dim = hidden_dim
self.bptt_truncate = bptt_truncate
s_1 = np.sqrt(1./word_dim)
s_2 = np.sqrt(1./hidden_dim)
self.U = np.random.uniform(-s_1, s_1, size=(hidden_dim, word_dim))
self.W = np.random.uniform(-s_2, s_2, size=(hidden_dim, hidden_dim))
self.V = np.random.uniform(-s_2, s_2, size=(word_dim, hidden_dim))
t2 = time.time()
print('Initial time: {0}'.format(t2 - t1))
def forward_prop(self, x):
t1 = time.time()
T = len(x)
s = np.zeros((T + 1, self.hidden_dim), dtype='float32')
o = np.zeros((T, self.word_dim), dtype='float32')
for t in range(T):
s[t] = np.tanh(self.U[:, x[t]] + np.dot(self.W, s[t-1]))
o[t] = softmax(np.dot(self.V, s[t]))
t2 = time.time()
print('forward time: {0}'.format(t2 - t1))
return o, s
def predict(self, x):
o, s = self.forward_prop(x)
y_pred = np.argmax(o, axis=1)
return y_pred
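Before running the model on the real data, a quick check with toy sizes (my own sketch, not in the original notebook) makes the shapes explicit: forward_prop returns one probability distribution over the vocabulary per input word, and the state array has T + 1 rows because the extra row s[-1] stays all zeros and serves as the initial hidden state used at t = 0.

toy_rnn = RNNNumpy(word_dim=10, hidden_dim=4)
toy_o, toy_s = toy_rnn.forward_prop([0, 3, 7, 2])   # a 'sentence' of four word indices
print(toy_o.shape)   # (4, 10): one distribution over the 10-word toy vocabulary per position
print(toy_s.shape)   # (5, 4): T + 1 hidden states; toy_s[-1] is the all-zero initial state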
In [7]:
rnn = RNNNumpy(vocabulary_size, 100)
print(X_train[0])
o_sent, s_sent = rnn.forward_prop(X_train[0])
#print(o_sent.shape, s_sent.shape)
In [13]:
print(o_sent.shape, s_sent.shape)
In [9]:
print([len(x) for x in X_train[:10]])
print([len(y) for y in y_train[:10]])
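The sentences have varying lengths, while input_tensor in the next cell is declared with a fixed second dimension max_sent_len = 30, so the ragged index lists would have to be truncated or padded before being fed in. The notebook does not show that step; the helper below (pad_sequences, a hypothetical name of my own, not part of the original code) is one way to do it. Note that 0 happens to be the SENTENCE_START index here, so a dedicated padding index would be cleaner.

def pad_sequences(seqs, max_len, pad_value=0):
    # Truncate or pad each index list to max_len and stack into a (num_sentences, max_len) int array.
    out = np.full((len(seqs), max_len), pad_value, dtype='int32')
    for i, seq in enumerate(seqs):
        trunc = seq[:max_len]
        out[i, :len(trunc)] = trunc
    return out

# e.g. X_train_padded = pad_sequences(X_train, 30) could then be fed as input_tensor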
In [14]:
max_sent_len = 30
feat_dim = 100
dtype_INT = tf.int32
dtype_FLOAT = tf.float32
input_tensor = tf.placeholder(shape=(None, max_sent_len), dtype=dtype_INT)
In [19]:
class RNNTF(object):
    def __init__(self, embedded_words):
pass
def embedding(input_tensor, vocab_size, word_dim):
"""
inputs
-------
input_tensor: 2D tensor with shape=(batch_size, max_sentence_len), indices of words
    vocab_size: integer, total number of unique words; max(input_tensor) == vocab_size - 1
word_dim: integer, length of vectorized word
return
-------
embedded_words: 3D tensor with shape=(batch_size, max_sentence_len, word_dim)
"""
shape = (vocab_size, word_dim)
scale = np.sqrt(1./word_dim)
initializer = tf.random_uniform_initializer(-scale, scale, dtype=dtype_FLOAT)
with tf.variable_scope('embedding'):
params = tf.get_variable('params', shape, initializer=initializer)
embedded_words = tf.gather(params, input_tensor)
return embedded_words
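tf.gather(params, input_tensor) picks one row of the parameter matrix per word index, which is the same as multiplying a one-hot vector by params; it is the same trick as indexing U[:, x[t]] in the numpy model, just with the matrix stored as (vocab_size, word_dim) instead of (hidden_dim, word_dim). A toy check of the gather step alone (hypothetical values, my own addition):

toy_params = tf.constant([[0., 0.], [1., 1.], [2., 2.]])   # 3 'words', word_dim = 2
toy_ids = tf.constant([[2, 0, 1]])                         # one 'sentence' of 3 word indices
toy_vecs = tf.gather(toy_params, toy_ids)                   # shape (1, 3, 2)
with tf.Session() as sess:
    print(sess.run(toy_vecs))                               # [[[2. 2.] [0. 0.] [1. 1.]]]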
In [20]:
embedded_words = embedding(input_tensor, vocabulary_size, feat_dim)  # the second argument must be the vocabulary size, not max_sent_len
embedded_words
Out[20]:
In [21]:
unstack_sent = tf.unstack(embedded_words, axis=1)  # unstack along the time axis; the default axis=0 is the (unknown) batch dimension
unstack_sent
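With axis=1, unstack_sent is a list of max_sent_len tensors of shape (batch_size, feat_dim), one per sentence position. The notebook stops here, but as a rough sketch of where this is heading (my own continuation, not the original code), these per-step slices can drive a recurrence of the same form as $s_t = \tanh(Ux_t + Ws_{t-1})$, with the embedded word vectors playing the role of $x_t$:

hidden_dim = 100
scale = np.sqrt(1. / feat_dim)
initializer = tf.random_uniform_initializer(-scale, scale, dtype=dtype_FLOAT)
with tf.variable_scope('rnn_step'):
    U_tf = tf.get_variable('U', (feat_dim, hidden_dim), initializer=initializer)
    W_tf = tf.get_variable('W', (hidden_dim, hidden_dim), initializer=initializer)

state = None                             # s_{-1}; treated as zero by skipping the W term at t = 0
states = []
for x_t in unstack_sent:                 # x_t has shape (batch_size, feat_dim)
    pre = tf.matmul(x_t, U_tf)
    if state is not None:
        pre += tf.matmul(state, W_tf)
    state = tf.tanh(pre)
    states.append(state)                 # one hidden state tensor per sentence position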
In [ ]: