In [1]:
import tensorflow as tf
import numpy as np
np.set_printoptions(precision=4, edgeitems=10, suppress=True)
import random, pickle, time
from itertools import chain
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
sns.set_context('poster')
In [6]:
with open('imdb.pkl', 'rb') as f:
    save = pickle.load(f)
    train = save['train']
    test = save['test']
    del save
with open('imdb.dict.pkl', 'rb') as f:
    # word -> id mapping (the id is the word's frequency rank)
    dictionary = pickle.load(f, encoding='utf8')
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
with open('two_dico_embeddings.pickle', 'rb') as f:
    # id -> embedding and word -> embedding dictionaries
    save = pickle.load(f)
    dico_embedding_id = save['dico_embedding_id']
    dico_embedding_word = save['dico_embedding_word']
    del save
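A quick sanity check on the loaded data: decoding one training review back to words with `reverse_dictionary` (a minimal sketch; it assumes that any id missing from the mapping is an out-of-vocabulary token).
In [ ]:
# Sketch: map a sequence of ids back to words; ids absent from
# reverse_dictionary are shown as '<unk>' (an assumption, not a dataset guarantee).
def decode_review(seq):
    return ' '.join(reverse_dictionary.get(i, '<unk>') for i in seq)

print(decode_review(train[0][0])[:200])
print('label :', train[1][0])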
In [7]:
MAXLEN = 500             # we only keep reviews up to this length
VOCAB_SIZE = 10000       # number of distinct words kept
VALID_BATCH_SIZE = 500   # number of sentences in the validation set
SIZE_EMBEDDING = 50      # size of the input word embeddings
OUTPUT_SIZE = 1          # size of the prediction (0 or 1 in our case)
# ======================================================
RNN_HIDDEN = 200         # size of the LSTM hidden layer
LEARNING_RATE = 0.1      # learning rate
TINY = 1e-7              # avoids log(0) when computing the cross-entropy
NUM_EPOCHS = 100
ITERATIONS_PER_EPOCH = 20
BATCH_SIZE = 15
In [12]:
lengths = [len(seq) for seq in train[0]] + [len(seq) for seq in test[0]]
lengths_800 = ([len(seq) for seq in train[0] if len(seq) < 800] +
               [len(seq) for seq in test[0] if len(seq) < 800])
with plt.rc_context({'figure.figsize': (10, 8)}):
    sns.distplot(lengths_800, kde=False, bins=100)
Out[12]: [histogram of review lengths below 800 words]
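To put a number on what the histogram shows, a small check of how much data a given length cutoff would keep (a sketch reusing the `lengths` list computed above):
In [ ]:
# Sketch: fraction of reviews kept for a few candidate length cutoffs.
for cutoff in (200, 500, 800):
    kept = np.mean(np.array(lengths) <= cutoff)
    print('maxlen = %4d keeps %.1f%% of the reviews' % (cutoff, kept * 100))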
In [15]:
def reduced_set(dataset, maxlen=MAXLEN):
    """Keep only the sequences (and their labels) whose length is <= maxlen."""
    new_seqs, new_labels = [], []
    for seq, label in zip(dataset[0], dataset[1]):
        if len(seq) <= maxlen:
            new_seqs.append(seq)
            new_labels.append(label)
    print('new dataset length :', len(new_labels))
    print('We lost %.2f%% of the data' %
          ((len(dataset[1]) - len(new_labels)) * 100 / len(dataset[1])))
    return (new_seqs, new_labels)

train_reduced = reduced_set(train, maxlen=MAXLEN)
test_reduced = reduced_set(test, maxlen=MAXLEN)
In [16]:
def generalized_set(dataset, vocab_size=VOCAB_SIZE):
    """Replace every word id above vocab_size by 1, so all rare words share one id."""
    return ([[i if i <= vocab_size else 1 for i in seq] for seq in dataset[0]],
            dataset[1])

train_reduced2 = generalized_set(train_reduced)
test_reduced2 = generalized_set(test_reduced)
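A tiny illustration of what `generalized_set` does: every id above `VOCAB_SIZE` collapses to 1, the replacement id used in the cell above (the input here is a made-up toy sequence, not real data):
In [ ]:
# Toy example: ids above VOCAB_SIZE (10000) are mapped to 1.
toy = ([[3, 42, 10500, 7, 99999]], [1])
print(generalized_set(toy, vocab_size=VOCAB_SIZE))  # -> ([[3, 42, 1, 7, 1]], [1])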
In [17]:
def generate_batch(train, batch_size=BATCH_SIZE):
    """Sample a batch and return time-major arrays: x (embeddings), mask and y (labels)."""
    seqs, labels = train
    x = np.zeros((MAXLEN, batch_size, SIZE_EMBEDDING), dtype=np.float32)
    xmask = np.zeros((MAXLEN, batch_size, 1), dtype=np.float32)
    y = np.zeros((1, batch_size, 1), dtype=np.float32)
    idx_batch = random.sample(range(len(labels)), batch_size)
    seqs_batch = [seqs[i] for i in idx_batch]
    labels_batch = [labels[i] for i in idx_batch]
    for j, seq in enumerate(seqs_batch):
        for i in range(len(seq)):
            x[i, j, :] = dico_embedding_id[seq[i]]
        xmask[:len(seq), j, 0] = 1
        y[0, j, 0] = labels_batch[j]
    return x, xmask, y
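A quick shape check of one batch, to make the time-major layout expected by the graph below explicit (a sketch; it only prints array shapes):
In [ ]:
# Sketch: inspect the shapes produced by generate_batch (time, batch, features).
xb, mb, yb = generate_batch(train_reduced2, batch_size=BATCH_SIZE)
print('x    :', xb.shape)   # (MAXLEN, BATCH_SIZE, SIZE_EMBEDDING)
print('mask :', mb.shape)   # (MAXLEN, BATCH_SIZE, 1)
print('y    :', yb.shape)   # (1, BATCH_SIZE, 1)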
In [19]:
#####################################################################
############# Graph Definition ##############
#####################################################################
with tf.Graph().as_default() as graph:
    # Definition of the inputs and outputs
    inputs = tf.placeholder(tf.float32, (None, None, SIZE_EMBEDDING))  # time, batch, embedding
    masks = tf.placeholder(tf.float32, (None, None, 1))
    labels = tf.placeholder(tf.float32, (None, None, OUTPUT_SIZE))
    # Definition of the cell;
    # dropout could easily be added with tf.contrib.rnn.DropoutWrapper
    cell = tf.contrib.rnn.BasicLSTMCell(RNN_HIDDEN)
    # Definition of the initial state
    batch_size = tf.shape(inputs)[1]
    initial_state = cell.zero_state(batch_size, tf.float32)
    # Computation of the outputs and states
    rnn_outputs, rnn_states = tf.nn.dynamic_rnn(cell, inputs,
                                                initial_state=initial_state,
                                                time_major=True)
    # Apply the masks to zero out the outputs of the padding steps
    rnn_outputs_masked = tf.multiply(rnn_outputs, masks)
    # Averaging over all time steps (better than taking only the last step)
    final_outputs = tf.reduce_mean(rnn_outputs_masked, axis=0, keep_dims=True)
    # Projection of the outputs; the sigmoid turns them into probabilities
    final_projection = lambda x: tf.contrib.layers.linear(x, num_outputs=OUTPUT_SIZE,
                                                          activation_fn=tf.nn.sigmoid)
    # Application of the final projection to the averaged outputs
    logits = tf.map_fn(final_projection, final_outputs)
    # Binary cross-entropy loss (logits already lies in (0, 1) thanks to the sigmoid)
    loss = -(labels * tf.log(logits + TINY) + (1.0 - labels) * tf.log(1.0 - logits + TINY))
    loss = tf.reduce_mean(loss)
    # train_optimizer
    train_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE).minimize(loss)
    # For validation: a prediction is correct when it falls on the right side of 0.5
    accuracy = tf.reduce_mean(tf.cast(tf.abs(logits - labels) < 0.5, tf.float32))
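One detail worth flagging: `tf.reduce_mean` above divides by the full padded length (MAXLEN time steps), so the zeroed padding positions dilute the average. A length-aware variant (a sketch, not what the graph above uses) would replace the `final_outputs` line with:
    # Length-aware average: divide the summed masked outputs by the number of
    # real (unmasked) steps per sequence, so padding does not dilute the mean.
    final_outputs = (tf.reduce_sum(rnn_outputs_masked, axis=0, keep_dims=True) /
                     (tf.reduce_sum(masks, axis=0, keep_dims=True) + TINY))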
In [21]:
def main(**config_dict):
    set_config(config_dict=config_dict)
    ###########################################################################
    ########                     Training Loop                        ########
    ###########################################################################
    valid_x, valid_mask, valid_y = generate_batch(test_reduced2, batch_size=VALID_BATCH_SIZE)
    with tf.Session(graph=graph) as session:
        session.run(tf.global_variables_initializer())
        for i in range(NUM_EPOCHS):
            epoch_loss = 0
            for j in range(ITERATIONS_PER_EPOCH):
                x, m, y = generate_batch(train_reduced2, batch_size=BATCH_SIZE)
                _loss, _, train_accuracy = session.run([loss, train_optimizer, accuracy],
                                                       feed_dict={inputs: x,
                                                                  masks: m,
                                                                  labels: y})
                epoch_loss += _loss
            valid_accuracy = session.run(accuracy,
                                         feed_dict={inputs: valid_x,
                                                    masks: valid_mask,
                                                    labels: valid_y})
            print('Epoch : %d, Loss = %.8f' % (i, epoch_loss / ITERATIONS_PER_EPOCH))
            print('Validation Accuracy = %.1f\n' % (valid_accuracy * 100.))
        # final test
        test_x, test_mask, test_y = generate_batch(test_reduced2,
                                                   batch_size=min(len(test_reduced2[0]),
                                                                  5000))
        test_accuracy = session.run(accuracy,
                                    feed_dict={inputs: test_x,
                                               masks: test_mask,
                                               labels: test_y})
        print('Test Accuracy = %.1f' % (test_accuracy * 100.))
In [ ]:
main()