In [1]:
import tensorflow as tf, numpy as np
np.set_printoptions(precision=4, edgeitems=10, suppress=True)
import random, pickle, time
from itertools import chain
from collections import defaultdict

import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
sns.set_context('poster')

In [6]:
with open('imdb.pkl', 'rb') as f:
    save = pickle.load(f)
    train = save['train']
    test = save['test']
    del save
    
with open('imdb.dict.pkl', 'rb') as f:
    # word:id mapping (here id is the rank of frequency)
    dictionary = pickle.load(f, encoding='utf8')
reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
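
As a quick sanity check, we can decode a review back into words with the reverse dictionary. A minimal sketch, assuming `train` holds a (sequences, labels) pair as loaded above:


In [ ]:
# Decode the first training review; ids missing from the mapping
# (e.g. reserved indices) fall back to '<unk>'.
decoded = ' '.join(reverse_dictionary.get(i, '<unk>') for i in train[0][0])
print(decoded[:200], '...')
print('label:', train[1][0])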

with open('two_dico_embeddings.pickle', 'rb') as f:
    # id:embedding dictionary and word:embedding dictionary
    save = pickle.load(f)
    dico_embedding_id = save['dico_embedding_id']
    dico_embedding_word = save['dico_embedding_word']
    del save
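
Before using the embeddings, it is worth checking that they have the expected dimension. A hedged sketch, assuming the pickled values are 1-D numpy vectors keyed by word id and by word respectively:


In [ ]:
# The id-keyed vectors should all share one embedding size (50 below).
some_id = next(iter(dico_embedding_id))
print('embedding size:', len(dico_embedding_id[some_id]))
print('ids covered   :', len(dico_embedding_id))
print('words covered :', len(dico_embedding_word))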

Set the hyperparameters


In [7]:
MAXLEN = 500             # we consider sentences up to this length
VOCAB_SIZE = 10000       # number of distinct words considered
VALID_BATCH_SIZE = 500   # number of sentences in the validation set

SIZE_EMBEDDING = 50      # size of the embeddings for the input vectors
OUTPUT_SIZE = 1          # size of the prediction (0 or 1 in our case)

# ======================================================

RNN_HIDDEN = 200         # size of the LSTM hidden layer
LEARNING_RATE = 0.1      # learning rate

TINY = 1e-7              # avoids taking log(0) when computing the cross-entropy

NUM_EPOCHS = 100
ITERATIONS_PER_EPOCH = 20
BATCH_SIZE = 15

In [12]:
lengths = [len(seq) for seq in train[0]] + [len(seq) for seq in test[0]]
lengths_800 = ([len(seq) for seq in train[0] if len(seq) < 800] +
               [len(seq) for seq in test[0] if len(seq) < 800])
with plt.rc_context({'figure.figsize': (10, 8)}):
    sns.distplot(lengths_800, kde=False, bins=100)


Out[12]:
[Histogram of review lengths for reviews shorter than 800 words]

Keep only the sequences up to the desired length


In [15]:
def reduced_set(dataset, maxlen=MAXLEN):
    """Keep only the sequences of length <= maxlen, with their labels."""
    new_seqs, new_labels = [], []
    for seq, label in zip(dataset[0], dataset[1]):
        if len(seq) <= maxlen:
            new_seqs.append(seq)
            new_labels.append(label)
    print('New dataset length:', len(new_labels))
    print('We lost %.2f%% of the data' %
          ((len(dataset[1]) - len(new_labels))*100/len(dataset[1])))
    return (new_seqs, new_labels)

train_reduced = reduced_set(train, maxlen=MAXLEN)
test_reduced = reduced_set(test, maxlen=MAXLEN)


New dataset length: 22210
We lost 11.16% of the data
New dataset length: 22391
We lost 10.44% of the data

Map rare words to the unknown token


In [16]:
def generalized_set(dataset, vocab_size=VOCAB_SIZE):
    # Replace every word id above vocab_size with 1, the unknown-word marker.
    return ([[i if i <= vocab_size else 1 for i in seq] for seq in dataset[0]],
            dataset[1])
train_reduced2 = generalized_set(train_reduced)
test_reduced2 = generalized_set(test_reduced)
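
To make the mapping concrete, a toy example (any id above VOCAB_SIZE is replaced by 1, the unknown-word marker used above):


In [ ]:
# With VOCAB_SIZE = 10000, the id 10001 becomes 1; smaller ids are kept.
toy = ([[3, 9999, 10001, 42]], [0])
print(generalized_set(toy))   # -> ([[3, 9999, 1, 42]], [0])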

Routines to generate the batches


In [17]:
def generate_batch(train, batch_size=BATCH_SIZE):
    """Sample a random batch and return time-major arrays (time, batch, feature)."""
    seqs, labels = train
    x = np.zeros((MAXLEN, batch_size, SIZE_EMBEDDING), dtype=np.float32)
    xmask = np.zeros((MAXLEN, batch_size, 1), dtype=np.float32)
    y = np.zeros((1, batch_size, 1), dtype=np.float32)

    idx_batch = random.sample(range(len(labels)), batch_size)
    seqs_batch = [seqs[i] for i in idx_batch]
    labels_batch = [labels[i] for i in idx_batch]

    for j, seq in enumerate(seqs_batch):
        for i in range(len(seq)):
            x[i, j, :] = dico_embedding_id[seq[i]]   # look up the word embedding
        xmask[:len(seq), j, 0] = 1                   # 1 on real steps, 0 on padding
        y[0, j, 0] = labels_batch[j]

    return x, xmask, y
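
A quick shape check on one batch: everything is laid out time-major, i.e. (time, batch, feature), which is what dynamic_rnn with time_major=True expects below.


In [ ]:
# Sanity check of the batch shapes produced above.
x, m, y = generate_batch(train_reduced2, batch_size=4)
print('x   :', x.shape)   # (MAXLEN, 4, SIZE_EMBEDDING)
print('mask:', m.shape)   # (MAXLEN, 4, 1)
print('y   :', y.shape)   # (1, 4, 1)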

Graph building and training


In [19]:
#####################################################################
#############             Graph Definition             ##############
#####################################################################


with tf.Graph().as_default() as graph:
    
    # Definition of the inputs and outputs
    inputs = tf.placeholder(tf.float32, (None, None, SIZE_EMBEDDING)) # time, batch, embedding
    masks = tf.placeholder(tf.float32, (None, None, 1))
    labels = tf.placeholder(tf.float32, (None, None, OUTPUT_SIZE))

    # Definition of the cell;
    # dropout could easily be added with tf.contrib.rnn.DropoutWrapper
    cell = tf.contrib.rnn.BasicLSTMCell(RNN_HIDDEN)
    
    # Definition of the initial state
    batch_size = tf.shape(inputs)[1]
    initial_state = cell.zero_state(batch_size, tf.float32)

    # Computation of the outputs and states
    rnn_outputs, rnn_states = tf.nn.dynamic_rnn(cell, inputs, 
                                                initial_state=initial_state, 
                                                time_major=True)
    ## Apply the mask: zero out the outputs on padded time steps
    rnn_outputs_masked = tf.multiply(rnn_outputs, masks)

    ## Average over all real time steps (better than taking only the last one).
    ## Divide by each sequence's true length from the mask, not by MAXLEN,
    ## so the padding does not dilute the average.
    final_outputs = (tf.reduce_sum(rnn_outputs_masked, axis=0, keep_dims=True) /
                     tf.reduce_sum(masks, axis=0, keep_dims=True))
        

    # Projection of the averaged outputs to a probability in [0, 1]
    final_projection = lambda x: tf.contrib.layers.linear(x, num_outputs=OUTPUT_SIZE,
                                                          activation_fn=tf.nn.sigmoid)
    # Application of the final projection (sigmoid output, so these are
    # probabilities rather than raw logits)
    predictions = tf.map_fn(final_projection, final_outputs)

    # Binary cross-entropy loss (TINY avoids taking log(0))
    loss = -(labels*tf.log(predictions + TINY) + (1.0 - labels)*tf.log(1.0 - predictions + TINY))
    loss = tf.reduce_mean(loss)

    # train_optimizer
    train_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE).minimize(loss)

    # For validation purposes: a prediction counts as correct when it falls
    # on the right side of 0.5
    accuracy = tf.reduce_mean(tf.cast(abs(predictions - labels) < 0.5, tf.float32))
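
The masked averaging above can be checked in pure numpy with made-up numbers: dividing the masked sum by each sequence's true length (taken from the mask) recovers the mean over only its real time steps.


In [ ]:
# Numpy check of the masked average, independent of the graph.
T, B, H = 5, 2, 3
outs = np.random.randn(T, B, H)
mask = np.zeros((T, B, 1))
mask[:3, 0, 0] = 1   # first sequence has true length 3
mask[:5, 1, 0] = 1   # second sequence has true length 5
masked_mean = (outs*mask).sum(axis=0) / mask.sum(axis=0)
print(np.allclose(masked_mean[0], outs[:3, 0].mean(axis=0)))   # True
print(np.allclose(masked_mean[1], outs[:5, 1].mean(axis=0)))   # True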

In [21]:
def main():
    
    ###########################################################################
    ########                         Training Loop                     ########
    ###########################################################################
    valid_x, valid_mask, valid_y = generate_batch(test_reduced2, batch_size=VALID_BATCH_SIZE)

    with tf.Session(graph=graph) as session:
        session.run(tf.global_variables_initializer())
        for i in range(NUM_EPOCHS):
            epoch_loss = 0
            for j in range(ITERATIONS_PER_EPOCH):
                x, m, y = generate_batch(train_reduced2, batch_size=BATCH_SIZE)
                _loss, _, train_accuracy = session.run([loss, train_optimizer, accuracy], 
                                                        feed_dict={inputs:x, 
                                                                   masks:m,
                                                                   labels:y})
                epoch_loss += _loss
                
            valid_accuracy = session.run(accuracy, 
                                         feed_dict={inputs:valid_x, 
                                                    masks:valid_mask,
                                                    labels:valid_y})
            print('Epoch %d, Loss = %.8f' % (i, epoch_loss/ITERATIONS_PER_EPOCH))
            print('Accuracy = %.1f\n' % (valid_accuracy*100.))

        # final test
        test_x, test_mask, test_y = generate_batch(test_reduced2,
                                                   batch_size=min(len(test_reduced2[0]), 5000))
        test_accuracy = session.run(accuracy, 
                                    feed_dict={inputs:test_x, 
                                               masks:test_mask, 
                                               labels:test_y})
        print('Test Accuracy = %.1f' % (test_accuracy*100.))

In [ ]:
main()