In [1]:
import tensorflow as tf, numpy as np
np.set_printoptions(precision=4, edgeitems=10, suppress=1)
import random, pickle, time, os
from itertools import chain
from collections import defaultdict

import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
sns.set_context('poster')

In [2]:
with open('imdb.pkl', 'rb') as f:
    save = pickle.load(f)
    train = save['train']
    test = save['test']
    del save
with open('imdb.dict.pkl', 'rb') as f:
    dictionary = pickle.load(f, encoding='utf8')
with open('two_dico_embeddings.pickle', 'rb') as f:
    save = pickle.load(f)
    dico_embedding_id = save['dico_embedding_id']
    dico_embedding_word = save['dico_embedding_word']

reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
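
As a quick sanity check (a minimal sketch; it assumes each review is stored as a list of word indices that `reverse_dictionary` can map back to words, with a hypothetical `'<unk>'` fallback for missing ids), we can decode the first training review:

# hypothetical sanity check: decode the first training review back into words
sample_ids = train[0][0]
sample_words = [reverse_dictionary.get(i, '<unk>') for i in sample_ids]
print(len(sample_ids), 'tokens, label =', train[1][0])
print(' '.join(sample_words[:30]), '...')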

Deal with Hyperparameters


In [3]:
MAXLEN = 500             # we only consider sentences up to this length
NO_CUT = 1               # if a sentence is longer than MAXLEN and NO_CUT is set, 
                             # we discard it entirely; otherwise we truncate it 
                             # and keep the first MAXLEN words
VOCAB_SIZE = 10000       # number of distinct words considered
VALID_BATCH_SIZE = 100   # number of sentences in the validation set

SIZE_EMBEDDING = 50      # size of the embeddings for the input vectors
OUTPUT_SIZE = 1          # size of the prediction (0 or 1 in our case)

# ======================================================

RNN_HIDDEN = 200         # size of the LSTM hidden layer
LEARNING_RATE = 0.003      # learning rate

DROPOUT = 0.5            # we apply dropout if DROPOUT > 0;
DROPOUT = 0                  # DROPOUT is the keep probability passed to the 
                             # DropoutWrapper (0 disables dropout entirely)


TINY = 1e-7              # just to avoid some weird corner case when 
                             # computing the cross-entropy
DO_TEST = 1              # do we do the final evaluation of the model or not
    
NUM_EPOCHS = 100        
ITERATONS_PER_EPOCH = 15
BATCH_SIZE = 20

In [4]:
def set_config(config_dict=None, **kwargs):
    """Modify hyperparameters'values as simple as a call `set_config(VARNAME=value)`, or 
    with a complete dict passed"""
    for varname, value in kwargs.items():
        globals()[varname] = value
    if config_dict is not None:
        for varname, value in config_dict.items():
            globals()[varname] = value
        
def print_config():
    """Print the current hyperparameters config"""
    hypers = [var for var in globals().keys() if var.isupper()]
    for h in hypers:
        print(h.rjust(20), '=', eval(h))
        
def get_config():
    return dict([(var, val) for var, val in globals().items() if var.isupper()])
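
For reference, both calling conventions of `set_config` look like this (illustrative values, not executed here):

set_config(DROPOUT=0.5, RNN_HIDDEN=400)         # keyword form
set_config(config_dict={'VOCAB_SIZE': 5000})    # dict form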

In [5]:
print_config()


      SIZE_EMBEDDING = 50
             DO_TEST = 1
             DROPOUT = 0
          VOCAB_SIZE = 10000
         OUTPUT_SIZE = 1
              MAXLEN = 500
          BATCH_SIZE = 20
       LEARNING_RATE = 0.003
              NO_CUT = 1
 ITERATONS_PER_EPOCH = 15
          NUM_EPOCHS = 100
    VALID_BATCH_SIZE = 100
          RNN_HIDDEN = 200
                TINY = 1e-07

In [6]:
lengths = [len(seq) for seq in train[0]]+[len(seq) for seq in test[0]]
lengths_500 = ([len(seq) for seq in train[0] if len(seq) < 500]+
               [len(seq) for seq in test[0] if len(seq) < 500])
with plt.rc_context({'figure.figsize': (10, 8)}):
    sns.distplot(lengths_500, kde=0, bins=100)


Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa8da5fe3c8>

Keep or cut the data to the desired length


In [7]:
def reduced_set(dataset, maxlen=MAXLEN, no_cut=NO_CUT):
    new_seqs, new_labels = [], []
    for seq, label in zip(dataset[0], dataset[1]):
        if len(seq) > maxlen:
            if not no_cut:
                # keep the sentence, truncated to maxlen
                new_seqs.append(seq[:maxlen])
                new_labels.append(label)
        else:
            new_seqs.append(seq)
            new_labels.append(label)
    print('new dataset length :', len(new_labels))
    print('We lost %.2f%% of the examples' % 
          ((len(dataset[1]) - len(new_labels))*100/len(dataset[1]))
         )
    return (new_seqs, new_labels)

train_reduced = reduced_set(train, maxlen=MAXLEN)
test_reduced = reduced_set(test, maxlen=MAXLEN)


new dataset length : 22210
We lost 11.16% of the examples
new dataset length : 22391
We lost 10.44% of the examples

Map rare words to the unknown token


In [8]:
def generalized_set(dataset, vocab_size=VOCAB_SIZE):
    # words outside the vocab_size most frequent ones are mapped to index 1 (unknown)
    return ([[i if i <= vocab_size else 1 for i in seq] for seq in dataset[0]], 
               dataset[1]) 
train_reduced2 = generalized_set(train_reduced)
test_reduced2 = generalized_set(test_reduced)
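
For instance, with a 10000-word vocabulary any index above 10000 is replaced by the unknown token (index 1); a tiny illustration on a made-up sequence:

# made-up example: the out-of-vocabulary index 15000 is mapped to 1
print(generalized_set(([[3, 15000, 42]], [1]), vocab_size=10000))
# -> ([[3, 1, 42]], [1])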

Data generation routines


In [9]:
def generate_batch(train, batch_size=BATCH_SIZE):
    """Sample a random batch and return:
       x     : (MAXLEN, batch_size, SIZE_EMBEDDING) embedded, zero-padded inputs
       xmask : (MAXLEN, batch_size, 1) mask, 1 on real time steps, 0 on padding
       y     : (1, batch_size, 1) labels"""
    seqs, labels = train
    x = np.zeros((MAXLEN, batch_size, SIZE_EMBEDDING), dtype=np.float32)
    xmask = np.zeros((MAXLEN, batch_size, 1), dtype=np.float32)
    y = np.zeros((1, batch_size, 1), dtype=np.float32)
    
    idx_batch = random.sample(range(len(labels)), batch_size)
    seqs_batch = [seqs[i] for i in idx_batch]
    labels_batch = [labels[i] for i in idx_batch]
    
    for j, seq in enumerate(seqs_batch):
        for i in range(len(seq)):
            x[i,j,:] = dico_embedding_id[seq[i]]
        xmask[:len(seq),j,0] = 1
        y[0,j,0] = labels_batch[j]
    
    return x, xmask, y
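
A minimal shape check of the batches this produces (assuming the embedding dictionaries loaded above are available):

x, m, y = generate_batch(train_reduced2, batch_size=4)
print(x.shape, m.shape, y.shape)   # (500, 4, 50) (500, 4, 1) (1, 4, 1)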

Graph building and training


In [10]:
def main(name=None, sets=(train, test), **config_dict):
    set_config(config_dict=config_dict)
    
    #####################################################################
    #############             Sets Preparation             ##############
    #####################################################################
    train, test = sets
    train = generalized_set(reduced_set(train))
    test = generalized_set(reduced_set(test))
    
    #####################################################################
    #############             Graph Definition             ##############
    #####################################################################


    with tf.Graph().as_default() as graph:

        # Definition of the inputs and outputs
        inputs = tf.placeholder(tf.float32, (None, None, SIZE_EMBEDDING)) # time, batch, emb
        masks = tf.placeholder(tf.float32, (None, None, 1))
        labels = tf.placeholder(tf.float32, (None, None, OUTPUT_SIZE))
        # dropout keep probability; defaults to 1.0 (no dropout) unless fed explicitly
        keep_prob = tf.placeholder_with_default(1.0, shape=())


        cell = tf.contrib.rnn.BasicLSTMCell(RNN_HIDDEN)

        # maybe add dropout (keep_prob stays at 1.0 at validation/test time)
        if DROPOUT:
            cell = tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=keep_prob, 
                                                 output_keep_prob=keep_prob)

        # Definition of the initial state
        batch_size = tf.shape(inputs)[1]
        initial_state = cell.zero_state(batch_size, tf.float32)

        # Computation of the outputs and states
        rnn_outputs, rnn_states = tf.nn.dynamic_rnn(cell, inputs, 
                                                    initial_state=initial_state, 
                                                    time_major=True)
        ## Apply the masks
        rnn_outputs_masked = tf.multiply(rnn_outputs, masks)

        ## Averaging over all time steps (works better than taking only the last step).
        ## Note: reduce_mean divides by the padded length MAXLEN, so short sequences are
        ## scaled down; a length-normalized mean would divide the masked sum by
        ## tf.reduce_sum(masks, axis=0, keep_dims=True) instead.
        final_outputs = tf.reduce_mean(rnn_outputs_masked, axis=0, keep_dims=True)


        # Projection of the outputs onto a single sigmoid unit
        final_projection = lambda x: tf.contrib.layers.linear(x, num_outputs=OUTPUT_SIZE, 
                                                              activation_fn=tf.nn.sigmoid)
        # Application of the final projection to the outputs
        # (despite the name, `logits` holds probabilities, since the sigmoid is applied)
        logits = tf.map_fn(final_projection, final_outputs)

        # Binary cross-entropy loss; TINY avoids log(0) on saturated predictions
        # (see the small numpy illustration after this cell)
        loss = -(labels*tf.log(logits + TINY) + (1.0 - labels)*tf.log(1.0 - logits + TINY))
        loss = tf.reduce_mean(loss)

        # train_optimizer
        train_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE).minimize(loss)

        # For validation purpose
        accuracy = tf.reduce_mean(tf.cast(abs(logits - labels) < 0.5, tf.float32))
        
    
    ###########################################################################
    ########                         Training Loop                     ########
    ###########################################################################
    logs = defaultdict(list)

    valid_x, valid_mask, valid_y = generate_batch(test, batch_size=VALID_BATCH_SIZE)

    with tf.Session(graph=graph) as session:
        session.run(tf.global_variables_initializer())
        for i in range(NUM_EPOCHS):
            epoch_loss = 0
            for j in range(ITERATONS_PER_EPOCH):
                x, m, y = generate_batch(train, batch_size=BATCH_SIZE)
                # during training we feed keep_prob=DROPOUT (it defaults to 1.0 otherwise)
                _loss, _, train_accuracy = session.run([loss, train_optimizer, accuracy], 
                                                        feed_dict={inputs:x, 
                                                                   masks:m,
                                                                   labels:y, 
                                                                   keep_prob:DROPOUT if DROPOUT else 1.0})
                epoch_loss += _loss
                
            valid_accuracy = session.run(accuracy, 
                                         feed_dict={inputs:valid_x, 
                                                    masks:valid_mask,
                                                    labels:valid_y})
            print('Iteration : %d, Loss = %.8f' % (i, epoch_loss/ITERATONS_PER_EPOCH))
            print('Accuracy = %.1f\n' % (valid_accuracy*100.))
            logs['loss'].append(epoch_loss)
            logs['train_accuracy'].append(train_accuracy)
            logs['valid_accuracy'].append(valid_accuracy)

        if DO_TEST:
            # Final evaluation; the test batch is split into sub-batches, otherwise
            # 8 GB of RAM is not enough to run it in one go
            TEST_SIZE = 8000
            NB_TESTS = 50
            test_x, test_mask, test_y = generate_batch(test, 
                                                       batch_size=min(len(test[0]), 
                                                                      TEST_SIZE))
            tsize = TEST_SIZE//NB_TESTS
            results = np.zeros(NB_TESTS, dtype=np.float64)
            for i in range(NB_TESTS):
                sub_test_x = test_x[:,i*tsize:(i*tsize+tsize),:]
                sub_test_mask = test_mask[:,i*tsize:(i*tsize+tsize),:]
                sub_test_y = test_y[:,i*tsize:(i*tsize+tsize),:]
                sub_test_acc = session.run(accuracy, 
                                           feed_dict={inputs:sub_test_x, 
                                                      masks:sub_test_mask, 
                                                      labels:sub_test_y})
                results[i] = sub_test_acc*tsize
                print(i)
            test_accuracy = results.sum()/TEST_SIZE
            print('Test Accuracy = %.1f' % (test_accuracy*100.))
            logs['test_accuracy'] = test_accuracy
            logs['config'] = get_config()
    pickle_name = name+"__"+str(config_dict) if (name is not None) else str(config_dict)
    with open(os.path.join('logs', pickle_name+'.pickle'), 'wb') as f:
        pickle.dump(logs, f)
    print("logs \"%s\" pickled" % name)
main(name='standard')
main(name='cut', NO_CUT=0, DROPOUT=0, RNN_HIDDEN=200)
main(name='vocab=5000', NO_CUT=1, VOCAB_SIZE=5000)
main(name='vocab=1000', VOCAB_SIZE=1000)
main(name='dropout', DROPOUT=0.5, VOCAB_SIZE=10000, RNN_HIDDEN=600, NUM_EPOCHS=100)
main(name='dropout-400', DROPOUT=0.5, RNN_HIDDEN=400)
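
For reference, the loss above is the usual binary cross-entropy; a small numpy sketch (illustrative values) shows what it computes and why the TINY offset matters:

p = np.array([0.9, 0.2, 0.5])   # predicted probabilities
y = np.array([1.0, 0.0, 1.0])   # true labels
bce = -(y*np.log(p + TINY) + (1.0 - y)*np.log(1.0 - p + TINY))
print(bce.mean())   # ~0.34 ; without TINY, a saturated wrong prediction (p = 0 or 1)
                    # would give log(0) = -inf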

In [19]:
main(name='dropout-200', DROPOUT=0.5, RNN_HIDDEN=200)


new dataset length : 22210
We lost 11.16% of the examples
new dataset length : 22391
We lost 10.44% of the examples
Iteration : 0, Loss = 0.70273316
Accuracy = 51.0

Iteration : 1, Loss = 0.69295950
Accuracy = 62.0

Iteration : 2, Loss = 0.67874727
Accuracy = 51.0

Iteration : 3, Loss = 0.66985942
Accuracy = 65.0

Iteration : 4, Loss = 0.63485666
Accuracy = 63.0

Iteration : 5, Loss = 0.66547759
Accuracy = 68.0

Iteration : 6, Loss = 0.64898441
Accuracy = 66.0

Iteration : 7, Loss = 0.65170310
Accuracy = 64.0

Iteration : 8, Loss = 0.65686649
Accuracy = 68.0

Iteration : 9, Loss = 0.63622556
Accuracy = 63.0

Iteration : 10, Loss = 0.63087281
Accuracy = 61.0

Iteration : 11, Loss = 0.60892094
Accuracy = 63.0

Iteration : 12, Loss = 0.58841647
Accuracy = 70.0

Iteration : 13, Loss = 0.60529889
Accuracy = 68.0

Iteration : 14, Loss = 0.58428517
Accuracy = 67.0

Iteration : 15, Loss = 0.61338459
Accuracy = 74.0

Iteration : 16, Loss = 0.57982760
Accuracy = 71.0

Iteration : 17, Loss = 0.59030207
Accuracy = 66.0

Iteration : 18, Loss = 0.61560283
Accuracy = 69.0

Iteration : 19, Loss = 0.56724476
Accuracy = 69.0

Iteration : 20, Loss = 0.60448306
Accuracy = 69.0

Iteration : 21, Loss = 0.53000749
Accuracy = 75.0

Iteration : 22, Loss = 0.54754212
Accuracy = 73.0

Iteration : 23, Loss = 0.54793184
Accuracy = 76.0

Iteration : 24, Loss = 0.56133572
Accuracy = 77.0

Iteration : 25, Loss = 0.58006219
Accuracy = 73.0

Iteration : 26, Loss = 0.55342856
Accuracy = 70.0

Iteration : 27, Loss = 0.56053991
Accuracy = 70.0

Iteration : 28, Loss = 0.50456653
Accuracy = 64.0

Iteration : 29, Loss = 0.53177011
Accuracy = 70.0

Iteration : 30, Loss = 0.53809770
Accuracy = 74.0

Iteration : 31, Loss = 0.55486096
Accuracy = 74.0

Iteration : 32, Loss = 0.57650455
Accuracy = 75.0

Iteration : 33, Loss = 0.50357399
Accuracy = 78.0

Iteration : 34, Loss = 0.51404161
Accuracy = 73.0

Iteration : 35, Loss = 0.51532975
Accuracy = 73.0

Iteration : 36, Loss = 0.44022499
Accuracy = 74.0

Iteration : 37, Loss = 0.65205117
Accuracy = 68.0

Iteration : 38, Loss = 0.57769157
Accuracy = 72.0

Iteration : 39, Loss = 0.52176053
Accuracy = 68.0

Iteration : 40, Loss = 0.57294675
Accuracy = 70.0

Iteration : 41, Loss = 0.51843280
Accuracy = 76.0

Iteration : 42, Loss = 0.56552632
Accuracy = 72.0

Iteration : 43, Loss = 0.54391804
Accuracy = 79.0

Iteration : 44, Loss = 0.47784349
Accuracy = 75.0

Iteration : 45, Loss = 0.48359947
Accuracy = 74.0

Iteration : 46, Loss = 0.54126767
Accuracy = 74.0

Iteration : 47, Loss = 0.47913333
Accuracy = 71.0

Iteration : 48, Loss = 0.48697842
Accuracy = 75.0

Iteration : 49, Loss = 0.49223171
Accuracy = 71.0

Iteration : 50, Loss = 0.51565799
Accuracy = 79.0

Iteration : 51, Loss = 0.47508008
Accuracy = 75.0

Iteration : 52, Loss = 0.48437018
Accuracy = 69.0

Iteration : 53, Loss = 0.51895565
Accuracy = 79.0

Iteration : 54, Loss = 0.45901849
Accuracy = 73.0

Iteration : 55, Loss = 0.48698364
Accuracy = 76.0

Iteration : 56, Loss = 0.49798693
Accuracy = 75.0

Iteration : 57, Loss = 0.46302055
Accuracy = 81.0

Iteration : 58, Loss = 0.48636708
Accuracy = 72.0

Iteration : 59, Loss = 0.53315408
Accuracy = 75.0

Iteration : 60, Loss = 0.44734023
Accuracy = 80.0

Iteration : 61, Loss = 0.49542241
Accuracy = 68.0

Iteration : 62, Loss = 0.50223312
Accuracy = 78.0

Iteration : 63, Loss = 0.48167020
Accuracy = 77.0

Iteration : 64, Loss = 0.45346881
Accuracy = 78.0

Iteration : 65, Loss = 0.47606792
Accuracy = 72.0

Iteration : 66, Loss = 0.45719924
Accuracy = 72.0

Iteration : 67, Loss = 0.48150486
Accuracy = 77.0

Iteration : 68, Loss = 0.47600580
Accuracy = 74.0

Iteration : 69, Loss = 0.51677332
Accuracy = 76.0

Iteration : 70, Loss = 0.51419432
Accuracy = 73.0

Iteration : 71, Loss = 0.46550310
Accuracy = 80.0

Iteration : 72, Loss = 0.45826542
Accuracy = 73.0

Iteration : 73, Loss = 0.48370690
Accuracy = 78.0

Iteration : 74, Loss = 0.47124236
Accuracy = 72.0

Iteration : 75, Loss = 0.45667866
Accuracy = 71.0

Iteration : 76, Loss = 0.45997865
Accuracy = 74.0

Iteration : 77, Loss = 0.42874323
Accuracy = 80.0

Iteration : 78, Loss = 0.50419249
Accuracy = 75.0

Iteration : 79, Loss = 0.46212334
Accuracy = 76.0

Iteration : 80, Loss = 0.48520953
Accuracy = 76.0

Iteration : 81, Loss = 0.48780722
Accuracy = 75.0

Iteration : 82, Loss = 0.45386540
Accuracy = 80.0

Iteration : 83, Loss = 0.50363068
Accuracy = 75.0

Iteration : 84, Loss = 0.45110379
Accuracy = 73.0

Iteration : 85, Loss = 0.49921503
Accuracy = 81.0

Iteration : 86, Loss = 0.43712987
Accuracy = 77.0

Iteration : 87, Loss = 0.44570413
Accuracy = 84.0

Iteration : 88, Loss = 0.45326492
Accuracy = 80.0

Iteration : 89, Loss = 0.40862830
Accuracy = 75.0

Iteration : 90, Loss = 0.42203861
Accuracy = 74.0

Iteration : 91, Loss = 0.46262227
Accuracy = 80.0

Iteration : 92, Loss = 0.41737071
Accuracy = 75.0

Iteration : 93, Loss = 0.48876720
Accuracy = 73.0

Iteration : 94, Loss = 0.42223377
Accuracy = 80.0

Iteration : 95, Loss = 0.45822909
Accuracy = 76.0

Iteration : 96, Loss = 0.38258452
Accuracy = 75.0

Iteration : 97, Loss = 0.43776881
Accuracy = 74.0

Iteration : 98, Loss = 0.44898683
Accuracy = 78.0

Iteration : 99, Loss = 0.48205647
Accuracy = 77.0

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
Test Accuracy = 79.2
logs "dropout-200" pickled

In [21]:
with open("logs/standard__{}.pickle", "rb") as f:
    logs_no_dropout = pickle.load(f)
with open("logs/dropout-200__{'DROPOUT': 0.5, 'RNN_HIDDEN': 200}.pickle", "rb") as f:
    logs_dropout_200 = pickle.load(f)
with open("logs/dropout-400__{'DROPOUT': 0.5, 'RNN_HIDDEN': 400}.pickle", "rb") as f:
    logs_dropout_400 = pickle.load(f)
with open("logs/dropout__{'VOCAB_SIZE': 10000, 'RNN_HIDDEN': 500, 'DROPOUT': 0.5, 'NUM_EPOCHS': 100}.pickle", "rb") as f:
    logs_dropout_500 = pickle.load(f)
with open("logs/dropout__{'DROPOUT': 0.5, 'RNN_HIDDEN': 600, 'VOCAB_SIZE': 10000, 'NUM_EPOCHS': 100}.pickle", "rb") as f:
    logs_dropout_600 = pickle.load(f)

In [22]:
logs = logs_no_dropout
logs2 = logs_dropout_200
logs3 = logs_dropout_400
logs4 = logs_dropout_500
logs5 = logs_dropout_600
supp = range(ITERATONS_PER_EPOCH, NUM_EPOCHS*ITERATONS_PER_EPOCH+1, ITERATONS_PER_EPOCH)
with plt.rc_context({'figure.figsize': (14, 10)}):
    fig, ax = plt.subplots()
    l2 = ax.plot(supp, logs['valid_accuracy'], color='#99e699')
    l2_1 = ax.plot(supp, logs2['valid_accuracy'], color='#33cc33')
    l2_2 = ax.plot(supp, logs3['valid_accuracy'], color='#1f7a1f')
    l2_3 = ax.plot(supp, logs4['valid_accuracy'], color='#264d00')
    l2_4 = ax.plot(supp, logs5['valid_accuracy'], color='#1a1a1a')
    
    # plot the final test score for each run
    l4 = ax.scatter(supp[-1], logs['test_accuracy'], color="#99e699")
    l4_1 = ax.scatter(supp[-1], logs2['test_accuracy'], color="#33cc33")
    l4_2 = ax.scatter(supp[-1], logs3['test_accuracy'], color="#1f7a1f")
    l4_3 = ax.scatter(supp[-1], logs4['test_accuracy'], color="#264d00")
    l4_4 = ax.scatter(supp[-1], logs5['test_accuracy'], color="#1a1a1a")
    
    ax.set_xlabel('Iterations')
    ax.set_ylabel('Accuracy (ratio)')
    ax.legend(l2+l2_1+l2_2+l2_3+l2_4+[l4,l4_1,l4_2,l4_3, l4_4], 
              ['validation accuracy, no dropout', "dropout, RNN_HIDDEN=200", 
               "dropout, RNN_HIDDEN=400", "dropout, RNN_HIDDEN=500", 
               "dropout, RNN_HIDDEN=600", 
               'test accuracy, no dropout', "dropout, RNN_HIDDEN=200", 
               "dropout, RNN_HIDDEN=400", 'dropout, RNN_HIDDEN=500', 
               "dropout, RNN_HIDDEN=600"], 
              loc='lower left', 
              frameon=True)
    frame = ax.legend_.get_frame()
    frame.set_facecolor('white')
    frame.set_edgecolor('black')
1;



In [25]:
logs = logs_no_dropout
logs2 = logs_dropout_400
logs3 = logs_dropout_500
logs4 = logs_dropout_600
supp = range(ITERATONS_PER_EPOCH, NUM_EPOCHS*ITERATONS_PER_EPOCH+1, ITERATONS_PER_EPOCH)
with plt.rc_context({'figure.figsize': (14, 10)}):
    fig, ax = plt.subplots()    
    ax.set_xlabel('Iterations')
    ax.set_ylabel('Loss')
    with sns.axes_style('white'):
        ax2 = ax.twinx()
        l3 = ax2.plot(supp, logs['loss'], color='#ff9980')
        l33 = ax2.plot(supp, logs2['loss'], color='#ff3300')
        l333 = ax2.plot(supp, logs3['loss'], color='#991f00')
        l3333 = ax2.plot(supp, logs4['loss'], color='#1a1a1a')
    ax2.legend(l3+l33+l333+l3333, 
              ['loss, no dropout', "dropout, RNN_HIDDEN=400", 'dropout, RNN_HIDDEN=500',
               "dropout, RNN_HIDDEN=600"], 
              loc='lower left', 
              frameon=True)
    frame = ax2.legend_.get_frame()
    frame.set_facecolor('white')
    frame.set_edgecolor('black')
1;



In [ ]: