In [1]:
import tensorflow as tf, numpy as np
np.set_printoptions(precision=4, edgeitems=10, suppress=1)
import random, pickle, time, os
from itertools import chain
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
sns.set_context('poster')
In [2]:
with open('imdb.pkl', 'rb') as f:
    save = pickle.load(f)
train = save['train']
test = save['test']
del save
with open('imdb.dict.pkl', 'rb') as f:
    dictionary = pickle.load(f, encoding='utf8')
with open('two_dico_embeddings.pickle', 'rb') as f:
    save = pickle.load(f)
dico_embedding_id = save['dico_embedding_id']
dico_embedding_word = save['dico_embedding_word']
reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
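
# Quick sanity check (illustrative sketch, not required for the rest of the
# notebook): `dictionary` maps words to integer ids, so `reverse_dictionary`
# lets us read a review back as text. Ids 0 and 1 are assumed here to be the
# padding / unknown-word markers and may not decode to real words.
print(' '.join(reverse_dictionary.get(i, '<?>') for i in train[0][0][:20]))
print('label :', train[1][0])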
In [3]:
MAXLEN = 500            # we consider sentences up to this length
NO_CUT = 1              # if a sentence is longer than MAXLEN and NO_CUT is set,
                        # we discard it; otherwise we truncate it to MAXLEN
VOCAB_SIZE = 10000      # number of distinct words considered
VALID_BATCH_SIZE = 100  # number of sentences in the validation set
SIZE_EMBEDDING = 50     # size of the input word embeddings
OUTPUT_SIZE = 1         # size of the prediction (0 or 1 in our case)
# ======================================================
RNN_HIDDEN = 200        # size of the LSTM hidden layer
LEARNING_RATE = 0.003   # learning rate
DROPOUT = 0.5           # we apply dropout if DROPOUT > 0;
DROPOUT = 0             # DROPOUT is passed to DropoutWrapper as the keep probability
TINY = 1e-7             # just to avoid a corner case (log(0)) when
                        # computing the cross-entropy
DO_TEST = 1             # whether to run the final evaluation of the model
NUM_EPOCHS = 100
ITERATONS_PER_EPOCH = 15
BATCH_SIZE = 20
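
# A quick numerical check (illustrative only) of why TINY is added inside the
# logarithms of the cross-entropy later on: if the network predicted exactly 0
# for a positive review, log(0) would make the loss infinite.
p, y = np.array([0.0, 1.0]), np.array([1.0, 0.0])
print(-(y*np.log(p + TINY) + (1.0 - y)*np.log(1.0 - p + TINY)))  # large but finite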
In [4]:
def set_config(config_dict=None, **kwargs):
    """Modify hyperparameter values with a simple call `set_config(VARNAME=value)`,
    or by passing a complete dict."""
    for varname, value in kwargs.items():
        globals()[varname] = value
    if config_dict is not None:
        for varname, value in config_dict.items():
            globals()[varname] = value

def print_config():
    """Print the current hyperparameter configuration"""
    hypers = [var for var in globals().keys() if var.isupper()]
    for h in hypers:
        print(h.rjust(20), '=', eval(h))

def get_config():
    """Return the current hyperparameter configuration as a dict"""
    return dict([(var, val) for var, val in globals().items() if var.isupper()])
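
# Example usage (a small sketch): hyperparameters can be overridden either with
# keyword arguments or with a dict, and the change is visible to every function
# that reads the module-level constants.
set_config(RNN_HIDDEN=400)
set_config(config_dict={'LEARNING_RATE': 0.001})
assert RNN_HIDDEN == 400 and LEARNING_RATE == 0.001
set_config(RNN_HIDDEN=200, LEARNING_RATE=0.003)  # restore the values defined above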
In [5]:
print_config()
In [6]:
lengths = [len(seq) for seq in train[0]] + [len(seq) for seq in test[0]]
lengths_500 = ([len(seq) for seq in train[0] if len(seq) < 500] +
               [len(seq) for seq in test[0] if len(seq) < 500])
with plt.rc_context({'figure.figsize': (10, 8)}):
    sns.distplot(lengths_500, kde=0, bins=100)
Out[6]:
[histogram: distribution of review lengths under 500 words]
In [7]:
def reduced_set(dataset, maxlen=MAXLEN, no_cut=NO_CUT):
    new_seqs, new_labels = [], []
    for seq, label in zip(dataset[0], dataset[1]):
        if len(seq) > maxlen:
            if not no_cut:
                # keep the review, truncated to maxlen
                new_seqs.append(seq[:maxlen])
                new_labels.append(label)
        else:
            new_seqs.append(seq)
            new_labels.append(label)
    print('new dataset length :', len(new_labels))
    print('We lost %.2f%% of the examples' %
          ((len(dataset[1]) - len(new_labels))*100/len(dataset[1]))
         )
    return (new_seqs, new_labels)

train_reduced = reduced_set(train, maxlen=MAXLEN)
test_reduced = reduced_set(test, maxlen=MAXLEN)
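
# Small check of the two behaviours (illustrative sketch): with no_cut=False the
# long reviews are truncated to maxlen instead of being dropped, so nothing is lost.
train_cut = reduced_set(train, maxlen=MAXLEN, no_cut=False)
print(max(len(seq) for seq in train_cut[0]) <= MAXLEN)  # expected: True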
In [8]:
def generalized_set(dataset, vocab_size=VOCAB_SIZE):
    # map every word id above vocab_size to 1, the "unknown word" id
    return ([[i if i <= vocab_size else 1 for i in seq] for seq in dataset[0]],
            dataset[1])

train_reduced2 = generalized_set(train_reduced)
test_reduced2 = generalized_set(test_reduced)
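
# Sanity check (sketch): after generalized_set, every word id above VOCAB_SIZE
# has been replaced by 1, the id reserved for out-of-vocabulary words.
print(max(chain.from_iterable(train_reduced2[0])) <= VOCAB_SIZE)  # expected: True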
In [9]:
def generate_batch(train, batch_size=BATCH_SIZE):
    """Sample a random batch and return time-major (time, batch, feature) arrays."""
    seqs, labels = train
    x = np.zeros((MAXLEN, batch_size, SIZE_EMBEDDING), dtype=np.float)
    xmask = np.zeros((MAXLEN, batch_size, 1), dtype=np.float)
    y = np.zeros((1, batch_size, 1), dtype=np.float)
    idx_batch = random.sample(range(len(labels)), batch_size)
    seqs_batch = [seqs[i] for i in idx_batch]
    labels_batch = [labels[i] for i in idx_batch]
    for j, seq in enumerate(seqs_batch):
        for i in range(len(seq)):
            x[i, j, :] = dico_embedding_id[seq[i]]  # look up the pre-trained embedding
        xmask[:len(seq), j, 0] = 1                  # mark the real (non-padded) time steps
        y[0, j, 0] = labels_batch[j]
    return x, xmask, y
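
# Shape check (sketch): generate_batch returns time-major arrays, which is what
# dynamic_rnn(time_major=True) expects in the graph below.
_x, _m, _y = generate_batch(train_reduced2, batch_size=4)
print(_x.shape, _m.shape, _y.shape)  # (MAXLEN, 4, SIZE_EMBEDDING), (MAXLEN, 4, 1), (1, 4, 1)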
In [10]:
def main(name=None, sets=(train, test), **config_dict):
    set_config(config_dict=config_dict)
    #####################################################################
    #############           Sets Preparation              ##############
    #####################################################################
    train, test = sets
    train = generalized_set(reduced_set(train))
    test = generalized_set(reduced_set(test))
    #####################################################################
    #############           Graph Definition              ##############
    #####################################################################
    with tf.Graph().as_default() as graph:
        # Definition of the inputs and outputs
        inputs = tf.placeholder(tf.float32, (None, None, SIZE_EMBEDDING))  # time, batch, emb
        masks = tf.placeholder(tf.float32, (None, None, 1))
        labels = tf.placeholder(tf.float32, (None, None, OUTPUT_SIZE))
        _is_training = tf.Variable(initial_value=True, trainable=False)
        cell = tf.contrib.rnn.BasicLSTMCell(RNN_HIDDEN)
        # maybe add dropout; note that _is_training is not wired into the graph,
        # so the wrapper (keep probability DROPOUT) stays active at evaluation time too
        if DROPOUT:
            cell = tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=DROPOUT,
                                                 output_keep_prob=DROPOUT)
        # Definition of the initial state
        batch_size = tf.shape(inputs)[1]
        initial_state = cell.zero_state(batch_size, tf.float32)
        # Computation of the outputs and states
        rnn_outputs, rnn_states = tf.nn.dynamic_rnn(cell, inputs,
                                                    initial_state=initial_state,
                                                    time_major=True)
        ## Apply the masks
        rnn_outputs_masked = tf.multiply(rnn_outputs, masks)
        ## Average over all time steps (better than taking only the last step)
        final_outputs = tf.reduce_mean(rnn_outputs_masked, axis=0, keep_dims=True)
        # Projection of the outputs
        final_projection = lambda x: tf.contrib.layers.linear(x, num_outputs=OUTPUT_SIZE,
                                                              activation_fn=tf.nn.sigmoid)
        # Application of the final projection to the outputs
        # (because of the sigmoid, `logits` are already probabilities)
        logits = tf.map_fn(final_projection, final_outputs)
        # Loss (binary cross-entropy)
        loss = -(labels*tf.log(logits + TINY) + (1.0 - labels)*tf.log(1.0 - logits + TINY))
        loss = tf.reduce_mean(loss)
        # train_optimizer
        train_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE).minimize(loss)
        # For validation purposes
        accuracy = tf.reduce_mean(tf.cast(abs(logits - labels) < 0.5, tf.float32))
    ###########################################################################
    ########                    Training Loop                         ########
    ###########################################################################
    logs = defaultdict(list)
    valid_x, valid_mask, valid_y = generate_batch(test, batch_size=VALID_BATCH_SIZE)
    with tf.Session(graph=graph) as session:
        session.run(tf.global_variables_initializer())
        for i in range(NUM_EPOCHS):
            epoch_loss = 0
            for j in range(ITERATONS_PER_EPOCH):
                x, m, y = generate_batch(train, batch_size=BATCH_SIZE)
                _loss, _, train_accuracy = session.run([loss, train_optimizer, accuracy],
                                                       feed_dict={inputs: x,
                                                                  masks: m,
                                                                  labels: y,
                                                                  _is_training: True})
                epoch_loss += _loss
            valid_accuracy = session.run(accuracy,
                                         feed_dict={inputs: valid_x,
                                                    masks: valid_mask,
                                                    labels: valid_y,
                                                    _is_training: False})
            print('Epoch : %d, Loss = %.8f' % (i, epoch_loss/ITERATONS_PER_EPOCH))
            print('Accuracy = %.1f\n' % (valid_accuracy*100.))
            logs['loss'].append(epoch_loss)
            logs['train_accuracy'].append(train_accuracy)
            logs['valid_accuracy'].append(valid_accuracy)
        if DO_TEST:
            # Final test; the test batch is split into sub-batches, otherwise
            # 8 GB of RAM is not enough
            TEST_SIZE = 8000
            NB_TESTS = 50
            test_x, test_mask, test_y = generate_batch(test,
                                                       batch_size=min(len(test[0]),
                                                                      TEST_SIZE))
            tsize = TEST_SIZE//NB_TESTS
            results = np.zeros(NB_TESTS, dtype=np.float64)
            for i in range(NB_TESTS):
                sub_test_x = test_x[:, i*tsize:(i*tsize+tsize), :]
                sub_test_mask = test_mask[:, i*tsize:(i*tsize+tsize), :]
                sub_test_y = test_y[:, i*tsize:(i*tsize+tsize), :]
                sub_test_acc = session.run(accuracy,
                                           feed_dict={inputs: sub_test_x,
                                                      masks: sub_test_mask,
                                                      labels: sub_test_y,
                                                      _is_training: False})
                results[i] = sub_test_acc*tsize
                print(i)
            test_accuracy = results.sum()/TEST_SIZE
            print('Test Accuracy = %.1f' % (test_accuracy*100.))
            logs['test_accuracy'] = test_accuracy
    logs['config'] = get_config()
    pickle_name = name+"__"+str(config_dict) if (name is not None) else str(config_dict)
    with open(os.path.join('logs', pickle_name+'.pickle'), 'wb') as f:
        pickle.dump(logs, f)
    print("logs \"%s\" pickled" % name)
In [19]:
main(name='dropout-200', DROPOUT=0.5, RNN_HIDDEN=200)
In [21]:
with open("logs/standard__{}.pickle", "rb") as f:
logs_no_dropout = pickle.load(f)
with open("logs/dropout-200__{'DROPOUT': 0.5, 'RNN_HIDDEN': 200}.pickle", "rb") as f:
logs_dropout_200 = pickle.load(f)
with open("logs/dropout-400__{'DROPOUT': 0.5, 'RNN_HIDDEN': 400}.pickle", "rb") as f:
logs_dropout_400 = pickle.load(f)
with open("logs/dropout__{'VOCAB_SIZE': 10000, 'RNN_HIDDEN': 500, 'DROPOUT': 0.5, 'NUM_EPOCHS': 100}.pickle", "rb") as f:
logs_dropout_500 = pickle.load(f)
with open("logs/dropout__{'DROPOUT': 0.5, 'RNN_HIDDEN': 600, 'VOCAB_SIZE': 10000, 'NUM_EPOCHS': 100}.pickle", "rb") as f:
logs_dropout_600 = pickle.load(f)
In [22]:
logs = logs_no_dropout
logs2 = logs_dropout_200
logs3 = logs_dropout_400
logs4 = logs_dropout_500
logs5 = logs_dropout_600
supp = range(ITERATONS_PER_EPOCH, NUM_EPOCHS*ITERATONS_PER_EPOCH+1, ITERATONS_PER_EPOCH)
with plt.rc_context({'figure.figsize': (14, 10)}):
    fig, ax = plt.subplots()
    l2 = ax.plot(supp, logs['valid_accuracy'], color='#99e699')
    l2_1 = ax.plot(supp, logs2['valid_accuracy'], color='#33cc33')
    l2_2 = ax.plot(supp, logs3['valid_accuracy'], color='#1f7a1f')
    l2_3 = ax.plot(supp, logs4['valid_accuracy'], color='#264d00')
    l2_4 = ax.plot(supp, logs5['valid_accuracy'], color='#1a1a1a')
    # plot the final test scores
    l4 = ax.scatter(supp[-1], logs['test_accuracy'], color="#99e699")
    l4_1 = ax.scatter(supp[-1], logs2['test_accuracy'], color="#33cc33")
    l4_2 = ax.scatter(supp[-1], logs3['test_accuracy'], color="#1f7a1f")
    l4_3 = ax.scatter(supp[-1], logs4['test_accuracy'], color="#264d00")
    l4_4 = ax.scatter(supp[-1], logs5['test_accuracy'], color="#1a1a1a")
    ax.set_xlabel('Iterations')
    ax.set_ylabel('Accuracy (ratio)')
    ax.legend(l2+l2_1+l2_2+l2_3+l2_4+[l4, l4_1, l4_2, l4_3, l4_4],
              ['validation accuracy without dropout', "dropout, RNN_HIDDEN=200",
               "dropout, RNN_HIDDEN=400", "dropout, RNN_HIDDEN=500",
               "dropout, RNN_HIDDEN=600",
               'test accuracy without dropout', "dropout, RNN_HIDDEN=200",
               "dropout, RNN_HIDDEN=400", 'dropout, RNN_HIDDEN=500',
               "dropout, RNN_HIDDEN=600"],
              loc='lower left',
              frameon=True)
    frame = ax.legend_.get_frame()
    frame.set_facecolor('white')
    frame.set_edgecolor('black')
1;
In [25]:
logs = logs_no_dropout
logs2 = logs_dropout_400
logs3 = logs_dropout_500
logs4 = logs_dropout_600
supp = range(ITERATONS_PER_EPOCH, NUM_EPOCHS*ITERATONS_PER_EPOCH+1, ITERATONS_PER_EPOCH)
with plt.rc_context({'figure.figsize': (14, 10)}):
    fig, ax = plt.subplots()
    ax.set_xlabel('Iterations')
    ax.set_ylabel("Loss")
    with sns.axes_style('white'):
        ax2 = ax.twinx()
    l3 = ax2.plot(supp, logs['loss'], color='#ff9980')
    l33 = ax2.plot(supp, logs2['loss'], color='#ff3300')
    l333 = ax2.plot(supp, logs3['loss'], color='#991f00')
    l3333 = ax2.plot(supp, logs4['loss'], color='#1a1a1a')
    ax2.legend(l3+l33+l333+l3333,
               ['loss without dropout', "dropout, RNN_HIDDEN=400", 'dropout, RNN_HIDDEN=500',
                "dropout, RNN_HIDDEN=600"],
               loc='lower left',
               frameon=True)
    frame = ax2.legend_.get_frame()
    frame.set_facecolor('white')
    frame.set_edgecolor('black')
1;