In [1]:
# Import all the necessary modules
import os
import sys
os.environ["THEANO_FLAGS"] = "mode=FAST_RUN,optimizer=None,device=cpu,floatX=float32"
sys.path.insert(0,'..')
import numpy as np
import theano
import theano.tensor as T
import lasagne
from confusionmatrix import ConfusionMatrix
from utils import iterate_minibatches, LSTMAttentionDecodeFeedbackLayer
import matplotlib.pyplot as plt
import time
import itertools
%matplotlib inline
The first thing we have to do is define the network architecture. Here we are going to use an input layer, two convolutional stages (two parallel convolutions with different filter sizes whose outputs are concatenated, followed by a second convolution), a bidirectional LSTM, an attention layer, a dense layer and an output layer. These are the steps that we are going to follow:
1.- Specify the hyperparameters of the network:
In [2]:
# --- Data dimensions ---
# Number of sequences per minibatch
batch_size = 128
# Sequence length of the dummy data used to check the layer shapes
seq_len = 400
# Size of the last axis of the input (features per sequence position)
n_feat = 20
# Number of units of the softmax output layer
n_class = 10

# --- Model size ---
# Number of filters of each first-stage convolution (the second stage uses twice as many)
n_filt = 10
# Number of units of each LSTM direction
n_hid = 15

# --- Optimization ---
# Learning rate passed to the ADAM update rule
lr = 0.0025
# Dropout probability applied before the output layer
drop_prob = 0.5
2.- Define the input variables to our network:
In [3]:
# Symbolic inputs of the network
# 3D float32 tensor: the encoded protein data
input_var = T.ftensor3('inputs')
# 1D integer vector: the labels
target_var = T.ivector('targets')
# 2D float32 matrix: the masks used to ignore the padded positions
mask_var = T.fmatrix('masks')

# Dummy batch, only used to check the size of the layers while the network is being built
dummy_shape = (batch_size, seq_len, n_feat)
X = np.random.randint(0, 10, size=dummy_shape).astype(np.float32)
# All-ones mask: no position of the dummy batch is treated as padding
Xmask = np.ones(dummy_shape[:2], dtype=np.float32)
3.- Define the layers of the network:
In [4]:
def _print_output_shape(template, layer, layer_inputs, feed):
    """Evaluate `layer` on the dummy data and print the shape of its output.

    template: format string with a single `{}` placeholder that receives the shape.
    layer: Lasagne layer whose output is evaluated.
    layer_inputs: dict handed to `lasagne.layers.get_output` as `inputs`.
    feed: dict mapping the symbolic variables to the arrays given to `eval`.
    """
    symbolic_output = lasagne.layers.get_output(layer, inputs=layer_inputs)
    print(template.format(symbolic_output.eval(feed).shape))


# Input layer, holds the shape of the data
l_in = lasagne.layers.InputLayer(shape=(batch_size, None, n_feat), input_var=input_var,
                                 name='Input')
# Wiring for the layers that only depend on the sequence input
seq_inputs = {l_in: input_var}
seq_feed = {input_var: X}
_print_output_shape('Input layer: {}', l_in, seq_inputs, seq_feed)

# Mask input layer
l_mask = lasagne.layers.InputLayer(shape=(batch_size, None), input_var=mask_var, name='Mask')
_print_output_shape('Mask layer: {}', l_mask, {l_mask: mask_var}, {mask_var: Xmask})

# Wiring for the layers that depend on both the sequence input and the mask
masked_inputs = {l_in: input_var, l_mask: mask_var}
masked_feed = {input_var: X, mask_var: Xmask}

# Shuffle shape to be properly read by the CNN layer
l_shu = lasagne.layers.DimshuffleLayer(l_in, (0, 2, 1))
_print_output_shape('DimshuffleLayer layer: {}', l_shu, seq_inputs, seq_feed)

# Convolutional layers with different filter size
l_conv_a = lasagne.layers.Conv1DLayer(
    l_shu, num_filters=n_filt, pad='same', stride=1, filter_size=3,
    nonlinearity=lasagne.nonlinearities.rectify)
_print_output_shape('Convolutional layer size 3: {}', l_conv_a, seq_inputs, seq_feed)

l_conv_b = lasagne.layers.Conv1DLayer(
    l_shu, num_filters=n_filt, pad='same', stride=1, filter_size=5,
    nonlinearity=lasagne.nonlinearities.rectify)
_print_output_shape('Convolutional layer size 5: {}', l_conv_b, seq_inputs, seq_feed)

# The output of both convolutions is concatenated along axis 1
l_conc = lasagne.layers.ConcatLayer([l_conv_a, l_conv_b], axis=1)
_print_output_shape('Concatenated convolutional layers: {}', l_conc, seq_inputs, seq_feed)

# Second CNN layer, with twice as many filters
l_conv_final = lasagne.layers.Conv1DLayer(
    l_conc, num_filters=n_filt*2, pad='same', stride=1, filter_size=3,
    nonlinearity=lasagne.nonlinearities.rectify)
_print_output_shape('Final convolutional layer: {}', l_conv_final, seq_inputs, seq_feed)

# Undo the first dimshuffle before feeding the recurrent layers
l_reshu = lasagne.layers.DimshuffleLayer(l_conv_final, (0, 2, 1))
_print_output_shape('Second DimshuffleLayer layer: {}', l_reshu, seq_inputs, seq_feed)

# Bidirectional LSTM: one layer reads the sequence forwards, the other backwards.
# Both layers are created before either shape is printed.
l_fwd = lasagne.layers.LSTMLayer(
    l_reshu, num_units=n_hid, name='LSTMFwd', mask_input=l_mask,
    nonlinearity=lasagne.nonlinearities.tanh)
l_bck = lasagne.layers.LSTMLayer(
    l_reshu, num_units=n_hid, name='LSTMBck', mask_input=l_mask, backwards=True,
    nonlinearity=lasagne.nonlinearities.tanh)
_print_output_shape('Forward LSTM layer: {}', l_fwd, masked_inputs, masked_feed)
_print_output_shape('Backward LSTM layer: {}', l_bck, masked_inputs, masked_feed)

# Concatenate both layers
l_conc_lstm = lasagne.layers.ConcatLayer([l_fwd, l_bck], axis=2)
_print_output_shape('Concatenated hidden states: {}', l_conc_lstm, masked_inputs, masked_feed)

# Attention layer (project helper imported from utils), run for two decoding steps
l_att = LSTMAttentionDecodeFeedbackLayer(
    l_conc_lstm, mask_input=l_mask, num_units=n_hid*2, aln_num_units=n_hid,
    n_decodesteps=2, name='LSTMAttention')
_print_output_shape('Attention layer: {}', l_att, masked_inputs, masked_feed)

# Keep only the last entry along axis 1 of the attention output
l_last_hid = lasagne.layers.SliceLayer(l_att, indices=-1, axis=1)
_print_output_shape('Last decoding step: {}', l_last_hid, masked_inputs, masked_feed)

# Dense layer with ReLu activation function
l_dense = lasagne.layers.DenseLayer(
    l_last_hid, num_units=n_hid*2, name="Dense",
    nonlinearity=lasagne.nonlinearities.rectify)
_print_output_shape('Dense layer: {}', l_dense, masked_inputs, masked_feed)

# Output layer with a Softmax activation function. Note that we include a dropout layer
l_out = lasagne.layers.DenseLayer(
    lasagne.layers.dropout(l_dense, p=drop_prob), num_units=n_class, name="Softmax",
    nonlinearity=lasagne.nonlinearities.softmax)
_print_output_shape('Output layer: {}', l_out, masked_inputs, masked_feed)
4.- Calculate the prediction and network loss for the training set and update the network weights:
In [5]:
# Training output of the network; deterministic=False is used for training
prediction = lasagne.layers.get_output(
    l_out, inputs={l_in: input_var, l_mask: mask_var}, deterministic=False)
# Categorical cross entropy between the labels and the prediction.
# The Lasagne objective is used here so that the training loss is built with the
# same function as the validation loss.
t_loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
# Training loss: mean over the minibatch
loss = T.mean(t_loss)
# Trainable parameters of the whole network
params = lasagne.layers.get_all_params([l_out], trainable=True)
# Get the network gradients and perform total norm constraint normalization
all_grads = lasagne.updates.total_norm_constraint(T.grad(loss, params), max_norm=3)
# Update parameters using ADAM
updates = lasagne.updates.adam(all_grads, params, learning_rate=lr)
5.- Calculate the prediction and network loss for the validation set:
In [6]:
# Validation output of the network; deterministic=True is only used for validation
val_network_inputs = {l_in: input_var, l_mask: mask_var}
val_prediction = lasagne.layers.get_output(
    l_out, inputs=val_network_inputs, deterministic=True)
# Categorical cross entropy between the labels and the validation prediction
t_val_loss = lasagne.objectives.categorical_crossentropy(val_prediction, target_var)
# Validation loss: mean over the minibatch
val_loss = T.mean(t_val_loss)
6.- Build theano functions:
In [ ]:
# Compile the Theano functions. Both take (inputs, targets, masks), in that order.
fn_inputs = (input_var, target_var, mask_var)
# Training step: returns the loss and the prediction, and applies the parameter updates
train_fn = theano.function(list(fn_inputs), [loss, prediction], updates=updates)
# Validation step: returns the loss, the prediction and the `alpha` attribute of the
# attention layer (presumably the attention weights -- confirm in utils)
val_fn = theano.function(list(fn_inputs), [val_loss, val_prediction, l_att.alpha])
In [8]:
# Load the encoded protein sequences, labels and masks of the training set.
# The archive is opened in a `with` block so that its file handle is closed as soon
# as the three arrays have been read into memory.
with np.load('data/reduced_train.npz') as train:
    X_train = train['X_train']
    y_train = train['y_train']
    mask_train = train['mask_train']
print(X_train.shape)
In [9]:
# Load the encoded protein sequences, labels and masks of the validation set.
# The archive is opened in a `with` block so that its file handle is closed as soon
# as the three arrays have been read into memory.
with np.load('data/reduced_val.npz') as validation:
    X_val = validation['X_val']
    y_val = validation['y_val']
    mask_val = validation['mask_val']
print(X_val.shape)
In [10]:
# Number of epochs
num_epochs = 120

# File that receives the parameters of the best epoch; create its folder up front so
# that the first np.savez call cannot fail because the directory is missing
best_params_path = 'params/CNN-LSTM-Attention_params.npz'
best_params_dir = os.path.dirname(best_params_path)
if best_params_dir and not os.path.isdir(best_params_dir):
    os.makedirs(best_params_dir)

# Cast the data sets once instead of on every epoch (the casts do not depend on the epoch).
# NOTE(review): assumes iterate_minibatches does not modify its inputs in place -- confirm in utils.
X_train_f32 = X_train.astype(np.float32)
y_train_i32 = y_train.astype(np.int32)
mask_train_f32 = mask_train.astype(np.float32)
X_val_f32 = X_val.astype(np.float32)
y_val_i32 = y_val.astype(np.int32)
mask_val_f32 = mask_val.astype(np.float32)

# Lists to save loss and accuracy of each epoch
loss_training = []
loss_validation = []
acc_training = []
acc_validation = []

start_time = time.time()
min_val_loss = float("inf")

# Start training
for epoch in range(num_epochs):
    # Full pass training set
    train_err = 0
    train_batches = 0
    confusion_train = ConfusionMatrix(n_class)

    # Generate minibatches and train on each one of them
    for batch in iterate_minibatches(X_train_f32, y_train_i32, mask_train_f32,
                                     batch_size, shuffle=True):
        # Inputs to the network
        inputs, targets, in_masks = batch
        # Calculate loss and prediction (this call also updates the weights)
        tr_err, predict = train_fn(inputs, targets, in_masks)
        train_err += tr_err
        train_batches += 1
        # Get the predicted class, the one with the maximum likelihood
        preds = np.argmax(predict, axis=-1)
        confusion_train.batch_add(targets, preds)

    # Average loss and accuracy
    train_loss = train_err / train_batches
    train_accuracy = confusion_train.accuracy()
    cf_train = confusion_train.ret_mat()

    # Full pass validation set
    val_err = 0
    val_batches = 0
    confusion_valid = ConfusionMatrix(n_class)

    # Generate minibatches and validate on each one of them, same procedure as before.
    # `targets` and `alphas` of the last minibatch stay available after the loop and
    # are used by the attention plot further down.
    for batch in iterate_minibatches(X_val_f32, y_val_i32, mask_val_f32,
                                     batch_size, shuffle=True):
        inputs, targets, in_masks = batch
        err, predict_val, alphas = val_fn(inputs, targets, in_masks)
        val_err += err
        val_batches += 1
        preds = np.argmax(predict_val, axis=-1)
        confusion_valid.batch_add(targets, preds)

    # Stored under its own name: assigning to `val_loss` would overwrite the symbolic
    # validation loss that val_fn is compiled from
    epoch_val_loss = val_err / val_batches
    val_accuracy = confusion_valid.accuracy()
    cf_val = confusion_valid.ret_mat()

    loss_training.append(train_loss)
    loss_validation.append(epoch_val_loss)
    acc_training.append(train_accuracy)
    acc_validation.append(val_accuracy)

    # Save the model parameters at the epoch with the lowest validation loss
    if min_val_loss > epoch_val_loss:
        min_val_loss = epoch_val_loss
        np.savez(best_params_path, *lasagne.layers.get_all_param_values(l_out))

    print("Epoch {} of {} time elapsed {:.3f}s".format(epoch + 1, num_epochs, time.time() - start_time))
    print(" training loss:\t\t{:.6f}".format(train_loss))
    print(" validation loss:\t\t{:.6f}".format(epoch_val_loss))
    print(" training accuracy:\t\t{:.2f} %".format(train_accuracy * 100))
    print(" validation accuracy:\t\t{:.2f} %".format(val_accuracy * 100))
In [11]:
# Report the lowest validation loss reached during training
best_loss_message = "Minimum validation loss: {:.6f}".format(min_val_loss)
print(best_loss_message)
In [12]:
# Training and validation loss per epoch.
# The x axis is derived from the recorded history rather than from num_epochs, so the
# plot also works when training was stopped before the last epoch.
x_axis = range(len(loss_training))
plt.figure(figsize=(8,6))
plt.plot(x_axis,loss_training)
plt.plot(x_axis,loss_validation)
plt.xlabel('Epoch')
plt.ylabel('Error')
plt.legend(('Training','Validation'));
In [13]:
# Training and validation accuracy per epoch (x_axis comes from the loss plot cell)
plt.figure(figsize=(8,6))
for accuracy_curve in (acc_training, acc_validation):
    plt.plot(x_axis, accuracy_curve)
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(('Training','Validation'));
In [14]:
# Plot the confusion matrix of the validation set (cf_val comes from the training loop)
# Code based on http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
classes = ['Nucleus','Cytoplasm','Extracellular','Mitochondrion','Cell membrane','ER',
           'Chloroplast','Golgi apparatus','Lysosome','Vacuole']
tick_marks = np.arange(n_class)
cmap = plt.cm.Blues

plt.figure(figsize=(8,8))
plt.imshow(cf_val, interpolation='nearest', cmap=cmap)
plt.title('Confusion matrix validation set')
plt.colorbar()
plt.xticks(tick_marks, classes, rotation=60)
plt.yticks(tick_marks, classes)

# Write every count inside its cell; cells above half of the maximum get white text
# so the number stays readable
thresh = cf_val.max() / 2.
n_rows, n_cols = cf_val.shape[0], cf_val.shape[1]
for i in range(n_rows):
    for j in range(n_cols):
        cell_value = cf_val[i, j]
        if cell_value > thresh:
            text_color = "white"
        else:
            text_color = "black"
        plt.text(j, i, cell_value, horizontalalignment="center", color=text_color)

plt.tight_layout()
plt.ylabel('True location')
plt.xlabel('Predicted location');
In [15]:
# Plot the `alphas` returned by val_fn next to the labels of the same minibatch.
# `targets` and `alphas` are the values left by the last validation minibatch of the
# training loop; `classes` comes from the confusion matrix cell.
sort_ind = np.argsort(targets)
# Take index 1 along axis 1 of alphas and order the rows by label
# (presumably axis 1 indexes the decoding steps -- confirm in utils)
alphas_1 = alphas[:,1,:][sort_ind]
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,15));
# Sorted labels drawn as a one-column image; reshape(-1, 1) adapts to the number of
# samples in the minibatch instead of assuming 128
labels_plot = ax1.imshow(targets[sort_ind].reshape(-1,1),cmap=plt.get_cmap('Set1'))
ax1.set_aspect(0.3)
ax1.set_axis_off()
cb = plt.colorbar(labels_plot)
# One colorbar tick per class, taken from n_class instead of a literal 10
labels = np.arange(0,n_class,1)
loc = labels + .5
cb.set_ticks(loc)
cb.set_ticklabels(classes)
att_plot = ax2.imshow(alphas_1, aspect='auto')
ax2.yaxis.set_visible(False)
plt.tight_layout(pad=25, w_pad=0.5, h_pad=1.0)
In [ ]: