In [1]:
import pandas as pd
import numpy as np
import scipy
import scipy.stats
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten, Dropout
from keras.layers.convolutional import Convolution2D
from keras.optimizers import Adam
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import keras
import theano
import random
import os
import pickle
# plotting
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
keras.__version__
Out[2]:
In [3]:
theano.__version__
Out[3]:
In [4]:
plt.rcParams["patch.force_edgecolor"] = True
sns.set_style('whitegrid',
              {'axes.grid': True,
               'grid.linestyle': u'--',
               'axes.edgecolor': '0.1',
               'axes.labelcolor': '0',
               'axes.labelsize': 15,
               'axes.titlesize': 15,
               'legend.fontsize': 15,
               'xtick.labelsize': 15,
               'ytick.labelsize': 15,
               })
In [ ]:
data_dir = '../Data/'
results_dir = '../Results/'
figures_dir = '../Figures/'
if not os.path.exists(data_dir):
    os.mkdir(data_dir)
if not os.path.exists(results_dir):
    os.mkdir(results_dir)
if not os.path.exists(figures_dir):
    os.mkdir(figures_dir)
In [5]:
model_name = 'Random_UTR_CNN'
model_params_dir = '../Results/{0}.Hyperparam.Opt/'.format(model_name)
if not os.path.exists(model_params_dir):
    os.mkdir(model_params_dir)
In [6]:
data = pd.read_csv(data_dir + 'Random_UTRs.csv')
In [7]:
# one-hot encoding of UTRs
# X = one-hot encoding matrix
# Y = growth rates
def one_hot_encoding(df, seq_column, expression):
    bases = ['A','C','G','T']
    base_dict = dict(zip(bases, range(4))) # {'A' : 0, 'C' : 1, 'G' : 2, 'T' : 3}
    n = len(df)
    # the length of the longest UTR sequence;
    # we also add 10 empty positions to either side
    total_width = df[seq_column].str.len().max() + 20
    # initialize an empty numpy ndarray of the appropriate size
    X = np.zeros((n, 1, 4, total_width))
    # an array of the sequences that we will one-hot encode
    seqs = df[seq_column].values
    # loop through the array of sequences to create an array that keras can read
    for i in range(n):
        seq = seqs[i]
        # loop through each individual sequence, from the 5' to the 3' end
        for b in range(len(seq)):
            # assign a 1 to the appropriate base and position for this UTR sequence,
            # centering the sequence within the padded window
            X[i, 0, base_dict[seq[b]], int(b + round((total_width - len(seq))/2.))] = 1.
        # keep track of where we are
        if (i % 10000) == 0:
            print i,
    X = X.astype(theano.config.floatX)
    Y = np.asarray(df[expression].values,
                   dtype = theano.config.floatX)[:, np.newaxis]
    return X, Y, total_width
In [8]:
X, Y, total_width = one_hot_encoding(data, 'UTR', 'growth_rate')
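As a quick sanity check (illustrative, not part of the original analysis), the encoding should have shape (n_UTRs, 1, 4, total_width), and the number of 1s in each encoded sequence should equal that sequence's length:
In [ ]:
# illustrative sanity check on the one-hot encoding
print X.shape, Y.shape, total_width
print X[0, 0].sum() == len(data['UTR'].values[0])  # expect: True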
If we have more reads for a given UTR at the outset, we can be more confident that we have made an accurate measurement. For this reason, we test our model on the UTRs with the most reads, since these should have the least experimental noise. We hold out the UTRs that fall in the top 5% of reads at the first time point as our test set.
In [9]:
# a numpy array of UTR indexes, sorted from fewest to most reads at the first time point
sorted_inds = data.sort_values('t0').index.values
train_inds = sorted_inds[:int(0.95*len(sorted_inds))] # 95% of the data as the training set
test_inds = sorted_inds[int(0.95*len(sorted_inds)):]  # the UTRs with the most reads at time point 0 as the test set
# shuffle the training set deterministically by passing a constant "random" function
# (random.shuffle's optional second argument must return a float in [0, 1))
seed = 0.5
random.shuffle(train_inds, lambda: seed)
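A quick check (illustrative, not in the original run) that the shuffle left the split well formed:
In [ ]:
# illustrative: the training and test indexes should not overlap
assert len(set(train_inds) & set(test_inds)) == 0
print len(train_inds), len(test_inds)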
Before training the model, we perform a hyperparameter search to narrow down which model architecture to use. Of course, we do a fair amount of narrowing ourselves by choosing which architectures are available to the search.
The dictionary 'hyperparams' holds the same values as the 'space' object; 'space' is simply the form that hyperopt can consume.
In [ ]:
hyperparams = {'conv_width' : [9, 13, 17, 25],
               'conv_filters' : [32, 64, 128, 256],
               'conv_layers' : [2, 3, 4],
               'dense_layers' : [1, 2],
               'conv_dropout' : [None, 0.15],
               'dense_dropout' : [None, 0.1, 0.25, 0.5],
               'dense_units' : [32, 64, 128, 256]}
In [ ]:
space = {'conv_width': hp.choice('conv_width', [9, 13, 17, 25]),
         'conv_filters': hp.choice('conv_filters', [32, 64, 128, 256]),
         'conv_layers': hp.choice('conv_layers', [2, 3, 4]),
         'dense_layers': hp.choice('dense_layers', [1, 2]),
         'conv_dropout': hp.choice('conv_dropout', [None, 0.15]),
         'dense_dropout': hp.choice('dense_dropout', [None, 0.1, 0.25, 0.5]),
         'dense_units': hp.choice('dense_units', [32, 64, 128, 256]),
         }
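To see what a single draw from this space looks like, hyperopt's sampling utility can generate one random configuration (illustrative, not part of the original search):
In [ ]:
# illustrative: draw one random configuration from the search space
from hyperopt.pyll.stochastic import sample
print sample(space)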
In [ ]:
def create_model(params):
    model = Sequential()
    model.add(Convolution2D(params['conv_filters'],
                            4,
                            params['conv_width'],
                            border_mode = 'valid',
                            input_shape = (1, 4, total_width),
                            activation = 'relu'))
    # add dropout after the first convolutional layer if appropriate
    if params['conv_dropout']:
        model.add(Dropout(p = params['conv_dropout']))
    # add the appropriate number of additional convolutional layers, along with dropout
    for i in range(params['conv_layers'] - 1):
        model.add(Convolution2D(params['conv_filters'],
                                1,
                                params['conv_width'],
                                border_mode = 'same',
                                activation = 'relu'))
        if params['conv_dropout']:
            model.add(Dropout(p = params['conv_dropout']))
    model.add(Flatten())
    # add the appropriate number of dense layers and dropout
    for i in range(params['dense_layers']):
        model.add(Dense(output_dim = params['dense_units'],
                        activation = 'relu'))
        if params['dense_dropout']:
            model.add(Dropout(p = params['dense_dropout']))
    # add the output layer; since we want to predict a single growth rate,
    # the output has a single dimension
    model.add(Dense(output_dim = 1))
    # compile the model
    model.compile(loss = 'mean_squared_error',
                  optimizer = 'adam',
                  metrics = ['mean_squared_error'])
    return model
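As an illustration (the parameter values below are arbitrary, not the optimized ones), we can build one candidate architecture and inspect its layers:
In [ ]:
# illustrative: build one candidate architecture with arbitrary hyperparameters
example_params = {'conv_width': 13, 'conv_filters': 64, 'conv_layers': 3,
                  'dense_layers': 1, 'conv_dropout': 0.15,
                  'dense_dropout': 0.25, 'dense_units': 128}
create_model(example_params).summary()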
In [ ]:
def f_nn(params):
    model = create_model(params)
    # stop training when the validation loss stops improving, to limit overfitting
    earlyStopping = keras.callbacks.EarlyStopping(monitor = 'val_loss',
                                                  patience = 1,
                                                  verbose = 0,
                                                  mode = 'auto')
    history = keras.callbacks.History()
    # keep track of how many configurations we have evaluated while this cell runs
    global n
    print "\n", n
    n += 1
    print params
    # fit the model
    # note that the data are not passed to this function; X, Y, and train_inds are
    # read from the enclosing scope
    model.fit(X[train_inds],
              Y[train_inds],
              validation_split = 0.2,
              callbacks = [earlyStopping, history],
              verbose = 0,
              nb_epoch = 100)
    print 'MSE:', earlyStopping.best
    return {'loss': earlyStopping.best, 'status': STATUS_OK}
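Before launching the full search, one could smoke-test the objective on a single sampled configuration (illustrative, not part of the original notebook; this trains a full model, so it is slow):
In [ ]:
# illustrative smoke test: evaluate the objective once on one random configuration
from hyperopt.pyll.stochastic import sample
n = 0
print f_nn(sample(space))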
In [ ]:
n = 0
trials = Trials()
best = fmin(f_nn, space,
            algo = tpe.suggest,
            max_evals = 50,
            trials = trials)
print 'best: '
print best
Pickle the results of the hyperparameter search,
In [ ]:
with open(model_params_dir + 'hyperparam_test.pkl', 'wb') as f:
    pickle.dump(trials.trials, f)
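To confirm that the pickle round-trips (illustrative), we can reload it and print the best losses from the search:
In [ ]:
# illustrative: reload the pickled trials and print the five lowest MSEs
with open(model_params_dir + 'hyperparam_test.pkl', 'rb') as f:
    saved_trials = pickle.load(f)
print sorted(t['result']['loss'] for t in saved_trials)[:5]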
and take a look at the winning architecture.
In [ ]:
# fmin returns the index hyperopt chose for each hp.choice parameter,
# so we map each index back through the hyperparams dictionary to get its value
opt_params = {}
for p in best:
    opt_params[p] = hyperparams[p][best[p]]
In [ ]:
opt_params
In [ ]:
model = create_model(opt_params)
In [ ]:
earlyStopping = keras.callbacks.EarlyStopping(monitor = 'val_loss',
                                              patience = 0,
                                              verbose = 0,
                                              mode = 'auto')
history = keras.callbacks.History()
# checkpoint the weights from the epoch with the best validation loss
modelcheckpoint = keras.callbacks.ModelCheckpoint(model_params_dir + 'model_weights.hdf5',
                                                  monitor = 'val_loss',
                                                  verbose = 0,
                                                  save_best_only = True,
                                                  mode = 'auto')
In [ ]:
model.fit(X[train_inds],
          Y[train_inds],
          validation_split = 0.1,
          callbacks = [earlyStopping,
                       history,
                       modelcheckpoint],
          verbose = 1,
          nb_epoch = 100)
In [ ]:
json_string = model.to_json()
open(model_params_dir + 'model_arch.json', 'w').write(json_string)
# save the final weights as well; the best (checkpointed) weights are already in model_weights.hdf5
model.save_weights(model_params_dir + 'model_weights.h5')
In [ ]:
Y_pred = model.predict(X,verbose=1)
In [ ]:
# R^2 value for our predictions on the training set
print scipy.stats.pearsonr(Y[train_inds].flatten(),
                           Y_pred[train_inds].flatten())[0]**2
# and on the test set
print scipy.stats.pearsonr(Y[test_inds].flatten(),
                           Y_pred[test_inds].flatten())[0]**2
In [10]:
!ls {model_params_dir}
In [11]:
model = keras.models.model_from_json(open(model_params_dir + 'model_arch.json').read())
# load the checkpointed weights from the epoch with the best validation loss
model.load_weights(model_params_dir + 'model_weights.hdf5')
model.compile(loss = 'mean_squared_error', optimizer = 'adam')
In [12]:
Y_pred = model.predict(X, verbose=1)
In [13]:
# R^2 value for our predictions on the training set
print scipy.stats.pearsonr(Y[train_inds].flatten(),
                           Y_pred[train_inds].flatten())[0]**2
# and on the test set
print scipy.stats.pearsonr(Y[test_inds].flatten(),
                           Y_pred[test_inds].flatten())[0]**2
In [14]:
# data
x = Y_pred[test_inds].flatten()
y = Y[test_inds].flatten()
# calculate R^2
r2 = scipy.stats.pearsonr(x, y)[0]**2
g = sns.jointplot(x,
                  y,
                  stat_func = None,
                  kind = 'scatter',
                  s = 5,
                  alpha = 0.1,
                  size = 5)
g.ax_joint.set_xlabel('Predicted log$_2$ Growth Rate')
g.ax_joint.set_ylabel('Measured log$_2$ Growth Rate')
text = "R$^2$ = {:0.2}".format(r2)
plt.annotate(text, xy=(-5.5, 0.95), xycoords='axes fraction')
plt.title("CNN predictions vs. test set", x = -3, y = 1.25)
Out[14]: [jointplot: predicted vs. measured log2 growth rate on the test set, annotated with R^2]
In [ ]:
data['pred_growth_rate'] = Y_pred
In [ ]:
data.to_csv(data_dir + 'Random_UTRs_with_predictions.csv')
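As a final check (illustrative), the saved table can be read back to confirm the predictions were written alongside the measurements:
In [ ]:
# illustrative: read the saved predictions back in
check = pd.read_csv(data_dir + 'Random_UTRs_with_predictions.csv', index_col = 0)
print check[['growth_rate', 'pred_growth_rate']].head()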