In [1]:
import pandas as pd
import numpy as np
import scipy
import scipy.stats
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten, Dropout
from keras.layers.convolutional import Convolution2D
from keras.optimizers import Adam
import keras
import theano
import random
import os
import pickle
# plotting
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
keras.__version__
Out[2]:
In [3]:
theano.__version__
Out[3]:
In [4]:
plt.rcParams["patch.force_edgecolor"] = True
sns.set_style('whitegrid',
{'axes.grid': True,
'grid.linestyle': u'--',
'axes.edgecolor': '0.1',
'axes.labelcolor': '0',
'axes.labelsize': 15,
'axes.titlesize': 15,
'legend.fontsize': 15,
'xtick.labelsize': 15,
'ytick.labelsize': 15,
})
In [5]:
model_name = 'Random_UTR_CNN'
model_params_dir = '../Results/{0}.Hyperparam.Opt/'.format(model_name)
In [6]:
results_dir = '../Results/{0}.ModelPredictions/'.format(model_name)
if not os.path.exists(model_params_dir):
os.mkdir(model_params_dir)
In [7]:
data_dir = '../Data/'
native_data = pd.read_csv(data_dir + 'Native_UTRs.csv', index_col = 0)
i.e. we're converting the sequences from being represented as a 50 character string of bases to a 4x50 matrix of 1's and 0's, with each row corresponding to a base and every column a position in the UTR.
Note that we're doing the indexing a little differently than in Notebook 1 -- see comments in function
In [8]:
# one hot encoding of UTRs
# X = one hot encoding matrix
# Y = growth rates
def one_hot_encoding(df, seq_column, expression):
bases = ['A','C','G','T']
base_dict = dict(zip(bases,range(4))) # {'A' : 0, 'C' : 1, 'G' : 2, 'T' : 3}
n = len(df)
# length of the UTR sequence
# we also add 10 empty spaces to either side
total_width = df[seq_column].str.len().max() + 20
# initialize an empty numpy ndarray of the appropriate size
X = np.zeros((n, 1, 4, total_width))
# an array with the sequences that we will one-hot encode
seqs = df[seq_column].values
# loop through the array of sequences to create an array that keras will actually read
for i in range(n):
seq = seqs[i]
# loop through each individual sequence, from the 5' to 3' end
for b in range(len(seq)):
# this will assign a 1 to the appropriate base and position for this UTR sequence
# Note that this is different than the same function in Notebook #1 (since we're dealing
# with sequences with nonuniform lengths)
X[i, 0, base_dict[seq[b]], b + 10 + 50 - len(seq)] = 1.
# keep track of where we are
if (i%10000)==0:
print i,
X = X.astype(theano.config.floatX)
Y = np.asarray(df[expression].values,
dtype = theano.config.floatX)[:, np.newaxis]
return X, Y, total_width
In [9]:
X, Y, total_width = one_hot_encoding(native_data, 'UTR', 'growth_rate')
If we have more reads for a given UTR at the outset, we can be more confident that we have made an accurate measurement. For this reason, we use those UTRs with the most reads to test our model on, because these should have the least experimental noise.
In [10]:
# a numpy array of the indexes of UTRs with > 100 reads
test_inds = native_data.loc[native_data.t0 > 100].index.values
In [11]:
!ls {model_params_dir}
In [12]:
model = keras.models.model_from_json(open(model_params_dir + 'model_arch.json').read())
model.load_weights(model_params_dir + 'model_weights.hdf5')
model.compile(loss='mean_squared_error', optimizer='adam')
In [13]:
Y_pred = model.predict(X, verbose=1)
In [14]:
# data
x = Y_pred[test_inds].flatten()
y = Y[test_inds].flatten()
# calculate R^2
r2 = scipy.stats.pearsonr(x, y)[0]**2
g = sns.jointplot(x,
y,
stat_func = None,
kind = 'scatter',
s = 5,
alpha = 0.25,
size = 5)
g.ax_joint.set_xlabel('Predicted log$_2$ Growth Rate')
g.ax_joint.set_ylabel('Measured log$_2$ Growth Rate')
text = "R$^2$ = {:0.2}".format(r2)
plt.annotate(text, xy=(-5.5, 0.95), xycoords='axes fraction')
plt.title("CNN predictions of native 5' UTR HIS3 data", x = -3, y = 1.25)
Out[14]:
In [15]:
native_data['pred_growth_rate'] = Y_pred
In [ ]:
native_data.to_csv(results_dir + 'Random_UTRs_with_predictions.csv')