In [1]:
import pandas as pd
import numpy as np
import scipy
import scipy.stats
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten, Dropout
from keras.layers.convolutional import Convolution2D
from keras.optimizers import Adam
import keras
import theano
import random
import os
import pickle

# plotting
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


Using Theano backend.

In [2]:
keras.__version__


Out[2]:
'1.2.2'

In [3]:
theano.__version__


Out[3]:
'0.9.0'

Set plotting style


In [4]:
plt.rcParams["patch.force_edgecolor"] = True
sns.set_style('whitegrid',
              {'axes.grid': True,
               'grid.linestyle': u'--',
               'axes.edgecolor': '0.1',
               'axes.labelcolor': '0',
               'axes.labelsize': 15,
               'axes.titlesize': 15,
               'legend.fontsize': 15,
               'xtick.labelsize': 15,
               'ytick.labelsize': 15,
               })

The directory that contains the saved model architecture, weights, and hyperparameter optimization results:


In [5]:
model_name = 'Random_UTR_CNN'
model_params_dir = '../Results/{0}.Hyperparam.Opt/'.format(model_name)

Create a directory to save results (if it doesn't already exist):


In [6]:
results_dir = '../Results/{0}.ModelPredictions/'.format(model_name)

if not os.path.exists(results_dir):
    os.mkdir(results_dir)

Load the cleaned-up data.

The CSV is comma-separated (the pd.read_csv default). The read counts are log2-transformed.


In [7]:
data_dir = '../Data/'

native_data = pd.read_csv(data_dir + 'Native_UTRs.csv', index_col = 0)
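
A quick peek at the table before encoding (an optional step, not in the original notebook; the column names 'UTR', 'growth_rate', and 't0' are the ones used in the cells below):

print native_data.shape
native_data.head()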

One-hot encoding of the sequences.

That is, we convert each sequence from a string of up to 50 bases into a 4x70 matrix of 1's and 0's (the 50-nt maximum length plus 10 positions of padding on either side), with each row corresponding to a base and each column to a position in the UTR. Because the native UTRs vary in length, each sequence is right-aligned within the matrix.

Note that the indexing is handled a little differently than in Notebook 1 -- see the comments in the function.


In [8]:
# one hot encoding of UTRs
# X = one hot encoding matrix
# Y = growth rates

def one_hot_encoding(df, seq_column, expression):

    bases = ['A','C','G','T']
    base_dict = dict(zip(bases, range(4))) # {'A' : 0, 'C' : 1, 'G' : 2, 'T' : 3}

    n = len(df)

    # width of the one-hot matrix: the length of the longest UTR
    # (50 nt here), plus 10 empty positions on either side
    total_width = df[seq_column].str.len().max() + 20

    # initialize an empty numpy ndarray of the appropriate size
    X = np.zeros((n, 1, 4, total_width))

    # an array of the sequences that we will one-hot encode
    seqs = df[seq_column].values

    # loop through the array of sequences to create an array that keras will actually read
    for i in range(n):
        seq = seqs[i]

        # loop through each individual sequence, from the 5' to 3' end
        for b in range(len(seq)):
            # this assigns a 1 to the appropriate base and position for this UTR sequence.
            # Note that this differs from the same function in Notebook #1: because the
            # native UTRs have nonuniform lengths, each sequence is right-aligned so that
            # its 3' end sits at column total_width - 10 (equivalent to the original's
            # hard-coded b + 10 + 50 - len(seq), since the longest UTR is 50 nt)
            X[i, 0, base_dict[seq[b]], b + (total_width - 10) - len(seq)] = 1.

        # keep track of where we are
        if (i % 10000) == 0:
            print i,

    X = X.astype(theano.config.floatX)
    Y = np.asarray(df[expression].values,
                   dtype = theano.config.floatX)[:, np.newaxis]

    return X, Y, total_width
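
As a quick sanity check (not part of the original notebook; the toy DataFrame below is hypothetical), we can encode two short sequences and confirm the right-alignment:

toy = pd.DataFrame({'UTR': ['ACGTACGTAC', 'ACGT'], 'growth_rate': [0., 0.]})
X_toy, Y_toy, w_toy = one_hot_encoding(toy, 'UTR', 'growth_rate')
print X_toy.shape            # (2, 1, 4, 30): max length 10, plus 10 padding on each side
print X_toy[1, 0, :, 16:20]  # 'ACGT' occupies columns 16-19, flush against column w_toy - 10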

In [9]:
X, Y, total_width = one_hot_encoding(native_data, 'UTR', 'growth_rate')


0 10000

Record the indexes of UTRs with >100 reads in the input sample

The more reads we have for a given UTR at the outset, the more confident we can be that its growth rate was measured accurately. We therefore test the model on the UTRs with the most reads, since these should carry the least experimental noise.


In [10]:
# a numpy array of the indexes of UTRs with > 100 reads
test_inds = native_data.loc[native_data.t0 > 100].index.values
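
As an optional check (this line is not in the original notebook), we can see how large this high-confidence test set is:

print '%d of %d native UTRs have > 100 input (t0) reads' % (len(test_inds), len(native_data))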

Load model trained on random 5' UTRs


In [11]:
!ls {model_params_dir}


hyperparam_test.pkl        model_weights.h5
hyperparam_test_binary.pkl model_weights.hdf5
model_arch.json

In [12]:
model = keras.models.model_from_json(open(model_params_dir + 'model_arch.json').read())
model.load_weights(model_params_dir + 'model_weights.hdf5')
model.compile(loss='mean_squared_error', optimizer='adam')
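
To confirm the restored architecture is what we expect, Keras can print a layer-by-layer summary (an optional step, not in the original notebook):

model.summary()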

In [13]:
Y_pred = model.predict(X, verbose=1)


11840/11856 [============================>.] - ETA: 0s
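
Before plotting, a quick shape check (a hypothetical line, not in the original notebook) confirms that predictions align one-to-one with measurements:

print Y_pred.shape, Y.shape   # both should be (n, 1)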

Plot results


In [14]:
# data
x = Y_pred[test_inds].flatten()
y = Y[test_inds].flatten()

# calculate R^2
r2 = scipy.stats.pearsonr(x, y)[0]**2


g = sns.jointplot(x,
                  y,
                  stat_func = None,
                  kind = 'scatter',
                  s = 5,
                  alpha = 0.25,
                  size = 5)

g.ax_joint.set_xlabel('Predicted log$_2$ Growth Rate')
g.ax_joint.set_ylabel('Measured log$_2$ Growth Rate')


text = "R$^2$ = {:0.2}".format(r2)
plt.annotate(text, xy=(-5.5, 0.95), xycoords='axes fraction')

plt.title("CNN predictions of native 5' UTR HIS3 data", x = -3, y = 1.25)


Out[14]:
<matplotlib.text.Text at 0x1195654d0>

Save data and predictions to csv


In [15]:
native_data['pred_growth_rate'] = Y_pred.flatten()

In [ ]:
native_data.to_csv(results_dir + 'Native_UTRs_with_predictions.csv')