In [1]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten, Dropout
from keras.layers.convolutional import Convolution2D
from keras.optimizers import Adam
import keras
import theano
import os


Using Theano backend.

Create directories to save results:

(if it doesn't already exist)


In [2]:
results_dir = '../Results/EvolvedUTRs/'

if not os.path.exists(results_dir):
    os.mkdir(results_dir)


fig_dir = '../Figures/EvolvedUTRs/'

if not os.path.exists(fig_dir):
    os.mkdir(fig_dir)

Load the model

The directory that contains information about the model parameters


In [3]:
model_name = 'Random_UTR_CNN'
model_params_dir = '../Results/{0}.Hyperparam.Opt/'.format(model_name)

Load model


In [4]:
model = keras.models.model_from_json(open(model_params_dir + 'model_arch.json').read())
model.load_weights(model_params_dir + 'model_weights.hdf5')
model.compile(loss='mean_squared_error', optimizer='adam')

Functions for performing simple evolution

For the sake of simplicity, we've only made point mutations -- i.e no sequence shuffling, deletions, insertions, etc.

Handy global variables:


In [5]:
bases = ['A','C','G','T']
base_dict = dict(zip(bases, range(4)))

One hot encoding of sequences:


In [6]:
def one_hot_encoding(seqs):
    n = len(seqs)
    total_width = 70
    X = np.zeros((n, 1, 4, total_width))
    
    for i in range(n):
        seq = seqs[i]
        
        for b in range(len(seq)):
            X[i, 0, base_dict[seq[b]], 10 + 50 - len(seq) + b] = 1.
            
    return X.astype(theano.config.floatX)

Make mutations in sequence:


In [7]:
def make_mutation(sequences, pos, base):
    
    return [seq[:pos] + base + seq[pos + 1:] if pos < len(seq) else seq for seq in sequences]

Evolve sequences:

Here we make all possible mutations to a given sequence, ask the model to score each, and take the best one. We do this for 40 rounds.


In [8]:
def evolve_seqs(seqs, rounds = 5):
    all_seqs = {}
    all_scores = {}
    all_seqs[0] = seqs
    all_scores[0] = model.predict(one_hot_encoding(seqs)).flatten()
    
    # march through every round of 'evolution'
    for i in range(rounds):
        Y_mut = {}
        
        # loop through the four bases
        for base in bases:
            
            # we'll try each base at each position of the sequences
            for position in range(50):
                mut_seqs = make_mutation(seqs, position, base)                      # make mutated sequence 
                X_mut = one_hot_encoding(mut_seqs)                                  # one-hot-encoding of mutated sequence
                Y_mut[base + '_' + str(position)] = model.predict(X_mut).flatten()  # ask the model what it thinks
        
        # Note which mutations are the best performing for each UTR
        best_muts = pd.DataFrame(Y_mut).apply(np.argmax, axis = 1).values
        
        # Remake these mutations (an example value in best_muts looks would be "T_30", which tells us the nucleotide
        # and the position)
        seqs = [s[:int(m[2:])] + m[:1] + s[int(m[2:]) + 1:] for s, m in zip(seqs, best_muts)]
        
        all_seqs[i + 1] = seqs
        all_scores[i + 1] = model.predict(one_hot_encoding(seqs)).flatten()
        
        # keep track of where we are
        print i, pd.DataFrame(Y_mut).max(1).mean()
        
    return pd.DataFrame(all_seqs), pd.DataFrame(all_scores)

First, evolve native UTR sequences:

Load sequences


In [9]:
native_data = pd.read_csv('../Data/Native_UTRs.csv', index_col = 0)

Select UTRs to evolve

We only want 100, so from those that we are most confident in (i.e. those with >100 reads at time point 0), we choose a range of expression levels.


In [10]:
native_data = native_data[native_data.t0 > 100]

Since we have quite a lot of UTRs still, we also choose to focus on sequences that come from 5' UTR "chunks" that originate from proximal to the start codon. Recall that to test native 5' UTR sequences in our HIS3 growth assay, we broke the sequences into "chunks" of 50 nucleotides or less. Thus, some of the segments originate from sequence adjacent to the start codon, and others come from regions further upstream.


In [11]:
native_data = native_data.loc[native_data['UTR_name'].apply(lambda s: s.split(':')[-1]) == '0']

Select UTRs with a range of expressions -- we don't only want to try to improve the worst or the best ones

We do this by first making an array of 100 desired growth rates evenly spaced between the lowest and highest actual growth rates. To find a UTR that is closest to one of the desired growth rates, we take the absolute value of the difference between the UTR's growth rate and the desired growth rate. Then we take the UTR with the value closest to zero (i.e. it's growth rate is the closest to the desired rate).


In [12]:
# define the range of growth rates
desired_growth_rates = np.linspace(native_data.growth_rate.min(),
                                   native_data.growth_rate.max(),
                                   106)

selected_native_UTRs = []

for i in range(len(desired_growth_rates)):
    
    # with each step we select the UTR with the growth rate closest to the desired growth rate
    selected_native_UTRs.append(abs(native_data.growth_rate - desired_growth_rates[i]).argmin())
    
# just take the unique ones
selected_native_UTRs = pd.Series(np.unique(selected_native_UTRs))

In [13]:
selected_native_UTRs_data = native_data.loc[selected_native_UTRs]

Actually perform the evolutions


In [14]:
# make an array of the UTRs that we want to evolve
seqs = selected_native_UTRs_data.UTR.values

# run them through our evolve_seqs function for 40 rounds
mut_native_seqs = evolve_seqs(seqs, rounds = 40)


0 1.38682425022

Organize the data

reshape and combine the dataframes with the sequences and the predicted growth rates


In [15]:
# I apologize in advance, this is pretty unreadable

# reshape the dataframes. Transpose, then convert into a Multiindex, then convert the index into new columns
seqs_df = mut_native_seqs[0].T.unstack().reset_index()
scores_df = mut_native_seqs[1].T.unstack().reset_index()

# rename the columns so that the names are somewhat descriptive
seqs_df.rename(columns = {'level_0' : 'UTR_id', 'level_1' : 'Round', 0 : 'Evolved_Seq'}, inplace = True)
scores_df.rename(columns = {'level_0' : 'UTR_id', 'level_1' : 'Round', 0 : 'Pred_Growth'}, inplace = True)

# merge two datasets, only include columns that aren't redundant
cols_to_use = scores_df.columns.difference(seqs_df.columns) # identify non-redundant columns
seqs_scores = pd.merge(seqs_df, scores_df[cols_to_use], right_index = True, left_index = True)

add Gene/UTR name information


In [16]:
index_to_name = dict(zip(range(100), selected_native_UTRs_data.UTR_name.values))

seqs_scores['UTR_name'] = seqs_scores['UTR_id'].map(index_to_name)

add input sequence


In [17]:
index_to_input_seq = dict(zip(range(100), seqs))

seqs_scores['Starting_Seq'] = seqs_scores['UTR_id'].map(index_to_input_seq)

add initial measured growth rate


In [18]:
index_to_measured_growth = dict(zip(range(100), selected_native_UTRs_data.growth_rate.values))

seqs_scores['Round0_Measured_Growth'] = seqs_scores['UTR_id'].map(index_to_measured_growth)

Save data


In [19]:
seqs_scores.to_csv(results_dir + 'Native_Evolved_Seqs.csv')

__ ___ __

Make Super UTRs from random library:

Here, we just go through all the same steps we just performed with the native library

Load data


In [20]:
random_data = pd.read_csv('../Data/Random_UTRs.csv', index_col=0)

As with the native library, we just want the UTRs that we're most confident about


In [21]:
random_data = random_data[random_data.t0 > 100]

In [22]:
# define the range of growth rates
desired_growth_rates = np.linspace(random_data.growth_rate.min(),
                                   random_data.growth_rate.max(),
                                   100) # pay attention to this last number, we're choosing it just based on how
                                        # many sequences we get out at the end. As you may have noticed, we chose
                                        # 106 in the case of the native UTRs

selected_random_UTRs = []

for i in range(len(desired_growth_rates)):
    
    # with each step we select the UTR with the growth rate closest to the desired growth rate
    selected_random_UTRs.append(abs(random_data.growth_rate - desired_growth_rates[i]).argmin())
    
# just take the unique ones
selected_random_UTRs = pd.Series(np.unique(selected_random_UTRs))

In [23]:
selected_random_UTRs_data = random_data.loc[selected_random_UTRs]

In [24]:
# make an array of the UTRs that we want to evolve
seqs = selected_random_UTRs_data.UTR.values

# run them through our evolve_seqs function for 40 rounds
mut_random_seqs = evolve_seqs(seqs, rounds = 40)


0 -0.234902635217

Organize the data

reshape and combine the dataframes with the sequences and the predicted growth rates


In [25]:
# reshape the dataframes. Transpose, then convert into a Multiindex, then convert the index into new columns
ran_seqs_df = mut_random_seqs[0].T.unstack().reset_index()
ran_scores_df = mut_random_seqs[1].T.unstack().reset_index()

# rename the columns so that the names are somewhat descriptive
ran_seqs_df.rename(columns = {'level_0' : 'UTR_id', 'level_1' : 'Round', 0 : 'Evolved_Seq'}, inplace = True)
ran_scores_df.rename(columns = {'level_0' : 'UTR_id', 'level_1' : 'Round', 0 : 'Pred_Growth'}, inplace = True)

# merge two datasets, only include columns that aren't redundant
cols_to_use = ran_scores_df.columns.difference(ran_seqs_df.columns) # identify non-redundant columns
ran_seqs_scores = pd.merge(ran_seqs_df, ran_scores_df[cols_to_use], right_index = True, left_index = True)

add input sequence


In [26]:
index_to_input_seq = dict(zip(range(100), seqs))

ran_seqs_scores['Starting_Seq'] = ran_seqs_scores['UTR_id'].map(index_to_input_seq)

add initial measured growth rate


In [27]:
index_to_measured_growth = dict(zip(range(100), selected_random_UTRs_data.growth_rate.values))

ran_seqs_scores['Round0_Measured_Growth'] = ran_seqs_scores['UTR_id'].map(index_to_measured_growth)

Save data


In [28]:
ran_seqs_scores.to_csv(results_dir + 'Random_Evolved_Seqs.csv')