In [1]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten, Dropout
from keras.layers.convolutional import Convolution2D
from keras.optimizers import Adam
import keras
import theano
import os
In [2]:
results_dir = '../Results/EvolvedUTRs/'
if not os.path.exists(results_dir):
os.mkdir(results_dir)
fig_dir = '../Figures/EvolvedUTRs/'
if not os.path.exists(fig_dir):
os.mkdir(fig_dir)
In [3]:
model_name = 'Random_UTR_CNN'
model_params_dir = '../Results/{0}.Hyperparam.Opt/'.format(model_name)
Load model
In [4]:
model = keras.models.model_from_json(open(model_params_dir + 'model_arch.json').read())
model.load_weights(model_params_dir + 'model_weights.hdf5')
model.compile(loss='mean_squared_error', optimizer='adam')
Handy global variables:
In [5]:
bases = ['A','C','G','T']
base_dict = dict(zip(bases, range(4)))
One hot encoding of sequences:
In [6]:
def one_hot_encoding(seqs):
n = len(seqs)
total_width = 70
X = np.zeros((n, 1, 4, total_width))
for i in range(n):
seq = seqs[i]
for b in range(len(seq)):
X[i, 0, base_dict[seq[b]], 10 + 50 - len(seq) + b] = 1.
return X.astype(theano.config.floatX)
Make mutations in sequence:
In [7]:
def make_mutation(sequences, pos, base):
return [seq[:pos] + base + seq[pos + 1:] if pos < len(seq) else seq for seq in sequences]
Evolve sequences:
Here we make all possible mutations to a given sequence, ask the model to score each, and take the best one. We do this for 40 rounds.
In [8]:
def evolve_seqs(seqs, rounds = 5):
all_seqs = {}
all_scores = {}
all_seqs[0] = seqs
all_scores[0] = model.predict(one_hot_encoding(seqs)).flatten()
# march through every round of 'evolution'
for i in range(rounds):
Y_mut = {}
# loop through the four bases
for base in bases:
# we'll try each base at each position of the sequences
for position in range(50):
mut_seqs = make_mutation(seqs, position, base) # make mutated sequence
X_mut = one_hot_encoding(mut_seqs) # one-hot-encoding of mutated sequence
Y_mut[base + '_' + str(position)] = model.predict(X_mut).flatten() # ask the model what it thinks
# Note which mutations are the best performing for each UTR
best_muts = pd.DataFrame(Y_mut).apply(np.argmax, axis = 1).values
# Remake these mutations (an example value in best_muts looks would be "T_30", which tells us the nucleotide
# and the position)
seqs = [s[:int(m[2:])] + m[:1] + s[int(m[2:]) + 1:] for s, m in zip(seqs, best_muts)]
all_seqs[i + 1] = seqs
all_scores[i + 1] = model.predict(one_hot_encoding(seqs)).flatten()
# keep track of where we are
print i, pd.DataFrame(Y_mut).max(1).mean()
return pd.DataFrame(all_seqs), pd.DataFrame(all_scores)
In [9]:
native_data = pd.read_csv('../Data/Native_UTRs.csv', index_col = 0)
In [10]:
native_data = native_data[native_data.t0 > 100]
Since we have quite a lot of UTRs still, we also choose to focus on sequences that come from 5' UTR "chunks" that originate from proximal to the start codon. Recall that to test native 5' UTR sequences in our HIS3 growth assay, we broke the sequences into "chunks" of 50 nucleotides or less. Thus, some of the segments originate from sequence adjacent to the start codon, and others come from regions further upstream.
In [11]:
native_data = native_data.loc[native_data['UTR_name'].apply(lambda s: s.split(':')[-1]) == '0']
Select UTRs with a range of expressions -- we don't only want to try to improve the worst or the best ones
We do this by first making an array of 100 desired growth rates evenly spaced between the lowest and highest actual growth rates. To find a UTR that is closest to one of the desired growth rates, we take the absolute value of the difference between the UTR's growth rate and the desired growth rate. Then we take the UTR with the value closest to zero (i.e. it's growth rate is the closest to the desired rate).
In [12]:
# define the range of growth rates
desired_growth_rates = np.linspace(native_data.growth_rate.min(),
native_data.growth_rate.max(),
106)
selected_native_UTRs = []
for i in range(len(desired_growth_rates)):
# with each step we select the UTR with the growth rate closest to the desired growth rate
selected_native_UTRs.append(abs(native_data.growth_rate - desired_growth_rates[i]).argmin())
# just take the unique ones
selected_native_UTRs = pd.Series(np.unique(selected_native_UTRs))
In [13]:
selected_native_UTRs_data = native_data.loc[selected_native_UTRs]
In [14]:
# make an array of the UTRs that we want to evolve
seqs = selected_native_UTRs_data.UTR.values
# run them through our evolve_seqs function for 40 rounds
mut_native_seqs = evolve_seqs(seqs, rounds = 40)
reshape and combine the dataframes with the sequences and the predicted growth rates
In [15]:
# I apologize in advance, this is pretty unreadable
# reshape the dataframes. Transpose, then convert into a Multiindex, then convert the index into new columns
seqs_df = mut_native_seqs[0].T.unstack().reset_index()
scores_df = mut_native_seqs[1].T.unstack().reset_index()
# rename the columns so that the names are somewhat descriptive
seqs_df.rename(columns = {'level_0' : 'UTR_id', 'level_1' : 'Round', 0 : 'Evolved_Seq'}, inplace = True)
scores_df.rename(columns = {'level_0' : 'UTR_id', 'level_1' : 'Round', 0 : 'Pred_Growth'}, inplace = True)
# merge two datasets, only include columns that aren't redundant
cols_to_use = scores_df.columns.difference(seqs_df.columns) # identify non-redundant columns
seqs_scores = pd.merge(seqs_df, scores_df[cols_to_use], right_index = True, left_index = True)
add Gene/UTR name information
In [16]:
index_to_name = dict(zip(range(100), selected_native_UTRs_data.UTR_name.values))
seqs_scores['UTR_name'] = seqs_scores['UTR_id'].map(index_to_name)
add input sequence
In [17]:
index_to_input_seq = dict(zip(range(100), seqs))
seqs_scores['Starting_Seq'] = seqs_scores['UTR_id'].map(index_to_input_seq)
add initial measured growth rate
In [18]:
index_to_measured_growth = dict(zip(range(100), selected_native_UTRs_data.growth_rate.values))
seqs_scores['Round0_Measured_Growth'] = seqs_scores['UTR_id'].map(index_to_measured_growth)
In [19]:
seqs_scores.to_csv(results_dir + 'Native_Evolved_Seqs.csv')
In [20]:
random_data = pd.read_csv('../Data/Random_UTRs.csv', index_col=0)
In [21]:
random_data = random_data[random_data.t0 > 100]
In [22]:
# define the range of growth rates
desired_growth_rates = np.linspace(random_data.growth_rate.min(),
random_data.growth_rate.max(),
100) # pay attention to this last number, we're choosing it just based on how
# many sequences we get out at the end. As you may have noticed, we chose
# 106 in the case of the native UTRs
selected_random_UTRs = []
for i in range(len(desired_growth_rates)):
# with each step we select the UTR with the growth rate closest to the desired growth rate
selected_random_UTRs.append(abs(random_data.growth_rate - desired_growth_rates[i]).argmin())
# just take the unique ones
selected_random_UTRs = pd.Series(np.unique(selected_random_UTRs))
In [23]:
selected_random_UTRs_data = random_data.loc[selected_random_UTRs]
In [24]:
# make an array of the UTRs that we want to evolve
seqs = selected_random_UTRs_data.UTR.values
# run them through our evolve_seqs function for 40 rounds
mut_random_seqs = evolve_seqs(seqs, rounds = 40)
reshape and combine the dataframes with the sequences and the predicted growth rates
In [25]:
# reshape the dataframes. Transpose, then convert into a Multiindex, then convert the index into new columns
ran_seqs_df = mut_random_seqs[0].T.unstack().reset_index()
ran_scores_df = mut_random_seqs[1].T.unstack().reset_index()
# rename the columns so that the names are somewhat descriptive
ran_seqs_df.rename(columns = {'level_0' : 'UTR_id', 'level_1' : 'Round', 0 : 'Evolved_Seq'}, inplace = True)
ran_scores_df.rename(columns = {'level_0' : 'UTR_id', 'level_1' : 'Round', 0 : 'Pred_Growth'}, inplace = True)
# merge two datasets, only include columns that aren't redundant
cols_to_use = ran_scores_df.columns.difference(ran_seqs_df.columns) # identify non-redundant columns
ran_seqs_scores = pd.merge(ran_seqs_df, ran_scores_df[cols_to_use], right_index = True, left_index = True)
add input sequence
In [26]:
index_to_input_seq = dict(zip(range(100), seqs))
ran_seqs_scores['Starting_Seq'] = ran_seqs_scores['UTR_id'].map(index_to_input_seq)
add initial measured growth rate
In [27]:
index_to_measured_growth = dict(zip(range(100), selected_random_UTRs_data.growth_rate.values))
ran_seqs_scores['Round0_Measured_Growth'] = ran_seqs_scores['UTR_id'].map(index_to_measured_growth)
In [28]:
ran_seqs_scores.to_csv(results_dir + 'Random_Evolved_Seqs.csv')