Concise extends keras (https://keras.io) by providing additional layers, initializers, regularizers, metrics and preprocessors suited for modelling genomic cis-regulatory elements.
In [57]:
## Concise extensions of keras:
import concise
import concise.layers as cl
import concise.initializers as ci
import concise.regularizers as cr
import concise.metrics as cm
from concise.preprocessing import encodeDNA, encodeSplines
from concise.data import attract, encode
## layers:
cl.ConvDNA
cl.ConvDNAQuantitySplines
cr.GAMRegularizer
cl.GAMSmooth
cl.GlobalSumPooling1D
cl.InputDNA
cl.InputSplines
## initializers:
ci.PSSMKernelInitializer
ci.PSSMBiasInitializer
## regularizers
cr.GAMRegularizer
## metrics
cm.var_explained
## Preprocessing
encodeDNA
encodeSplines
## Known motifs
attract
encode
Out[57]:
In [63]:
attract.get_pwm_list([519])[0].plotPWM()
Concise also implements a PWM class, used by the PWM*Initializer classes.
In [10]:
from concise.utils import PWM
PWM
Out[10]:
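For example, a PWM can be constructed directly from a consensus sequence and plotted; the same constructor is used below to initialize the model (a sketch cell, not part of the original run):
In [ ]:
pwm = PWM.from_consensus("TTAATGA")
pwm.plotPWM()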
In this notebook, we will replicate the results from Positional_effect/Simulation/01_fixed_seq_len.html using Concise models. Please have a look at that notebook first.
In [1]:
# Additional packages used
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
## required keras modules
from keras.models import Model, load_model
import keras.layers as kl
import keras.optimizers as ko
In [59]:
# Load the data
data_dir = "../data/"
dt = pd.read_csv(data_dir + "/01-fixed_seq_len-1.csv")
motifs = ["TTAATGA"]
In [13]:
dt.head()
Out[13]:
In [30]:
x_seq = encodeDNA(dt["seq"])
y = dt.y.values
In [32]:
x_seq.shape # (n_samples, seq_length, n_bases)
Out[32]:
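As a quick sanity check of the encoding, a toy call (sketch cell; the shape follows the (n_samples, seq_length, n_bases) convention above):
In [ ]:
encodeDNA(["ACGT"]).shape  # expected (1, 4, 4): one sequence, 4 positions, one-hot over the 4 bases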
Concise is a thin wrapper around keras. To know more about keras, read the documentation: https://keras.io/.
In this tutorial, I'll be using the functional API of keras: https://keras.io/getting-started/functional-api-guide/. Feel free to use Concise with Sequential models as well; a minimal sketch follows.
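A minimal Sequential sketch, assuming only that ConvDNA behaves like a standard keras Conv1D and therefore accepts input_shape (untested here; the seq_length of 500 is a placeholder):
In [ ]:
from keras.models import Sequential
seq_model = Sequential([
    cl.ConvDNA(filters=1, kernel_size=7, activation="relu",
               input_shape=(500, 4)),  # (seq_length, n_bases); 500 is a placeholder
    cl.GlobalSumPooling1D(),
    kl.Dense(units=1, activation="linear"),
])
seq_model.compile(optimizer="adam", loss="mse")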
In [33]:
## Parameters
seq_length = x_seq.shape[1]
## Motifs used to initialize the model
pwm_list = [PWM.from_consensus(motif) for motif in motifs]
motif_width = 7
pwm_list
Out[33]:
In [60]:
pwm_list[0].plotPWM()
In [34]:
np.random.seed(42)
# specify the input shape
input_dna = cl.InputDNA(seq_length)
# convolutional layer with filters initialized on a PWM
x = cl.ConvDNA(filters=1,
               kernel_size=motif_width,  ## motif width
               activation="relu",
               kernel_initializer=ci.PSSMKernelInitializer(pwm_list),
               bias_initializer=ci.PSSMBiasInitializer(pwm_list, kernel_size=motif_width, mean_max_scale=1)
               ## mean_max_scale of 1 means that only the consensus sequence gets a score larger than 0
               )(input_dna)
## Smoothing layer - position-dependent effect
# output = input * (1 + pos_effect)
x = cl.GAMSmooth(n_bases=10, l2_smooth=1e-3, l2=0)(x)
x = cl.GlobalSumPooling1D()(x)
x = kl.Dense(units=1, activation="linear")(x)
model = Model(inputs=input_dna, outputs=x)
# compile the model
model.compile(optimizer="adam", loss="mse", metrics=[cm.var_explained])
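To make the GAMSmooth comment concrete: the layer rescales each convolution activation by a smooth positional term, output = input * (1 + pos_effect). A plain NumPy rendering of that elementwise operation (illustration only, not the layer's actual implementation; the zero curve is a stand-in for the learned spline function):
In [ ]:
toy_acts = np.random.rand(2, seq_length, 1)      # (batch, position, filter), stand-in ConvDNA activations
toy_pos_effect = np.zeros(seq_length)            # GAMSmooth learns this as a smooth function of position
toy_out = toy_acts * (1 + toy_pos_effect)[None, :, None]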
In [35]:
## early stopping: halt training once val_loss stops improving for `patience` epochs
from keras.callbacks import EarlyStopping
model.fit(x=x_seq, y=y, epochs=30, verbose=2,
          callbacks=[EarlyStopping(patience=5)],
          validation_split=.2)
Out[35]:
In [36]:
model.save("/tmp/model.h5")  ## requires the h5py package: pip install h5py
In [37]:
%ls -la /tmp/model*
In [38]:
model2 = load_model("/tmp/model.h5")
model2
Out[38]:
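A quick check that the save/load round-trip preserved the weights (sketch cell):
In [ ]:
assert np.allclose(model.predict(x_seq), model2.predict(x_seq))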
In [39]:
var_expl_history = model.history.history['val_var_explained']
plt.plot(var_expl_history)
plt.ylabel('Variance explained')
plt.xlabel('Epoch')
plt.title("Loss history")
Out[39]:
In [41]:
y_pred = model.predict(x_seq)
plt.scatter(y_pred, y)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("True vs predicted")
Out[41]:
The plot is the same as in the original motifp report.
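The fit can also be summarized numerically; a one-line variance-explained computation in plain NumPy (my own formula, not necessarily byte-identical to cm.var_explained):
In [ ]:
1 - np.var(y - y_pred.ravel()) / np.var(y)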
In [42]:
# layers in the model
model.layers
Out[42]:
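The next cell fetches the GAMSmooth layer by position (model.layers[2]); a type-based lookup is more robust if the architecture changes:
In [ ]:
gam_layer = [l for l in model.layers if isinstance(l, cl.GAMSmooth)][0]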
In [43]:
## Convenience functions in layers
gam_layer = model.layers[2]
gam_layer.plot()
plt.title("Positional effect")
Out[43]:
In [44]:
# Compare the curve to the theoretical
pe = gam_layer.positional_effect()
positions = pe["positions"]
pos_effect = pe["positional_effect"]
from scipy.stats import norm
pef = lambda x: 0.3*norm.pdf(x, 0.2, 0.1) + 0.05*np.sin(15*x) + 0.8
pos_effect_theoretical = pef(positions / positions.max())
# plot
plt.plot(positions, pos_effect, label="inferred")
plt.plot(positions, pos_effect_theoretical, label="theoretical")
plt.ylabel('Positional effect')
plt.xlabel('Position')
plt.title("Positional effect")
plt.legend()
Out[44]:
Qualitatively the curves are the same; quantitatively they differ, as the scale is modulated by other parameters in the model. The plot is similar to the one in the original motifp report.
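To compare just the shapes, both curves can be brought to a common scale, e.g. by dividing each by its mean (an ad-hoc normalization, not part of Concise):
In [ ]:
plt.plot(positions, pos_effect / pos_effect.mean(), label="inferred (rescaled)")
plt.plot(positions, pos_effect_theoretical / pos_effect_theoretical.mean(), label="theoretical (rescaled)")
plt.legend()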
In [48]:
# plot the filters
model.layers[1].plot_weights(plot_type="motif_raw")
model.layers[1].plot_weights(plot_type="motif_pwm")
model.layers[1].plot_weights(plot_type="motif_pwm_info")
model.layers[1].plot_weights(plot_type="heatmap")
Out[48]:
In [49]:
dt = pd.read_csv(data_dir + "01-fixed_seq_len-2.csv")
motifs = ["TTAATGA", "TATTTAT"]
In [51]:
## Parameters
seq_length = x_seq.shape[1]
## Motifs used to initialize the model
pwm_list = [PWM.from_consensus(motif) for motif in motifs]
motif_width = 7
pwm_list
Out[51]:
In [52]:
np.random.seed(1)
input_dna = cl.InputDNA(seq_length)
# convolutional layer with filters initialized on a PWM
x = cl.ConvDNA(filters=2,
               kernel_size=motif_width,  ## motif width
               activation="relu",
               kernel_initializer=ci.PWMKernelInitializer(pwm_list, stddev=0.0),
               bias_initializer=ci.PWMBiasInitializer(pwm_list, kernel_size=motif_width, mean_max_scale=0.999),
               ## mean_max_scale of 1 means that only the consensus sequence gets a score larger than 0
               trainable=False,
               )(input_dna)
## Smoothing layer - position-dependent effect
x = cl.GAMSmooth(n_bases=10, l2_smooth=1e-6, l2=0)(x)
x = cl.GlobalSumPooling1D()(x)
x = kl.Dense(units=1, activation="linear")(x)
model = Model(inputs=input_dna, outputs=x)
# compile the model
model.compile(optimizer=ko.Adam(lr=0.01), loss="mse", metrics=[cm.var_explained])
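Before fitting, model.summary() (standard keras) gives a quick check of the layer stack and parameter counts; with trainable=False on ConvDNA, the convolution weights should show up as non-trainable:
In [ ]:
model.summary()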
In [53]:
x_seq = encodeDNA(dt["seq"])
y = dt.y.values
In [54]:
model.fit(x=x_seq, y=y, epochs=100, verbose=2,
          callbacks=[EarlyStopping(patience=5)],
          validation_split=.2)
Out[54]:
In [55]:
y_pred = model.predict(x_seq)
plt.scatter(y_pred, y)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("True vs predicted")
Out[55]:
In [56]:
## TODO - update to the new syntax
gam_layer = model.layers[2]
pe = gam_layer.positional_effect()
positions = pe["positions"]
pos_effect = pe["positional_effect"]
## Theoretical plot - from the original simulation data
from scipy.stats import norm
# https://docs.scipy.org/doc/scipy-0.16.1/reference/generated/scipy.stats.norm.html#scipy.stats.norm
pef1 = lambda x: 0.3*norm.pdf(x, 0.2, 0.1) + 0.05*np.sin(15*x) + 0.8
pos_effect_theoretical1 = pef1(positions / positions.max())
pef2 = lambda x: 0.3*norm.pdf(x, 0.35, 0.1) + 0.05*np.sin(15*x) + 0.8
pos_effect_theoretical2 = pef2(positions / positions.max())
w_motifs = model.get_weights()[-2]  # final Dense-layer weights: one scale per motif/filter
b = model.get_weights()[-1]         # final Dense-layer bias
## Create a new plot: divide out the per-motif Dense weight (and the 0.8 baseline
## of the simulated effect) to put the curves on a comparable scale
pos_effect_calibrated = (pos_effect / np.transpose(w_motifs)) / 0.8
plt.plot(positions, pos_effect_calibrated[:, 0], label="inferred " + motifs[0])
plt.plot(positions, pos_effect_calibrated[:, 1], label="inferred " + motifs[1])
plt.plot(positions, pos_effect_theoretical1, label="theoretical " + motifs[0])
plt.plot(positions, pos_effect_theoretical2, label="theoretical " + motifs[1])
plt.ylabel('Positional effect')
plt.xlabel('Position')
plt.title("Positional effect")
plt.legend()
Out[56]:
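As a numeric complement to the plot, the agreement can be summarized by the Pearson correlation between each inferred curve and its theoretical counterpart (plain NumPy, not part of the original report):
In [ ]:
print(np.corrcoef(pos_effect_calibrated[:, 0], pos_effect_theoretical1)[0, 1])
print(np.corrcoef(pos_effect_calibrated[:, 1], pos_effect_theoretical2)[0, 1])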