In the scenario where data is scarce, it is often useful to initialize the filters of the first convolutional layer to known position weight matrices (PWMs). That way, the model already starts with a parameter configuration much closer to the 'right' one.
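Concise provides access to several PWM databases; this notebook uses three of them: `attract` (RBP PWMs), `encode` (TF PWMs) and `hocomoco` (TF PWMs).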
Each PWM database is provided as a module under `concise.data`. Each module provides two functions:
- `concise.data.<db>.get_metadata()` - returns a pandas.DataFrame with metadata information about each PWM
- `concise.data.<db>.get_pwm_list()` - given a list of PWM ids, returns a list of `concise.utils.pwm.PWM` instances
In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
In [4]:
# RBP PWMs
from concise.data import attract
# Metadata table describing each PWM available in the `attract` module.
dfa = attract.get_metadata()
dfa
Out[4]:
In [5]:
# TF PWMs
from concise.data import encode
# Metadata table describing each PWM available in the `encode` module.
dfe = encode.get_metadata()
dfe
Out[5]:
In [6]:
# TF PWMs
from concise.data import hocomoco
# Metadata table describing each PWM available in the `hocomoco` module.
dfh = hocomoco.get_metadata()
dfh
Out[6]:
Let's choose the PUM2 PWM (an RBP in human):
In [7]:
# Keep the metadata rows whose gene name matches "PUM2" and whose organism
# matches "Homo_sapiens" (both are regex matches anchored at the string start).
is_pum2 = dfa.Gene_name.str.match("PUM2")
is_human = dfa.Organism.str.match("Homo_sapiens")
dfa_pum2 = dfa[is_pum2 & is_human]
dfa_pum2
Out[7]:
The `PWM` class provides a method `plotPWM` to visualize the PWM.
In [8]:
# Visualize the PUM2 motifs from different experiments
from concise.utils.pwm import PWM
# One row per distinct (Experiment_description, PWM_id) combination.
dfa_pum2_uniq = dfa_pum2[["Experiment_description", "PWM_id"]].drop_duplicates()
# Fetch the PWM objects for the selected ids.
pwm_list = attract.get_pwm_list(dfa_pum2_uniq.PWM_id)
In [9]:
# Print the name and experiment description of each PWM, then draw it.
experiment_descriptions = dfa_pum2_uniq.Experiment_description
for row, motif in enumerate(pwm_list):
    print("PWM_id:", motif.name, "; Experiment_description:", experiment_descriptions.iloc[row])
    motif.plotPWM(figsize=(3, 1))
We can select the PWM with id 129.
In [10]:
# Keep only the PWMs whose name equals "129".
pwm_list = list(filter(lambda motif: motif.name == "129", pwm_list))
In [11]:
pwm_list
Out[11]:
In [12]:
import concise.layers as cl
import keras.layers as kl
import concise.initializers as ci
import concise.regularizers as cr
from keras.callbacks import EarlyStopping
from concise.preprocessing import encodeDNA
from keras.models import Model, load_model
from keras.optimizers import Adam
In [13]:
# get the data
def load(split="train", st=None):
    """Load one split of the PUM2 dataset.

    Reads ``../data/RBP/PUM2_<split>.csv`` and converts it to arrays.

    Args:
        split: Split name inserted into the file name (the notebook uses
            "train", "valid" and "test").
        st: Unused; kept so that existing calls keep working.

    Returns:
        Tuple ``({"seq": xseq}, y)`` where ``xseq`` is the result of
        ``encodeDNA`` applied to the ``seq`` column and ``y`` is the
        ``binding_site`` column as a float array of shape ``(n, 1)``.
    """
    # Imported locally: none of the visible import cells brings pandas in.
    import pandas as pd

    dt = pd.read_csv("../data/RBP/PUM2_{0}.csv".format(split))
    # DNA/RNA sequence
    xseq = encodeDNA(dt.seq)  # list of sequences -> np.ndarray
    # response variable
    # `.values` replaces `Series.as_matrix()`, which was removed in pandas 1.0.
    y = dt.binding_site.values.reshape((-1, 1)).astype("float")
    return {"seq": xseq}, y
# Load the three splits in order; each is the (inputs, labels) pair from load().
train, valid, test = (load(split) for split in ("train", "valid", "test"))
# The sequence length is the size of axis 1 of the encoded "seq" array.
seq_length = train[0]["seq"].shape[1]
In [14]:
# define the model
def model(train, filters=1, kernel_size=9, pwm_list=None, lr=0.001):
    """Build and compile a single-convolution binary classifier.

    Args:
        train: ``(inputs, labels)`` pair as returned by ``load``; only
            ``train[0]["seq"].shape[1]`` is read, to get the sequence length.
        filters: Number of filters in the convolutional layer.
        kernel_size: Width of each convolutional filter.
        pwm_list: Optional list of PWMs. If given, the convolution kernel is
            initialized with ``ci.PSSMKernelInitializer``; otherwise
            "glorot_uniform" is used.
        lr: Learning rate passed to the Adam optimizer.

    Returns:
        The compiled Keras ``Model`` (loss "binary_crossentropy",
        metric "acc").
    """
    seq_length = train[0]["seq"].shape[1]
    if pwm_list is None:
        kinit = "glorot_uniform"
    else:
        kinit = ci.PSSMKernelInitializer(pwm_list, add_noise_before_Pwm2Pssm=True)
    # The bias starts at zero regardless of how the kernel is initialized.
    binit = "zeros"
    # sequence
    in_dna = cl.InputDNA(seq_length=seq_length, name="seq")
    x = cl.ConvDNA(filters=filters,
                   kernel_size=kernel_size,
                   activation="relu",
                   kernel_initializer=kinit,
                   bias_initializer=binit,
                   name="conv1")(in_dna)
    x = kl.AveragePooling1D(pool_size=4)(x)
    x = kl.Flatten()(x)
    # "binary_crossentropy" expects probabilities, so the output unit needs a
    # sigmoid; a linear output would be clipped to (0, 1) inside the loss.
    x = kl.Dense(units=1, activation="sigmoid")(x)
    m = Model(in_dna, x)
    m.compile(Adam(lr=lr), loss="binary_crossentropy", metrics=["acc"])
    return m
`ci.PSSMKernelInitializer` will set the filters of the first convolutional layer to the values of the position-specific scoring matrix (PSSM):
$$ pssm_{ij} = \log \frac{pwm_{ij}}{b_j} \;, $$
where $b_j$ is the background probability of observing base $j$.
We add Gaussian noise to each individual filter. Let's visualize the filters:
In [15]:
# create two models: with and without PWM initialization
# Both use 3 filters so that their conv1 weights can be compared side by side.
# NOTE(review): pwm_list may hold fewer PWMs than filters=3 - presumably the
# initializer handles that; confirm against PSSMKernelInitializer.
m_rand_init = model(train, filters=3, pwm_list=None) # random initialization
m_pwm_init = model(train, filters=3, pwm_list=pwm_list) # motif initialization
In [17]:
# Plot the "conv1" weights of the randomly initialized model.
print("Random initialization:")
m_rand_init.get_layer("conv1").plot_weights(figsize=(3, 5));
In [18]:
# Plot the "conv1" weights of the PWM-initialized model.
print("Known PWM initialization:")
m_pwm_init.get_layer("conv1").plot_weights(figsize=(3, 5));
In [20]:
# train the models
# Randomly initialized model: at most 50 epochs, no progress output,
# validated on `valid`, with an EarlyStopping(patience=5) callback.
m_rand_init.fit(train[0], train[1], epochs=50, validation_data=valid,
verbose=0,
callbacks=[EarlyStopping(patience=5)])
Out[20]:
In [22]:
# PWM-initialized model, trained with the same settings as the random one:
# at most 50 epochs, no progress output, EarlyStopping(patience=5).
m_pwm_init.fit(train[0], train[1], epochs=50, validation_data=valid,
verbose=0,
callbacks=[EarlyStopping(patience=5)]);
In [23]:
import concise.eval_metrics as cem
In [24]:
# performance on the test-set (area under the precision-recall curve)
# Random initialization
print("Random initialization auPR:", cem.auprc(test[1], m_rand_init.predict(test[0])))
# PWM initialization
print("Known PWM initialization auPR:", cem.auprc(test[1], m_pwm_init.predict(test[0])))
In [25]:
# "conv1" weights of the randomly initialized model, drawn with plot_type="motif_pwm_info".
m_rand_init.get_layer("conv1").plot_weights(plot_type="motif_pwm_info", figsize=(3, 5));
In [26]:
# "conv1" weights of the PWM-initialized model, drawn with plot_type="motif_pwm_info".
m_pwm_init.get_layer("conv1").plot_weights(plot_type="motif_pwm_info", figsize=(3, 5));