In [1]:
    
from concise.data import attract
dfa = attract.get_metadata()
dfa
    
    
    Out[1]:
RBPs to check
In [2]:
    
dfa_pum2 = dfa[dfa.Gene_name.str.match("PUM2") & \
               dfa.Organism.str.match("Homo_sapiens") & \
               (dfa.Database != "PDB") & \
               (dfa.Experiment_description == "genome-wide in vivo immunoprecipitation")]
    
In [3]:
    
dfa_pum2
    
    Out[3]:
In [4]:
    
from concise.utils.pwm import PWM
pwm_list = attract.get_pwm_list(dfa_pum2.PWM_id.unique())
    
In [5]:
    
for pwm in pwm_list:
    pwm.plotPWM(figsize=(8,1.5))
    
    
In [6]:
    
import pandas as pd
from concise.preprocessing import encodeDNA, EncodeSplines
    
In [8]:
    
def load(split="train", st=None):
    dt = pd.read_csv("../data/RBP/PUM2_{0}.csv".format(split))
    # DNA/RNA sequence
    xseq = encodeDNA(dt.seq) 
    # distance to the poly-A site
    xpolya = dt.polya_distance.as_matrix().reshape((-1, 1))
    # response variable
    y = dt.binding_site.as_matrix().reshape((-1, 1)).astype("float")
    return {"seq": xseq, "dist_polya_raw": xpolya}, y
def data():
    
    train, valid, test = load("train"), load("valid"), load("test")
    
    # transform the poly-A distance with B-splines
    es = EncodeSplines()
    es.fit(train[0]["dist_polya_raw"])
    train[0]["dist_polya_st"] = es.transform(train[0]["dist_polya_raw"])
    valid[0]["dist_polya_st"] = es.transform(valid[0]["dist_polya_raw"])
    test[0]["dist_polya_st"] = es.transform(test[0]["dist_polya_raw"])
    
    #return load("train"), load("valid"), load("test")
    return train, valid, test
train, valid, test = data()
    
    
In [9]:
    
import concise.layers as cl
import keras.layers as kl
import concise.initializers as ci
import concise.regularizers as cr
from keras.optimizers import Adam
from keras.models import Model
from keras.callbacks import EarlyStopping
    
In [10]:
    
train[0]["seq"].shape
    
    Out[10]:
In [11]:
    
train[0].keys()
    
    Out[11]:
In [19]:
    
def model(train, filters=1, kernel_size=9, pwm_list=None, lr=0.001, use_splinew=True, ext_dist=False):
    seq_length = train[0]["seq"].shape[1]
    if pwm_list is None:
        kinit = "glorot_uniform"
        binit = "zeros"
    else:
        kinit = ci.PSSMKernelInitializer(pwm_list, add_noise_before_Pwm2Pssm=True)
        binit = "zeros"
        
    # sequence
    in_dna = cl.InputDNA(seq_length=seq_length, name="seq")
    inputs = [in_dna]
    x = cl.ConvDNA(filters=filters, 
                   kernel_size=kernel_size, 
                   activation="relu",
                   kernel_initializer=kinit,
                   bias_initializer=binit,
                   name="conv1")(in_dna)
    if use_splinew:
        x = cl.SplineWeight1D(n_bases=10, l2_smooth=0, l2=0, name="spline_weight")(x)
        x = kl.GlobalAveragePooling1D()(x)
    else:
        x = kl.AveragePooling1D(pool_size=4)(x)
        x = kl.Flatten()(x)
        
    if ext_dist:    
        # distance
        in_dist = kl.Input(train[0]["dist_polya_st"].shape[1:], name="dist_polya_st")
        x_dist = cl.SplineT()(in_dist)
        x = kl.concatenate([x, x_dist])
        inputs += [in_dist]
    
    x = kl.Dense(units=1)(x)
    m = Model(inputs, x)
    m.compile(Adam(lr=lr), loss="binary_crossentropy", metrics=["acc"])
    return m
    
In [20]:
    
m = model(train, filters=10, use_splinew=False, ext_dist=True)
    
In [21]:
    
m.fit(train[0], train[1], epochs=50, validation_data=valid, 
     callbacks=[EarlyStopping(patience=5)])
    
    
    Out[21]:
In [148]:
    
m.get_layer("conv1").plot_weights(plot_type="motif_pwm_info")
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
In [22]:
    
m = model(train, filters=16, pwm_list=pwm_list, use_splinew=False, ext_dist=True)
    
In [23]:
    
m.fit(train[0], train[1], epochs=50, validation_data=valid, 
     callbacks=[EarlyStopping(patience=5)])
    
    
    Out[23]:
In [166]:
    
m.get_layer("conv1").plot_weights(plot_type="motif_pwm_info")
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
In [149]:
    
# we can see that the performance is better using the initialized motif
    
In [122]:
    
m.get_layer("spline_weight").*?
    
In [123]:
    
m.get_layer("spline_weight").get_weights()
    
    Out[123]:
In [ ]: