In [1]:
import time
import glob
import random  # used by data_gen below
import h5py    # used by data_gen below
import tensorflow as tf
import numpy as np
import pandas as pd
import keras
import keras.backend as K
from keras import initializers
from keras.models import Sequential
from keras.layers import Dense, Input
from keras.layers.advanced_activations import LeakyReLU, PReLU
from keras import optimizers
from keras.utils.np_utils import to_categorical
from utils.input_pipeline import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer, Normalizer
from keras.objectives import kullback_leibler_divergence

from sklearn.pipeline import Pipeline

imputer = Imputer()
normalizer = Normalizer()
pre_processing_pipeline = Pipeline([('imputer', imputer), ('normalizer', normalizer)])


Using TensorFlow backend.
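
In [ ]:
# Hypothetical smoke test of the preprocessing pipeline above (toy values,
# not part of the original run): mean-impute the NaN, then l2-normalize rows.
toy = np.array([[1.0, np.nan], [3.0, 4.0]])
print(pre_processing_pipeline.fit_transform(toy))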

In [2]:
#load_data_t0 = time.clock()
#df = pd.concat([pd.read_csv(filename, index_col=[1,0], na_values=['na'], engine='c', header=0) for filename in glob.glob("data/parser_output/csv/*.csv")],axis=0)
#df = pd.read_csv("data/parser_output/csv/new_mol2_full_feature_-017.csv", index_col=[1,0], na_values=['na'], engine='c',header=0)
#load_data_t1 = time.clock()
#print ("data loaded in ~", ((load_data_t1 - load_data_t0)/60), "minutes.")

In [3]:
from utils.input_pipeline import load_protein

In [4]:
#X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [5]:
with open("data/preprocessed_features.csv", "r") as input_file:
    feature_list = []
    for line in input_file:
        line = line.strip('\n')
        feature_list.append(line)
        
print(len(feature_list))


890

In [6]:
# the generator is probably the only valuable thing in this notebook, put this in the input pipeline module
def data_gen(file_path, batch_steps, categorical=False, sample_size=None, features_list=None, mode=None, conformation=None):
    # TODO: decide on "receptor" versus "protein" for naming conventions
    receptor_list = list(h5py.File(file_path, 'r'))
    while True:
        # pick a random receptor each pass through the outer loop
        random.shuffle(receptor_list)

        # load every example for the chosen receptor (sample_size=None here);
        # negatives are subsampled per batch below
        X, y = load_protein(file_path, protein_name=receptor_list[0], sample_size=None,
                            features_list=features_list, mode=mode, conformation=conformation)
        # note: np.nan_to_num already replaces NaNs with 0, so the median
        # Imputer is effectively a no-op here
        X = Normalizer().fit_transform(Imputer(strategy="median").fit_transform(np.nan_to_num(X)))
        y = y.flatten()

        positives = X[y == 1, :]
        negatives = X[y == 0, :]
        for step in range(batch_steps):
            # draw `sample_size` negatives (with replacement) for this batch
            negatives_to_keep = np.random.choice(negatives.shape[0], sample_size, replace=True)

            # positives are stacked twice to oversample the minority class
            X_batch = np.vstack((negatives[negatives_to_keep], positives))
            X_batch = np.vstack((X_batch, positives))
            y_batch = np.hstack((y[y == 0][negatives_to_keep], y[y == 1]))
            y_batch = np.hstack((y_batch, y[y == 1]))
            if categorical:
                yield X_batch, to_categorical(y_batch)
            else:
                yield X_batch, y_batch
 
#using for debugging purposes
#next(data_gen("data/full_26_kinase_data.h5", 10))

def precision(y_true, y_pred):
    """Precision metric.
    Only computes a batch-wise average of precision.
    Computes the precision, a metric for multi-label classification of
    how many selected items are relevant.
    """
    y_true = K.cast(K.argmax(y_true),'float32')
    y_pred = K.cast(K.argmax(y_pred), 'float32')
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


def recall(y_true, y_pred):
    """Recall metric.
    Only computes a batch-wise average of recall.
    Computes the recall, a metric for multi-label classification of
    how many relevant items are selected.
    """
    y_true = K.cast(K.argmax(y_true),'float32')
    y_pred = K.cast(K.argmax(y_pred), 'float32')
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return K.cast(recall,'float32')

def f1(y_true, y_pred):
    # precision() and recall() already argmax their inputs, so the raw
    # tensors are passed straight through rather than argmax-ed twice;
    # these argmax-based metrics assume one-hot (categorical) targets
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    return K.cast(2 * ((p * r) / (p + r + K.epsilon())), 'float32')


def load_myloss(weights=None):
    if weights is None:
        class_weights = [0.25, 1]
    else:
        class_weights = weights

    def balanced_loss(y_true, y_pred):

        loss_prelim = K.categorical_crossentropy(y_true, y_pred)

        # per-sample weight: picks out the class weight of each sample's
        # true class (summing over everything gave a single batch-wide scalar)
        weight = K.cast(K.sum(y_true * K.constant(class_weights), axis=-1), 'float32')

        # apply per-sample weight and average over the batch
        loss_final = K.cast(K.mean(loss_prelim * weight), 'float32')

        return loss_final

    return balanced_loss


def my_loss():
    
    def custom_loss(y_true,y_pred):
        #kl_loss = kullback_leibler_divergence(y_true,y_pred)
        #total_loss = kullback_leibler_divergence(y_pred,y_true) + kl_loss
        #return total_loss
        # negative log-likelihood of the true class; the original
        # K.log(-K.dot(y_true, K.transpose(y_pred))) took the log of a
        # negative quantity (NaN) and produced a batch x batch matrix
        # rather than a per-sample score
        return -K.log(K.sum(y_true * y_pred, axis=-1) + K.epsilon())
        
        
    return custom_loss
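
In [ ]:
# Hypothetical sanity check of the batch-wise metrics on a toy one-hot
# example (made-up values, just to confirm the Keras graph evaluates;
# expected: precision 1.0, recall ~0.667, f1 ~0.8).
y_true_toy = K.constant(to_categorical(np.array([1, 0, 1, 1]), num_classes=2))
y_pred_toy = K.constant(np.array([[0.1, 0.9], [0.8, 0.2], [0.6, 0.4], [0.3, 0.7]]))
print(K.eval(precision(y_true_toy, y_pred_toy)),
      K.eval(recall(y_true_toy, y_pred_toy)),
      K.eval(f1(y_true_toy, y_pred_toy)))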

In [12]:
X_train = np.loadtxt("data/random_forest_features_x_train.txt", delimiter=",", dtype=np.float32)
X_test = np.loadtxt("data/random_forest_features_x_test.txt", delimiter=",", dtype=np.float32)
y_train = np.loadtxt("data/random_forest_features_y_train.txt", delimiter=",", dtype=np.float32)
y_test = np.loadtxt("data/random_forest_features_y_test.txt", delimiter=",", dtype=np.float32)

In [58]:
X_train_pos = X_train[y_train == 1]
X_train_neg = X_train[y_train == 0]
# undersample negatives: draw (with replacement) as many indices as there are
# positives, restricted to the first half of the negative pool
balanced_X_train = np.random.choice(np.arange(int(np.floor(X_train_neg.shape[0] / 2))), size=X_train_pos.shape[0])
X_train_neg = X_train_neg[balanced_X_train]
X_train_prime = np.vstack((X_train_pos, X_train_neg))
y_train_prime = np.hstack((y_train[y_train == 1], y_train[y_train == 0][balanced_X_train]))
print(X_train_prime.shape, y_train_prime.shape)


(14884, 1260) (14884,)
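
In [ ]:
# sanity check (assumes 0/1 labels): the rebalanced set should be an even split
print(np.bincount(y_train_prime.astype(np.int64)))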

In [59]:
model = Sequential()
# l2(0.) is effectively no regularization; raise the coefficient to penalize weights
model.add(Dense(400, input_dim=1260, kernel_regularizer=keras.regularizers.l2(0.)))
model.add(PReLU())
model.add(Dense(50))
model.add(PReLU())
model.add(Dense(1, activation='sigmoid'))
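
In [ ]:
# optional inspection cell: print layer shapes and parameter counts
model.summary()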

In [60]:
# note: the argmax-based f1 metric assumes a one-hot output; with this
# single-unit sigmoid output it will read 0 during training
model.compile(loss='binary_crossentropy', optimizer=optimizers.Adam(lr=1e-5), metrics=["accuracy", f1])

In [63]:
model.fit(X_train_prime, y_train_prime, shuffle=True, epochs=1000, verbose=0)


Out[63]:
<keras.callbacks.History at 0x2abc53666400>

In [64]:
from sklearn.metrics import accuracy_score, f1_score
f1 = f1_score(y_test,model.predict(X_test))
acc = accuracy_score(y_test,model.predict(X_test),y_test)
print("accuracy:",acc,"\tf1-score:",f1)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-64-307d8d4e5ffa> in <module>()
      1 from sklearn.metrics import accuracy_score, f1_score
----> 2 f1 = f1_score(y_test,model.predict(X_test))
      3 acc = accuracy_score(y_test,model.predict(X_test),y_test)
      4 print("accuracy:",acc,"\tf1-score:",f1)

/global/common/cori/software/python/3.5-anaconda/envs/deeplearning/lib/python3.5/site-packages/sklearn/metrics/classification.py in f1_score(y_true, y_pred, labels, pos_label, average, sample_weight)
    690     return fbeta_score(y_true, y_pred, 1, labels=labels,
    691                        pos_label=pos_label, average=average,
--> 692                        sample_weight=sample_weight)
    693 
    694 

/global/common/cori/software/python/3.5-anaconda/envs/deeplearning/lib/python3.5/site-packages/sklearn/metrics/classification.py in fbeta_score(y_true, y_pred, beta, labels, pos_label, average, sample_weight)
    804                                                  average=average,
    805                                                  warn_for=('f-score',),
--> 806                                                  sample_weight=sample_weight)
    807     return f
    808 

/global/common/cori/software/python/3.5-anaconda/envs/deeplearning/lib/python3.5/site-packages/sklearn/metrics/classification.py in precision_recall_fscore_support(y_true, y_pred, beta, labels, pos_label, average, warn_for, sample_weight)
   1001         raise ValueError("beta should be >0 in the F-beta score")
   1002 
-> 1003     y_type, y_true, y_pred = _check_targets(y_true, y_pred)
   1004     present_labels = unique_labels(y_true, y_pred)
   1005 

/global/common/cori/software/python/3.5-anaconda/envs/deeplearning/lib/python3.5/site-packages/sklearn/metrics/classification.py in _check_targets(y_true, y_pred)
     80     if len(y_type) > 1:
     81         raise ValueError("Can't handle mix of {0} and {1}"
---> 82                          "".format(type_true, type_pred))
     83 
     84     # We can't have more than one value on y_type => The set is no more needed

ValueError: Can't handle mix of binary and continuous
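
In [ ]:
# The ValueError above is raised because model.predict returns continuous
# sigmoid probabilities while y_test holds binary labels; accuracy_score is
# also given a stray third positional argument, and assigning to `f1` shadows
# the Keras metric defined earlier. A corrected evaluation (one sketch,
# thresholding at 0.5):
preds = (model.predict(X_test) > 0.5).astype(np.int32).ravel()
test_f1 = f1_score(y_test, preds)
test_acc = accuracy_score(y_test, preds)
print("accuracy:", test_acc, "\tf1-score:", test_f1)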

In [ ]:
#model.fit_generator(data_gen('data/full_26_kinase_data.h5',categorical=True,
#                             sample_size=1000, batch_steps=20000, features_list=feature_list),epochs=10,steps_per_epoch=20000)

In [ ]:
#from sklearn.metrics import accuracy_score, f1_score
#preds = model.predict(X_test)

#print("accuracy:",accuracy_score(preds,y_test), "\t","f1-score:",f1_score(preds,y_test))
