In [1]:
import time
import glob
import random

import h5py
import numpy as np
import pandas as pd
import tensorflow as tf

import keras
import keras.backend as K
from keras import initializers, optimizers
from keras.models import Sequential
from keras.layers import Dense, Input
from keras.layers.advanced_activations import LeakyReLU, PReLU
from keras.objectives import kullback_leibler_divergence
from keras.utils.np_utils import to_categorical

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, Normalizer

from utils.input_pipeline import *

imputer = Imputer()
normalizer = Normalizer()
pre_processing_pipeline = Pipeline([('imputer', imputer), ('normalizer', normalizer)])
In [2]:
#load_data_t0 = time.clock()
#df = pd.concat([pd.read_csv(filename, index_col=[1,0], na_values=['na'], engine='c', header=0) for filename in glob.glob("data/parser_output/csv/*.csv")],axis=0)
#df = pd.read_csv("data/parser_output/csv/new_mol2_full_feature_-017.csv", index_col=[1,0], na_values=['na'], engine='c',header=0)
#load_data_t1 = time.clock()
#print ("data loaded in ~", ((load_data_t1 - load_data_t0)/60), "minutes.")
In [3]:
from utils.input_pipeline import load_protein
In [4]:
#X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
In [5]:
with open("data/preprocessed_features.csv", "r") as input_file:
    feature_list = []
    for line in input_file:
        line = line.strip('\n')
        feature_list.append(line)
print(len(feature_list))
In [6]:
# The generator is probably the only valuable thing in this notebook; move it
# into the input pipeline module.
def data_gen(file_path, batch_steps, categorical=False, sample_size=None,
             features_list=None, mode=None, conformation=None):
    # TODO: decide on "receptor" versus "protein" for naming conventions
    with h5py.File(file_path, 'r') as h5_file:
        receptor_list = list(h5_file)
    while True:
        # pick one receptor at random, load all of its examples, then
        # impute, normalize, and flatten the labels
        random.shuffle(receptor_list)
        X, y = load_protein(file_path, protein_name=receptor_list[0], sample_size=None,
                            features_list=features_list, mode=mode, conformation=conformation)
        X = Normalizer().fit_transform(Imputer(strategy="median").fit_transform(np.nan_to_num(X)))
        y = y.flatten()
        positives = X[y == 1, :]
        negatives = X[y == 0, :]
        for step in range(batch_steps):
            # draw `sample_size` negatives with replacement and stack the
            # positives twice to oversample the minority class
            negatives_to_keep = np.random.choice(negatives.shape[0], sample_size, replace=True)
            X_batch = np.vstack((negatives[negatives_to_keep], positives, positives))
            y_batch = np.hstack((y[y == 0][negatives_to_keep], y[y == 1], y[y == 1]))
            if categorical:
                yield X_batch, to_categorical(y_batch)
            else:
                yield X_batch, y_batch

# used for debugging:
# next(data_gen("data/full_26_kinase_data.h5", 10))
def _to_binary_labels(y_true, y_pred):
    """Reduce targets/predictions to hard 0/1 labels.

    A single sigmoid unit is thresholded at 0.5; one-hot rows take the
    argmax. Without this distinction, argmax of a length-1 vector is
    always 0, which silently zeroes out the metrics for a binary model.
    """
    if K.int_shape(y_pred)[-1] == 1:
        return K.round(y_true), K.round(y_pred)
    return K.cast(K.argmax(y_true), 'float32'), K.cast(K.argmax(y_pred), 'float32')

def precision(y_true, y_pred):
    """Precision metric.

    Only computes a batch-wise average of precision: of the items
    predicted positive, how many are actually positive.
    """
    y_true, y_pred = _to_binary_labels(y_true, y_pred)
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return true_positives / (predicted_positives + K.epsilon())

def recall(y_true, y_pred):
    """Recall metric.

    Only computes a batch-wise average of recall: of the actual
    positives, how many are predicted positive.
    """
    y_true, y_pred = _to_binary_labels(y_true, y_pred)
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return true_positives / (possible_positives + K.epsilon())

def f1(y_true, y_pred):
    # precision() and recall() binarize their inputs internally, so pass the
    # raw tensors straight through; binarizing here as well would apply
    # argmax twice and destroy the labels
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    return 2 * (p * r) / (p + r + K.epsilon())
def load_myloss(weights=None):
    if weights is None:
        class_weights = [0.25, 1]
    else:
        class_weights = weights
    def balanced_loss(y_true, y_pred):
        loss_prelim = K.categorical_crossentropy(y_true, y_pred)
        # weight each example by the class weight of its true label,
        # then average over the batch
        weight = K.cast(K.sum(y_true * K.constant(class_weights), axis=-1), 'float32')
        loss_final = K.cast(K.mean(loss_prelim * weight), 'float32')
        return loss_final
    return balanced_loss

def my_loss():
    def custom_loss(y_true, y_pred):
        #kl_loss = kullback_leibler_divergence(y_true, y_pred)
        #total_loss = kullback_leibler_divergence(y_pred, y_true) + kl_loss
        #return total_loss
        # negative log-likelihood: keep the minus sign outside the log,
        # since the dot product of probability vectors is nonnegative and
        # log of its negation would be NaN
        return -K.log(K.dot(y_true, K.transpose(y_pred)))
    return custom_loss
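In [ ]:
# Sanity-check the metrics and the weighted loss on tiny constant tensors
# (a minimal sketch; the demo values below are arbitrary one-hot rows, not
# data from this project).
y_true_demo = K.constant([[1., 0.], [0., 1.], [0., 1.], [1., 0.]])
y_pred_demo = K.constant([[0.9, 0.1], [0.2, 0.8], [0.6, 0.4], [0.3, 0.7]])
print("precision:", K.eval(precision(y_true_demo, y_pred_demo)))
print("recall:", K.eval(recall(y_true_demo, y_pred_demo)))
print("f1:", K.eval(f1(y_true_demo, y_pred_demo)))
print("balanced loss:", K.eval(load_myloss()(y_true_demo, y_pred_demo)))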
In [12]:
X_train = np.loadtxt("data/random_forest_features_x_train.txt", delimiter=",", dtype=np.float32)
X_test = np.loadtxt("data/random_forest_features_x_test.txt", delimiter=",", dtype=np.float32)
y_train = np.loadtxt("data/random_forest_features_y_train.txt", delimiter=",", dtype=np.float32)
y_test = np.loadtxt("data/random_forest_features_y_test.txt", delimiter=",", dtype=np.float32)
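In [ ]:
# Quick shape check: the feature matrices and label vectors should line up
# before any training is attempted.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)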
In [58]:
X_train_pos = X_train[y_train == 1]
X_train_neg = X_train[y_train == 0]
# undersample the negatives: draw as many negative indices as there are
# positives, without replacement so no negative example is duplicated
balanced_idx = np.random.choice(X_train_neg.shape[0], size=X_train_pos.shape[0], replace=False)
X_train_neg = X_train_neg[balanced_idx]
X_train_prime = np.vstack((X_train_pos, X_train_neg))
y_train_prime = np.hstack((y_train[y_train == 1], y_train[y_train == 0][balanced_idx]))
print(X_train_prime.shape, y_train_prime.shape)
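In [ ]:
# Sanity check: after undersampling, the two classes should be equal in size.
print("positives:", int((y_train_prime == 1).sum()),
      "negatives:", int((y_train_prime == 0).sum()))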
In [59]:
model = Sequential()
# note: l2(0.) leaves the regularizer effectively disabled; raise the
# coefficient to turn weight decay back on
model.add(Dense(400, input_dim=1260, kernel_regularizer=keras.regularizers.l2(0.)))
model.add(PReLU())
model.add(Dense(50))
model.add(PReLU())
model.add(Dense(1, activation='sigmoid'))
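In [ ]:
# Inspect layer output shapes and parameter counts for the network above.
model.summary()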
In [60]:
model.compile(loss='binary_crossentropy', optimizer=optimizers.Adam(lr=1e-5), metrics=["accuracy", f1])
In [63]:
model.fit(X_train_prime, y_train_prime, shuffle=True, epochs=1000, verbose=0)
Out[63]:
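In [ ]:
# An alternative fit with a held-out validation split and early stopping,
# kept commented so it does not retrain the model above (a sketch;
# patience=20 is an arbitrary, untuned choice).
#from keras.callbacks import EarlyStopping
#model.fit(X_train_prime, y_train_prime, shuffle=True, epochs=1000, verbose=0,
#          validation_split=0.1,
#          callbacks=[EarlyStopping(monitor='val_loss', patience=20)])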
In [64]:
from sklearn.metrics import accuracy_score, f1_score
# threshold the sigmoid outputs at 0.5 to get hard 0/1 labels
preds = (model.predict(X_test) > 0.5).astype(np.int32).flatten()
f1_val = f1_score(y_test, preds)   # named f1_val so the f1 metric function is not shadowed
acc = accuracy_score(y_test, preds)
print("accuracy:", acc, "\tf1-score:", f1_val)
In [ ]:
#model.fit_generator(data_gen('data/full_26_kinase_data.h5',categorical=True,
# sample_size=1000, batch_steps=20000, features_list=feature_list),epochs=10,steps_per_epoch=20000)
In [ ]:
#from sklearn.metrics import accuracy_score, f1_score
#preds = (model.predict(X_test) > 0.5).astype(np.int32).flatten()
#print("accuracy:", accuracy_score(y_test, preds), "\t", "f1-score:", f1_score(y_test, preds))
In [ ]: