In [2]:
from sklearn.model_selection import train_test_split
import pandas as pd
# verbose=0 is passed to model.fit below so that all per-epoch printing comes from the custom callback

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D,GlobalAveragePooling1D

In [3]:
#load data

print('Loading data...')

#4 datasets
#CYP1A2_fixpka.smi
#CYP2C19_substrate_fixpka.smi
#CYP2D6_fixpka.smi
#CYP3A4_fixpka.smi

datasetFile='CYP1A2_fixpka.smi'
dataset = pd.read_csv(datasetFile, sep=' ', header=None)
X_SMILES, y = dataset.iloc[:,:-1], dataset.iloc[:, -1]

# Library of unique characters. The first element is a dummy entry so that
# index 0 is reserved for the padding value and never assigned to a real character.
char_lib = ["ZMY"]

# Scan every SMILES string and collect each new character into the library
for index, SMILESsequence in X_SMILES.iterrows():
    for letter in SMILESsequence[0]:
        if letter not in char_lib:
            char_lib.append(letter)

X = []
# Encode each SMILES string as a list of character indices into char_lib
for index, SMILESsequence in X_SMILES.iterrows():
    sequenceArray = []
    for letter in SMILESsequence[0]:
        sequenceArray.append(char_lib.index(letter))
    X.append(sequenceArray)
    
print('data loaded')


Loading data...
data loaded
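As a sanity check on the encoding, here is a minimal sketch with a hypothetical toy library (the real char_lib depends on the order in which characters first appear in the dataset):

# Hypothetical toy library; index 0 is the unused padding placeholder.
toy_lib = ["ZMY", "C", "c", "1", "(", ")", "O", "N", "="]
toy_smiles = "c1ccccc1O"  # phenol
print([toy_lib.index(ch) for ch in toy_smiles])
# [2, 3, 2, 2, 2, 2, 2, 3, 6]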

In [41]:
# Left-pad every sequence with zeros up to the length of the longest sequence
X = sequence.pad_sequences(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=428)

max_features = len(char_lib)
maxlen = X_train.shape[1]

In [5]:
print(X_train.shape)
print(max_features)
print(maxlen)


(11922, 226)
43
226
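For reference, pad_sequences left-pads with zeros to the longest sequence by default, which is why index 0 was reserved for padding above. A minimal sketch on toy data:

from keras.preprocessing import sequence
toy = [[5, 3], [7, 2, 9, 4]]
print(sequence.pad_sequences(toy))
# [[0 0 5 3]
#  [7 2 9 4]]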

In [6]:
import warnings
import numpy as np
from keras.callbacks import Callback

class CustomCallbacks(Callback):
    """Logs the training history, checkpoints the best model, and applies early stopping."""
    def __init__(self, filename="best_weight.hdf5", monitor='val_acc', patience=10):
        super(CustomCallbacks, self).__init__()
        self.filename = filename
        self.monitor = monitor
        self.patience = patience
        self.wait = 0

        # Accuracy-like metrics should increase; loss-like metrics should decrease.
        if 'acc' in self.monitor:
            self.monitor_op = np.greater
            self.best = -np.Inf
        else:
            self.monitor_op = np.less
            self.best = np.Inf
        self.losses = []
        self.acc = []
        self.val_acc = []
        self.val_losses = []

    def on_epoch_end(self, epoch, logs={}):
        # Record the epoch's metrics for later inspection or plotting.
        self.losses.append(logs.get('loss'))
        self.acc.append(logs.get('acc'))
        self.val_acc.append(logs.get('val_acc'))
        self.val_losses.append(logs.get('val_loss'))
        print("acc: %0.3f" % logs.get('acc'),
              ", loss: %0.3f" % logs.get('loss'),
              ", val_acc: %0.3f" % logs.get('val_acc'),
              ", val_loss: %0.3f" % logs.get('val_loss'))

        current = logs.get(self.monitor)
        if current is None:
            warnings.warn('Can save best model only with %s available, '
                          'skipping.' % self.monitor, RuntimeWarning)
        else:
            if self.monitor_op(current, self.best):
                # New best value: save a checkpoint and reset the patience counter.
                print('Epoch %05d: %s improved from %0.5f to %0.5f,'
                      ' saving model to %s'
                      % (epoch, self.monitor, self.best, current, self.filename))
                self.best = current
                self.wait = 0
                self.model.save(self.filename, overwrite=True)
            else:
                print('Epoch %05d: %s did not improve' % (epoch, self.monitor))
                if self.wait >= self.patience:
                    self.model.stop_training = True
                self.wait += 1

    def on_train_end(self, logs=None):
        if self.wait >= self.patience:
            print('Warning: early stopping')
        # Reset the internal state so the same callback instance can be reused.
        self.wait = 0
        if 'acc' in self.monitor:
            self.monitor_op = np.greater
            self.best = -np.Inf
        else:
            self.monitor_op = np.less
            self.best = np.Inf
        
# Infinite patience disables early stopping here; the callback is used only for checkpointing.
my_callbacks = CustomCallbacks(filename="CYP1A2_conv1.hdf5", monitor='val_acc', patience=np.Inf)
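Much of this callback duplicates Keras's built-in ModelCheckpoint and EarlyStopping; a roughly equivalent setup, minus the custom per-epoch printout and history lists, would be a sketch like:

from keras.callbacks import ModelCheckpoint, EarlyStopping
checkpoint = ModelCheckpoint("CYP1A2_conv1.hdf5", monitor='val_acc',
                             save_best_only=True, verbose=1)
early_stop = EarlyStopping(monitor='val_acc', patience=10)
# model.fit(..., callbacks=[checkpoint, early_stop])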

In [15]:
from keras.layers.pooling import MaxPooling1D
from keras.layers.advanced_activations import LeakyReLU
from keras.layers.recurrent import LSTM 
from keras.layers import Flatten

In [57]:
model = Sequential()
model.add(Embedding(input_dim=max_features, output_dim=50, input_length=maxlen))
model.add(Dropout(0.5))

model.add(Conv1D(filters=500, kernel_size=7, padding='valid', strides=1))
model.add(LeakyReLU(0.3))  # in our runs LeakyReLU gave ~0.004 higher accuracy than ReLU
model.add(GlobalMaxPooling1D())
#model.add(MaxPooling1D(pool_size=2))


model.add(Dense(250))
model.add(Dropout(0.5))
model.add(LeakyReLU(0.3))

model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print(model.summary())


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_23 (Embedding)     (None, 226, 50)           2150      
_________________________________________________________________
dropout_58 (Dropout)         (None, 226, 50)           0         
_________________________________________________________________
conv1d_24 (Conv1D)           (None, 220, 500)          175500    
_________________________________________________________________
leaky_re_lu_44 (LeakyReLU)   (None, 220, 500)          0         
_________________________________________________________________
global_max_pooling1d_17 (Glo (None, 500)               0         
_________________________________________________________________
dense_41 (Dense)             (None, 250)               125250    
_________________________________________________________________
dropout_59 (Dropout)         (None, 250)               0         
_________________________________________________________________
leaky_re_lu_45 (LeakyReLU)   (None, 250)               0         
_________________________________________________________________
dense_42 (Dense)             (None, 1)                 251       
_________________________________________________________________
activation_21 (Activation)   (None, 1)                 0         
=================================================================
Total params: 303,151
Trainable params: 303,151
Non-trainable params: 0
_________________________________________________________________
None
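The shapes in the summary can be checked by hand: with 'valid' padding the conv output length is maxlen - kernel_size + 1 = 226 - 7 + 1 = 220, and the parameter counts follow directly from the layer sizes (a quick sanity check, not part of the original run):

# Embedding: vocabulary size x embedding dimension
assert 43 * 50 == 2150
# Conv1D: (kernel_size * input_channels + 1 bias) * filters
assert (7 * 50 + 1) * 500 == 175500
# Dense: (inputs + 1 bias) * units
assert (500 + 1) * 250 == 125250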

In [ ]:
# Keras's validation_split takes the last slice of the data without shuffling,
# so the data was shuffled beforehand with scikit-learn's train_test_split.
model.fit(X_train, y_train,
          batch_size=100,
          epochs=100,
          callbacks=[my_callbacks],
          validation_split=0.2,
          verbose=0)


acc: 0.691 , loss: 0.582 , val_acc: 0.747 , val_loss: 0.524
Epoch 00000: val_acc improved from -inf to 0.74675, saving model to CYP1A2_conv1.hdf5
acc: 0.745 , loss: 0.520 , val_acc: 0.740 , val_loss: 0.522
Epoch 00001: val_acc did not improve
acc: 0.759 , loss: 0.502 , val_acc: 0.769 , val_loss: 0.496
Epoch 00002: val_acc improved from 0.74675 to 0.76855, saving model to CYP1A2_conv1.hdf5
acc: 0.763 , loss: 0.496 , val_acc: 0.769 , val_loss: 0.505
Epoch 00003: val_acc improved from 0.76855 to 0.76897, saving model to CYP1A2_conv1.hdf5
acc: 0.771 , loss: 0.482 , val_acc: 0.785 , val_loss: 0.478
Epoch 00004: val_acc improved from 0.76897 to 0.78532, saving model to CYP1A2_conv1.hdf5
acc: 0.781 , loss: 0.471 , val_acc: 0.790 , val_loss: 0.473
Epoch 00005: val_acc improved from 0.78532 to 0.79036, saving model to CYP1A2_conv1.hdf5
acc: 0.783 , loss: 0.462 , val_acc: 0.788 , val_loss: 0.486
Epoch 00006: val_acc did not improve
acc: 0.784 , loss: 0.461 , val_acc: 0.791 , val_loss: 0.476
Epoch 00007: val_acc improved from 0.79036 to 0.79119, saving model to CYP1A2_conv1.hdf5
acc: 0.792 , loss: 0.448 , val_acc: 0.795 , val_loss: 0.488
Epoch 00008: val_acc improved from 0.79119 to 0.79455, saving model to CYP1A2_conv1.hdf5
acc: 0.795 , loss: 0.446 , val_acc: 0.748 , val_loss: 0.551
Epoch 00009: val_acc did not improve
acc: 0.792 , loss: 0.448 , val_acc: 0.797 , val_loss: 0.457
Epoch 00010: val_acc improved from 0.79455 to 0.79748, saving model to CYP1A2_conv1.hdf5
acc: 0.800 , loss: 0.436 , val_acc: 0.795 , val_loss: 0.476
Epoch 00011: val_acc did not improve
acc: 0.803 , loss: 0.430 , val_acc: 0.789 , val_loss: 0.494
Epoch 00012: val_acc did not improve
acc: 0.813 , loss: 0.416 , val_acc: 0.777 , val_loss: 0.503
Epoch 00013: val_acc did not improve
acc: 0.809 , loss: 0.418 , val_acc: 0.743 , val_loss: 0.554
Epoch 00014: val_acc did not improve
acc: 0.812 , loss: 0.413 , val_acc: 0.806 , val_loss: 0.461
Epoch 00015: val_acc improved from 0.79748 to 0.80629, saving model to CYP1A2_conv1.hdf5
acc: 0.819 , loss: 0.403 , val_acc: 0.797 , val_loss: 0.460
Epoch 00016: val_acc did not improve
acc: 0.822 , loss: 0.401 , val_acc: 0.792 , val_loss: 0.477
Epoch 00017: val_acc did not improve
acc: 0.828 , loss: 0.391 , val_acc: 0.806 , val_loss: 0.452
Epoch 00018: val_acc improved from 0.80629 to 0.80629, saving model to CYP1A2_conv1.hdf5
acc: 0.827 , loss: 0.393 , val_acc: 0.803 , val_loss: 0.468
Epoch 00019: val_acc did not improve
acc: 0.821 , loss: 0.392 , val_acc: 0.797 , val_loss: 0.482
Epoch 00020: val_acc did not improve
acc: 0.821 , loss: 0.392 , val_acc: 0.799 , val_loss: 0.465
Epoch 00021: val_acc did not improve
acc: 0.833 , loss: 0.376 , val_acc: 0.783 , val_loss: 0.502
Epoch 00022: val_acc did not improve
acc: 0.827 , loss: 0.379 , val_acc: 0.787 , val_loss: 0.517
Epoch 00023: val_acc did not improve
acc: 0.840 , loss: 0.369 , val_acc: 0.808 , val_loss: 0.455
Epoch 00024: val_acc improved from 0.80629 to 0.80797, saving model to CYP1A2_conv1.hdf5
acc: 0.831 , loss: 0.374 , val_acc: 0.797 , val_loss: 0.468
Epoch 00025: val_acc did not improve
acc: 0.842 , loss: 0.363 , val_acc: 0.797 , val_loss: 0.487
Epoch 00026: val_acc did not improve
acc: 0.837 , loss: 0.366 , val_acc: 0.796 , val_loss: 0.463
Epoch 00027: val_acc did not improve
acc: 0.848 , loss: 0.355 , val_acc: 0.802 , val_loss: 0.484
Epoch 00028: val_acc did not improve
acc: 0.842 , loss: 0.356 , val_acc: 0.767 , val_loss: 0.561
Epoch 00029: val_acc did not improve
acc: 0.832 , loss: 0.370 , val_acc: 0.797 , val_loss: 0.500
Epoch 00030: val_acc did not improve
acc: 0.848 , loss: 0.346 , val_acc: 0.807 , val_loss: 0.473
Epoch 00031: val_acc did not improve
acc: 0.850 , loss: 0.346 , val_acc: 0.801 , val_loss: 0.483
Epoch 00032: val_acc did not improve
acc: 0.843 , loss: 0.359 , val_acc: 0.803 , val_loss: 0.474
Epoch 00033: val_acc did not improve
acc: 0.857 , loss: 0.338 , val_acc: 0.805 , val_loss: 0.487
Epoch 00034: val_acc did not improve
acc: 0.849 , loss: 0.346 , val_acc: 0.807 , val_loss: 0.481
Epoch 00035: val_acc did not improve
acc: 0.856 , loss: 0.332 , val_acc: 0.804 , val_loss: 0.478
Epoch 00036: val_acc did not improve
acc: 0.849 , loss: 0.339 , val_acc: 0.801 , val_loss: 0.506
Epoch 00037: val_acc did not improve
acc: 0.857 , loss: 0.326 , val_acc: 0.792 , val_loss: 0.518
Epoch 00038: val_acc did not improve
acc: 0.850 , loss: 0.333 , val_acc: 0.808 , val_loss: 0.483
Epoch 00039: val_acc improved from 0.80797 to 0.80839, saving model to CYP1A2_conv1.hdf5
acc: 0.858 , loss: 0.330 , val_acc: 0.769 , val_loss: 0.582
Epoch 00040: val_acc did not improve
acc: 0.861 , loss: 0.326 , val_acc: 0.801 , val_loss: 0.492
Epoch 00041: val_acc did not improve
acc: 0.861 , loss: 0.319 , val_acc: 0.800 , val_loss: 0.505
Epoch 00042: val_acc did not improve
acc: 0.860 , loss: 0.316 , val_acc: 0.796 , val_loss: 0.522
Epoch 00043: val_acc did not improve
acc: 0.861 , loss: 0.322 , val_acc: 0.787 , val_loss: 0.527
Epoch 00044: val_acc did not improve
acc: 0.873 , loss: 0.306 , val_acc: 0.804 , val_loss: 0.488
Epoch 00045: val_acc did not improve
acc: 0.867 , loss: 0.314 , val_acc: 0.809 , val_loss: 0.483
Epoch 00046: val_acc improved from 0.80839 to 0.80881, saving model to CYP1A2_conv1.hdf5
acc: 0.872 , loss: 0.304 , val_acc: 0.795 , val_loss: 0.550
Epoch 00047: val_acc did not improve
acc: 0.863 , loss: 0.323 , val_acc: 0.749 , val_loss: 0.684
Epoch 00048: val_acc did not improve
acc: 0.870 , loss: 0.303 , val_acc: 0.802 , val_loss: 0.497
Epoch 00049: val_acc did not improve
acc: 0.874 , loss: 0.307 , val_acc: 0.803 , val_loss: 0.494
Epoch 00050: val_acc did not improve
acc: 0.871 , loss: 0.300 , val_acc: 0.806 , val_loss: 0.519
Epoch 00051: val_acc did not improve
acc: 0.868 , loss: 0.309 , val_acc: 0.810 , val_loss: 0.511
Epoch 00052: val_acc improved from 0.80881 to 0.81006, saving model to CYP1A2_conv1.hdf5
acc: 0.871 , loss: 0.305 , val_acc: 0.809 , val_loss: 0.480
Epoch 00053: val_acc did not improve
acc: 0.874 , loss: 0.296 , val_acc: 0.810 , val_loss: 0.492
Epoch 00054: val_acc did not improve
acc: 0.872 , loss: 0.296 , val_acc: 0.812 , val_loss: 0.484
Epoch 00055: val_acc improved from 0.81006 to 0.81216, saving model to CYP1A2_conv1.hdf5
acc: 0.866 , loss: 0.309 , val_acc: 0.810 , val_loss: 0.510
Epoch 00056: val_acc did not improve
acc: 0.872 , loss: 0.297 , val_acc: 0.813 , val_loss: 0.489
Epoch 00057: val_acc improved from 0.81216 to 0.81258, saving model to CYP1A2_conv1.hdf5
acc: 0.873 , loss: 0.296 , val_acc: 0.792 , val_loss: 0.560
Epoch 00058: val_acc did not improve
acc: 0.877 , loss: 0.287 , val_acc: 0.807 , val_loss: 0.522
Epoch 00059: val_acc did not improve
acc: 0.876 , loss: 0.288 , val_acc: 0.812 , val_loss: 0.489
Epoch 00060: val_acc did not improve
acc: 0.874 , loss: 0.295 , val_acc: 0.809 , val_loss: 0.486
Epoch 00061: val_acc did not improve
acc: 0.875 , loss: 0.294 , val_acc: 0.810 , val_loss: 0.491
Epoch 00062: val_acc did not improve
acc: 0.877 , loss: 0.283 , val_acc: 0.797 , val_loss: 0.547
Epoch 00063: val_acc did not improve
acc: 0.877 , loss: 0.284 , val_acc: 0.803 , val_loss: 0.515
Epoch 00064: val_acc did not improve
acc: 0.883 , loss: 0.275 , val_acc: 0.806 , val_loss: 0.490
Epoch 00065: val_acc did not improve
acc: 0.882 , loss: 0.275 , val_acc: 0.799 , val_loss: 0.556
Epoch 00066: val_acc did not improve
acc: 0.884 , loss: 0.271 , val_acc: 0.804 , val_loss: 0.544
Epoch 00067: val_acc did not improve
acc: 0.878 , loss: 0.284 , val_acc: 0.806 , val_loss: 0.528
Epoch 00068: val_acc did not improve
acc: 0.888 , loss: 0.266 , val_acc: 0.806 , val_loss: 0.540
Epoch 00069: val_acc did not improve
acc: 0.884 , loss: 0.273 , val_acc: 0.803 , val_loss: 0.548
Epoch 00070: val_acc did not improve
acc: 0.886 , loss: 0.271 , val_acc: 0.795 , val_loss: 0.539
Epoch 00071: val_acc did not improve
acc: 0.888 , loss: 0.266 , val_acc: 0.797 , val_loss: 0.592
Epoch 00072: val_acc did not improve
acc: 0.890 , loss: 0.266 , val_acc: 0.802 , val_loss: 0.540
Epoch 00073: val_acc did not improve
acc: 0.887 , loss: 0.269 , val_acc: 0.794 , val_loss: 0.527
Epoch 00074: val_acc did not improve
acc: 0.887 , loss: 0.264 , val_acc: 0.798 , val_loss: 0.546
Epoch 00075: val_acc did not improve
acc: 0.887 , loss: 0.264 , val_acc: 0.799 , val_loss: 0.568
Epoch 00076: val_acc did not improve
acc: 0.895 , loss: 0.257 , val_acc: 0.805 , val_loss: 0.527
Epoch 00077: val_acc did not improve
acc: 0.890 , loss: 0.261 , val_acc: 0.797 , val_loss: 0.593
Epoch 00078: val_acc did not improve
acc: 0.889 , loss: 0.258 , val_acc: 0.806 , val_loss: 0.553
Epoch 00079: val_acc did not improve
acc: 0.891 , loss: 0.261 , val_acc: 0.799 , val_loss: 0.536
Epoch 00080: val_acc did not improve
acc: 0.893 , loss: 0.259 , val_acc: 0.806 , val_loss: 0.533
Epoch 00081: val_acc did not improve
acc: 0.893 , loss: 0.257 , val_acc: 0.805 , val_loss: 0.566
Epoch 00082: val_acc did not improve
acc: 0.887 , loss: 0.270 , val_acc: 0.795 , val_loss: 0.556
Epoch 00083: val_acc did not improve
acc: 0.891 , loss: 0.258 , val_acc: 0.798 , val_loss: 0.554
Epoch 00084: val_acc did not improve
acc: 0.894 , loss: 0.257 , val_acc: 0.789 , val_loss: 0.548
Epoch 00085: val_acc did not improve
acc: 0.889 , loss: 0.257 , val_acc: 0.795 , val_loss: 0.574
Epoch 00086: val_acc did not improve
acc: 0.889 , loss: 0.254 , val_acc: 0.808 , val_loss: 0.561
Epoch 00087: val_acc did not improve
acc: 0.890 , loss: 0.253 , val_acc: 0.769 , val_loss: 0.709
Epoch 00088: val_acc did not improve
acc: 0.899 , loss: 0.243 , val_acc: 0.777 , val_loss: 0.671
Epoch 00089: val_acc did not improve
acc: 0.893 , loss: 0.254 , val_acc: 0.806 , val_loss: 0.532
Epoch 00090: val_acc did not improve
acc: 0.896 , loss: 0.245 , val_acc: 0.790 , val_loss: 0.613
Epoch 00091: val_acc did not improve
acc: 0.892 , loss: 0.259 , val_acc: 0.808 , val_loss: 0.549
Epoch 00092: val_acc did not improve
acc: 0.889 , loss: 0.261 , val_acc: 0.802 , val_loss: 0.552
Epoch 00093: val_acc did not improve

In [24]:
from keras import models
best_model=models.load_model("CYP1A2_conv1_valacc820_testacc808auc883.hdf5")
print(best_model.summary())


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_4 (Embedding)      (None, 226, 50)           2150      
_________________________________________________________________
dropout_6 (Dropout)          (None, 226, 50)           0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 220, 500)          175500    
_________________________________________________________________
leaky_re_lu_5 (LeakyReLU)    (None, 220, 500)          0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 220, 500)          0         
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 500)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 500)               250500    
_________________________________________________________________
dropout_8 (Dropout)          (None, 500)               0         
_________________________________________________________________
leaky_re_lu_6 (LeakyReLU)    (None, 500)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 501       
_________________________________________________________________
activation_4 (Activation)    (None, 1)                 0         
=================================================================
Total params: 428,651
Trainable params: 428,651
Non-trainable params: 0
_________________________________________________________________
None
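Before plotting the ROC curve, the restored checkpoint can also be scored directly on the held-out split; assuming load_model restored the compile settings (binary cross-entropy loss, accuracy metric), evaluate returns [loss, accuracy]:

test_loss, test_acc = best_model.evaluate(X_test, y_test, verbose=0)
print("test accuracy: %0.3f" % test_acc)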

In [40]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
# Score the restored best checkpoint on the held-out test set, then
# flatten the (n, 1) prediction matrix into a 1-D score vector.
y_score = best_model.predict_proba(X_test)
y_score_get_value = y_score.ravel()
    
fpr, tpr, _ = roc_curve(y_test, y_score_get_value)

plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.3f)' % auc(fpr,tpr))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig("roc.svg", format="svg")
plt.show()


2688/2981 [==========================>...] - ETA: 0s
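For a scalar summary alongside the plot, sklearn's roc_auc_score gives the same area directly, and thresholding the scores at 0.5 yields a plain test accuracy (a convenience sketch using the arrays computed above):

from sklearn.metrics import roc_auc_score, accuracy_score
print("test AUC: %0.3f" % roc_auc_score(y_test, y_score_get_value))
y_pred = [int(s > 0.5) for s in y_score_get_value]
print("test accuracy: %0.3f" % accuracy_score(y_test, y_pred))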