In [22]:

    
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

Train/Test split already done



In [23]:

    
#from sklearn.cross_validation import train_test_split

# create 80%-20% train-test split
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5555)



In [24]:

    
twoD6_test = pd.read_csv("data/test2d6.csv", index_col='SID')
twoD6_train = pd.read_csv("data/training2d6.csv", index_col='SID')



In [25]:

    
twoD6_train.head()









    Out[25]:






  
    
      
      ActivityScore
      apol
      a_acc
      a_acid
      a_aro
      a_base
      a_count
      a_don
      a_heavy
      a_hyd
      ...
      vsa_acid
      vsa_base
      vsa_don
      vsa_hyd
      vsa_other
      vsa_pol
      Weight
      weinerPath
      weinerPol
      zagreb
    
    
      SID
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      11114095
       85
       25.967930
       0
       5
        0
       1
       23
       0
       13
        5
      ...
       74.321251
       17.742489
       17.742489
        71.454041
       28.375498
       92.063744
       207.12199
        272
       13
        58
    
    
      11111502
       41
       52.303032
       0
       0
       12
       1
       45
       0
       21
       20
      ...
        0.000000
        5.682576
        5.682576
       254.943600
        0.000000
        5.682576
       278.41901
        810
       40
       118
    
    
      11111413
        0
       35.143032
       0
       0
        0
       3
       36
       0
       12
        9
      ...
        0.000000
       41.167557
       41.167557
       133.040590
        0.000000
       41.167557
       174.31200
        215
       13
        52
    
    
      11112630
       20
       32.461517
       1
       0
        6
       0
       26
       0
       14
       11
      ...
        0.000000
        0.000000
        0.000000
       166.094760
       13.166624
        5.682576
       204.29700
        298
       17
        76
    
    
      11110827
        0
       64.246994
       5
       0
        0
       0
       57
       3
       28
       21
      ...
        0.000000
        0.000000
        0.000000
       267.817900
       16.917038
       67.834602
       408.92200
       1670
       70
       168
    
  

5 rows × 187 columns



In [26]:

    
col_names2D6 = twoD6_train.columns.tolist()

print('Column names:')
print(col_names2D6)









    



Column names:
['ActivityScore', 'apol', 'a_acc', 'a_acid', 'a_aro', 'a_base', 'a_count', 'a_don', 'a_heavy', 'a_hyd', 'a_IC', 'a_ICM', 'a_nB', 'a_nBr', 'a_nC', 'a_nCl', 'a_nF', 'a_nH', 'a_nI', 'a_nN', 'a_nO', 'a_nP', 'a_nS', 'balabanJ', 'BCUT_PEOE_0', 'BCUT_PEOE_1', 'BCUT_PEOE_2', 'BCUT_PEOE_3', 'BCUT_SLOGP_0', 'BCUT_SLOGP_1', 'BCUT_SLOGP_2', 'BCUT_SLOGP_3', 'BCUT_SMR_0', 'BCUT_SMR_1', 'BCUT_SMR_2', 'BCUT_SMR_3', 'bpol', 'b_1rotN', 'b_1rotR', 'b_ar', 'b_count', 'b_double', 'b_heavy', 'b_rotN', 'b_rotR', 'b_single', 'b_triple', 'chi0', 'chi0v', 'chi0v_C', 'chi0_C', 'chi1', 'chi1v', 'chi1v_C', 'chi1_C', 'chiral', 'chiral_u', 'density', 'diameter', 'FCharge', 'GCUT_PEOE_0', 'GCUT_PEOE_1', 'GCUT_PEOE_2', 'GCUT_PEOE_3', 'GCUT_SLOGP_0', 'GCUT_SLOGP_1', 'GCUT_SLOGP_2', 'GCUT_SLOGP_3', 'GCUT_SMR_0', 'GCUT_SMR_1', 'GCUT_SMR_2', 'GCUT_SMR_3', 'Kier1', 'Kier2', 'Kier3', 'KierA1', 'KierA2', 'KierA3', 'KierFlex', 'lip_acc', 'lip_don', 'lip_druglike', 'lip_violation', 'logP(o/w)', 'logS', 'mr', 'mutagenic', 'nmol', 'opr_brigid', 'opr_leadlike', 'opr_nring', 'opr_nrot', 'opr_violation', 'PC+', 'PC-', 'PEOE_PC+', 'PEOE_PC-', 'PEOE_RPC+', 'PEOE_RPC-', 'PEOE_VSA+0', 'PEOE_VSA+1', 'PEOE_VSA+2', 'PEOE_VSA+3', 'PEOE_VSA+4', 'PEOE_VSA+5', 'PEOE_VSA+6', 'PEOE_VSA-0', 'PEOE_VSA-1', 'PEOE_VSA-2', 'PEOE_VSA-3', 'PEOE_VSA-4', 'PEOE_VSA-5', 'PEOE_VSA-6', 'PEOE_VSA_FHYD', 'PEOE_VSA_FNEG', 'PEOE_VSA_FPNEG', 'PEOE_VSA_FPOL', 'PEOE_VSA_FPOS', 'PEOE_VSA_FPPOS', 'PEOE_VSA_HYD', 'PEOE_VSA_NEG', 'PEOE_VSA_PNEG', 'PEOE_VSA_POL', 'PEOE_VSA_POS', 'PEOE_VSA_PPOS', 'petitjean', 'petitjeanSC', 'Q_PC+', 'Q_PC-', 'Q_RPC+', 'Q_RPC-', 'Q_VSA_FHYD', 'Q_VSA_FNEG', 'Q_VSA_FPNEG', 'Q_VSA_FPOL', 'Q_VSA_FPOS', 'Q_VSA_FPPOS', 'Q_VSA_HYD', 'Q_VSA_NEG', 'Q_VSA_PNEG', 'Q_VSA_POL', 'Q_VSA_POS', 'Q_VSA_PPOS', 'radius', 'reactive', 'rings', 'RPC+', 'RPC-', 'rsynth', 'SlogP', 'SlogP_VSA0', 'SlogP_VSA1', 'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7', 'SlogP_VSA8', 'SlogP_VSA9', 'SMR', 'SMR_VSA0', 'SMR_VSA1', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'TPSA', 'VAdjEq', 'VAdjMa', 'VDistEq', 'VDistMa', 'vdw_area', 'vdw_vol', 'vsa_acc', 'vsa_acid', 'vsa_base', 'vsa_don', 'vsa_hyd', 'vsa_other', 'vsa_pol', 'Weight', 'weinerPath', 'weinerPol', 'zagreb']



In [27]:

    
# Isolate response variable
ActivityScore = twoD6_train['ActivityScore']
y_train = np.where(ActivityScore >= 40,1,0)

ActivityScore2 = twoD6_test['ActivityScore']
y_test = np.where(ActivityScore2 >= 40,1,0)



In [28]:

    
# looks right sized
y_train.shape, y_test.shape









    Out[28]:





((4433,), (1109,))



In [29]:

    
y_test









    Out[29]:





array([1, 1, 1, ..., 1, 0, 1])



In [30]:

    
# We don't need this column anymore
to_drop = ['ActivityScore']
inhib_feat_space = twoD6_train.drop(to_drop,axis=1)
inhib_feat_space_test = twoD6_test.drop(to_drop,axis=1)



In [31]:

    
# Pull out features for future use
features = inhib_feat_space.columns
features_test = inhib_feat_space_test.columns



In [32]:

    
X_train = inhib_feat_space.as_matrix().astype(np.float)
X_test = inhib_feat_space_test.as_matrix().astype(np.float)



In [33]:

    
X_train.shape, X_test.shape









    Out[33]:





((4433, 186), (1109, 186))



In [34]:

    
n_pos1 = y_test.sum()
n_pos1









    Out[34]:





552



In [35]:

    
n_pos2 = y_train.sum()
n_pos2









    Out[35]:





2219



In [36]:

    
print('Feature space holds '+repr(X_train.shape[0])+' observations and '+repr(X_test.shape[1])+' features')
print('Unique target labels: '+repr(np.unique(y_train)))

print('Feature space holds '+repr(X_test.shape[0])+' observations and '+repr(X_test.shape[1])+' features')
print('Unique target labels: '+repr(np.unique(y_test)))









    



Feature space holds 4433 observations and 186 features
Unique target labels: array([0, 1])
Feature space holds 1109 observations and 186 features
Unique target labels: array([0, 1])



In [37]:

    
X_test.shape[1]









    Out[37]:





186

Scale the features before training model



In [38]:

    
# This is important
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)



In [39]:

    
from sklearn.cross_validation import KFold

def run_cv(X,y,clf_class,**kwargs):
    # Construct a kfolds object
    kf = KFold(len(y),n_folds=5,shuffle=True)
    y_pred = y.copy()
    
    # Iterate through folds
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # Initialize a classifier with key word arguments
        clf = clf_class(**kwargs)
        clf.fit(X_train,y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred



In [40]:

    
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN

def accuracy(y_true,y_pred):
    # NumPy interpretes True and False as 1. and 0.
    return np.mean(y_true == y_pred)

print("K-nearest-neighbors (training set):")
print("%.3f" % accuracy(y_train, run_cv(X_train,y_train,KNN)))
print("K-nearest-neighbors (test set):")
print("%.3f" % accuracy(y_test, run_cv(X_test,y_test,KNN)))
print('Support vector machines (training set):')
print("%.3f" % accuracy(y_train, run_cv(X_train,y_train,SVC)))
print('Support vector machines (test set):')
print("%.3f" % accuracy(y_test, run_cv(X_test,y_test,SVC)))
print("Random forest (training set):")
print("%.3f" % accuracy(y_train, run_cv(X_train,y_train,RF)))
print("Random forest (test set):")
print("%.3f" % accuracy(y_test, run_cv(X_test,y_test,RF)))









    



K-nearest-neighbors (training set):
0.717
K-nearest-neighbors (test set):
0.678
Support vector machines (training set):
0.755
Support vector machines (test set):
0.725
Random forest (training set):
0.729
Random forest (test set):
0.707



In [41]:

    
from sklearn.metrics import confusion_matrix

y_train = np.array(y_train)
class_names = np.unique(y_train)

confusion_matrices_training = [
    ( "K-Nearest-Neighbors training", confusion_matrix(y_train,run_cv(X_train,y_train,KNN)) ),
    ( "Support Vector Machines training", confusion_matrix(y_train,run_cv(X_train,y_train,SVC)) ),
    ( "Random Forest taining", confusion_matrix(y_train,run_cv(X_train,y_train,RF)) ),
]

y_test = np.array(y_test)
class_names = np.unique(y_test)

confusion_matrices_test = [
    ( "K-Nearest-Neighbors test", confusion_matrix(y_test,run_cv(X_test,y_test,KNN)) ),
    ( "Support Vector Machines test", confusion_matrix(y_test,run_cv(X_test,y_test,SVC)) ),
    ( "Random Forest test", confusion_matrix(y_test,run_cv(X_test,y_test,RF)) ),
]

#draw_confusion_matrices(confusion_matrices,class_names)
confusion_matrices_training, confusion_matrices_test









    Out[41]:





([('K-Nearest-Neighbors training', array([[1537,  677],
          [ 553, 1666]])),
  ('Support Vector Machines training', array([[1687,  527],
          [ 543, 1676]])),
  ('Random Forest taining', array([[1732,  482],
          [ 760, 1459]]))],
 [('K-Nearest-Neighbors test', array([[366, 191],
          [171, 381]])), ('Support Vector Machines test', array([[440, 117],
          [184, 368]])), ('Random Forest test', array([[423, 134],
          [218, 334]]))])



In [46]:

    
roc_auc_score(is_churn, pred_churn)









    Out[46]:





0.79710213956578524



In [47]:

	ActivityScore	apol	a_acc	a_acid	a_aro	a_base	a_count	a_don	a_heavy	a_hyd	...	vsa_acid	vsa_base	vsa_don	vsa_hyd	vsa_other	vsa_pol	Weight	weinerPath	weinerPol	zagreb
SID
11114095	85	25.967930	0	5	0	1	23	0	13	5	...	74.321251	17.742489	17.742489	71.454041	28.375498	92.063744	207.12199	272	13	58
11111502	41	52.303032	0	0	12	1	45	0	21	20	...	0.000000	5.682576	5.682576	254.943600	0.000000	5.682576	278.41901	810	40	118
11111413	0	35.143032	0	0	0	3	36	0	12	9	...	0.000000	41.167557	41.167557	133.040590	0.000000	41.167557	174.31200	215	13	52
11112630	20	32.461517	1	0	6	0	26	0	14	11	...	0.000000	0.000000	0.000000	166.094760	13.166624	5.682576	204.29700	298	17	76
11110827	0	64.246994	5	0	0	0	57	3	28	21	...	0.000000	0.000000	0.000000	267.817900	16.917038	67.834602	408.92200	1670	70	168