In [22]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

Train/Test split already done


In [23]:
#from sklearn.cross_validation import train_test_split

# create 80%-20% train-test split
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5555)

In [24]:
twoD6_test = pd.read_csv("data/test2d6.csv", index_col='SID')
twoD6_train = pd.read_csv("data/training2d6.csv", index_col='SID')

In [25]:
twoD6_train.head()


Out[25]:
ActivityScore apol a_acc a_acid a_aro a_base a_count a_don a_heavy a_hyd ... vsa_acid vsa_base vsa_don vsa_hyd vsa_other vsa_pol Weight weinerPath weinerPol zagreb
SID
11114095 85 25.967930 0 5 0 1 23 0 13 5 ... 74.321251 17.742489 17.742489 71.454041 28.375498 92.063744 207.12199 272 13 58
11111502 41 52.303032 0 0 12 1 45 0 21 20 ... 0.000000 5.682576 5.682576 254.943600 0.000000 5.682576 278.41901 810 40 118
11111413 0 35.143032 0 0 0 3 36 0 12 9 ... 0.000000 41.167557 41.167557 133.040590 0.000000 41.167557 174.31200 215 13 52
11112630 20 32.461517 1 0 6 0 26 0 14 11 ... 0.000000 0.000000 0.000000 166.094760 13.166624 5.682576 204.29700 298 17 76
11110827 0 64.246994 5 0 0 0 57 3 28 21 ... 0.000000 0.000000 0.000000 267.817900 16.917038 67.834602 408.92200 1670 70 168

5 rows × 187 columns


In [26]:
col_names2D6 = twoD6_train.columns.tolist()

print('Column names:')
print(col_names2D6)


Column names:
['ActivityScore', 'apol', 'a_acc', 'a_acid', 'a_aro', 'a_base', 'a_count', 'a_don', 'a_heavy', 'a_hyd', 'a_IC', 'a_ICM', 'a_nB', 'a_nBr', 'a_nC', 'a_nCl', 'a_nF', 'a_nH', 'a_nI', 'a_nN', 'a_nO', 'a_nP', 'a_nS', 'balabanJ', 'BCUT_PEOE_0', 'BCUT_PEOE_1', 'BCUT_PEOE_2', 'BCUT_PEOE_3', 'BCUT_SLOGP_0', 'BCUT_SLOGP_1', 'BCUT_SLOGP_2', 'BCUT_SLOGP_3', 'BCUT_SMR_0', 'BCUT_SMR_1', 'BCUT_SMR_2', 'BCUT_SMR_3', 'bpol', 'b_1rotN', 'b_1rotR', 'b_ar', 'b_count', 'b_double', 'b_heavy', 'b_rotN', 'b_rotR', 'b_single', 'b_triple', 'chi0', 'chi0v', 'chi0v_C', 'chi0_C', 'chi1', 'chi1v', 'chi1v_C', 'chi1_C', 'chiral', 'chiral_u', 'density', 'diameter', 'FCharge', 'GCUT_PEOE_0', 'GCUT_PEOE_1', 'GCUT_PEOE_2', 'GCUT_PEOE_3', 'GCUT_SLOGP_0', 'GCUT_SLOGP_1', 'GCUT_SLOGP_2', 'GCUT_SLOGP_3', 'GCUT_SMR_0', 'GCUT_SMR_1', 'GCUT_SMR_2', 'GCUT_SMR_3', 'Kier1', 'Kier2', 'Kier3', 'KierA1', 'KierA2', 'KierA3', 'KierFlex', 'lip_acc', 'lip_don', 'lip_druglike', 'lip_violation', 'logP(o/w)', 'logS', 'mr', 'mutagenic', 'nmol', 'opr_brigid', 'opr_leadlike', 'opr_nring', 'opr_nrot', 'opr_violation', 'PC+', 'PC-', 'PEOE_PC+', 'PEOE_PC-', 'PEOE_RPC+', 'PEOE_RPC-', 'PEOE_VSA+0', 'PEOE_VSA+1', 'PEOE_VSA+2', 'PEOE_VSA+3', 'PEOE_VSA+4', 'PEOE_VSA+5', 'PEOE_VSA+6', 'PEOE_VSA-0', 'PEOE_VSA-1', 'PEOE_VSA-2', 'PEOE_VSA-3', 'PEOE_VSA-4', 'PEOE_VSA-5', 'PEOE_VSA-6', 'PEOE_VSA_FHYD', 'PEOE_VSA_FNEG', 'PEOE_VSA_FPNEG', 'PEOE_VSA_FPOL', 'PEOE_VSA_FPOS', 'PEOE_VSA_FPPOS', 'PEOE_VSA_HYD', 'PEOE_VSA_NEG', 'PEOE_VSA_PNEG', 'PEOE_VSA_POL', 'PEOE_VSA_POS', 'PEOE_VSA_PPOS', 'petitjean', 'petitjeanSC', 'Q_PC+', 'Q_PC-', 'Q_RPC+', 'Q_RPC-', 'Q_VSA_FHYD', 'Q_VSA_FNEG', 'Q_VSA_FPNEG', 'Q_VSA_FPOL', 'Q_VSA_FPOS', 'Q_VSA_FPPOS', 'Q_VSA_HYD', 'Q_VSA_NEG', 'Q_VSA_PNEG', 'Q_VSA_POL', 'Q_VSA_POS', 'Q_VSA_PPOS', 'radius', 'reactive', 'rings', 'RPC+', 'RPC-', 'rsynth', 'SlogP', 'SlogP_VSA0', 'SlogP_VSA1', 'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7', 'SlogP_VSA8', 'SlogP_VSA9', 'SMR', 'SMR_VSA0', 'SMR_VSA1', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'TPSA', 'VAdjEq', 'VAdjMa', 'VDistEq', 'VDistMa', 'vdw_area', 'vdw_vol', 'vsa_acc', 'vsa_acid', 'vsa_base', 'vsa_don', 'vsa_hyd', 'vsa_other', 'vsa_pol', 'Weight', 'weinerPath', 'weinerPol', 'zagreb']

In [27]:
# Isolate response variable
ActivityScore = twoD6_train['ActivityScore']
y_train = np.where(ActivityScore >= 40,1,0)

ActivityScore2 = twoD6_test['ActivityScore']
y_test = np.where(ActivityScore2 >= 40,1,0)

In [28]:
# looks right sized
y_train.shape, y_test.shape


Out[28]:
((4433,), (1109,))

In [29]:
y_test


Out[29]:
array([1, 1, 1, ..., 1, 0, 1])

In [30]:
# We don't need this column anymore
to_drop = ['ActivityScore']
inhib_feat_space = twoD6_train.drop(to_drop,axis=1)
inhib_feat_space_test = twoD6_test.drop(to_drop,axis=1)

In [31]:
# Pull out features for future use
features = inhib_feat_space.columns
features_test = inhib_feat_space_test.columns

In [32]:
X_train = inhib_feat_space.as_matrix().astype(np.float)
X_test = inhib_feat_space_test.as_matrix().astype(np.float)

In [33]:
X_train.shape, X_test.shape


Out[33]:
((4433, 186), (1109, 186))

In [34]:
n_pos1 = y_test.sum()
n_pos1


Out[34]:
552

In [35]:
n_pos2 = y_train.sum()
n_pos2


Out[35]:
2219

In [36]:
print('Feature space holds '+repr(X_train.shape[0])+' observations and '+repr(X_test.shape[1])+' features')
print('Unique target labels: '+repr(np.unique(y_train)))

print('Feature space holds '+repr(X_test.shape[0])+' observations and '+repr(X_test.shape[1])+' features')
print('Unique target labels: '+repr(np.unique(y_test)))


Feature space holds 4433 observations and 186 features
Unique target labels: array([0, 1])
Feature space holds 1109 observations and 186 features
Unique target labels: array([0, 1])

In [37]:
X_test.shape[1]


Out[37]:
186

Scale the features before training model


In [38]:
# This is important
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [39]:
from sklearn.cross_validation import KFold

def run_cv(X,y,clf_class,**kwargs):
    # Construct a kfolds object
    kf = KFold(len(y),n_folds=5,shuffle=True)
    y_pred = y.copy()
    
    # Iterate through folds
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # Initialize a classifier with key word arguments
        clf = clf_class(**kwargs)
        clf.fit(X_train,y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred

In [40]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN

def accuracy(y_true,y_pred):
    # NumPy interpretes True and False as 1. and 0.
    return np.mean(y_true == y_pred)

print("K-nearest-neighbors (training set):")
print("%.3f" % accuracy(y_train, run_cv(X_train,y_train,KNN)))
print("K-nearest-neighbors (test set):")
print("%.3f" % accuracy(y_test, run_cv(X_test,y_test,KNN)))
print('Support vector machines (training set):')
print("%.3f" % accuracy(y_train, run_cv(X_train,y_train,SVC)))
print('Support vector machines (test set):')
print("%.3f" % accuracy(y_test, run_cv(X_test,y_test,SVC)))
print("Random forest (training set):")
print("%.3f" % accuracy(y_train, run_cv(X_train,y_train,RF)))
print("Random forest (test set):")
print("%.3f" % accuracy(y_test, run_cv(X_test,y_test,RF)))


K-nearest-neighbors (training set):
0.717
K-nearest-neighbors (test set):
0.678
Support vector machines (training set):
0.755
Support vector machines (test set):
0.725
Random forest (training set):
0.729
Random forest (test set):
0.707

In [41]:
from sklearn.metrics import confusion_matrix

y_train = np.array(y_train)
class_names = np.unique(y_train)

confusion_matrices_training = [
    ( "K-Nearest-Neighbors training", confusion_matrix(y_train,run_cv(X_train,y_train,KNN)) ),
    ( "Support Vector Machines training", confusion_matrix(y_train,run_cv(X_train,y_train,SVC)) ),
    ( "Random Forest taining", confusion_matrix(y_train,run_cv(X_train,y_train,RF)) ),
]

y_test = np.array(y_test)
class_names = np.unique(y_test)

confusion_matrices_test = [
    ( "K-Nearest-Neighbors test", confusion_matrix(y_test,run_cv(X_test,y_test,KNN)) ),
    ( "Support Vector Machines test", confusion_matrix(y_test,run_cv(X_test,y_test,SVC)) ),
    ( "Random Forest test", confusion_matrix(y_test,run_cv(X_test,y_test,RF)) ),
]

#draw_confusion_matrices(confusion_matrices,class_names)
confusion_matrices_training, confusion_matrices_test


Out[41]:
([('K-Nearest-Neighbors training', array([[1537,  677],
          [ 553, 1666]])),
  ('Support Vector Machines training', array([[1687,  527],
          [ 543, 1676]])),
  ('Random Forest taining', array([[1732,  482],
          [ 760, 1459]]))],
 [('K-Nearest-Neighbors test', array([[366, 191],
          [171, 381]])), ('Support Vector Machines test', array([[440, 117],
          [184, 368]])), ('Random Forest test', array([[423, 134],
          [218, 334]]))])

In [46]:
roc_auc_score(is_churn, pred_churn)


Out[46]:
0.79710213956578524

In [47]: