In [22]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
In [23]:
# Not needed here: the data already ship as separate training and test files.
# (train_test_split now lives in sklearn.model_selection; sklearn.cross_validation was removed.)
#from sklearn.model_selection import train_test_split
# create 80%-20% train-test split
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5555)
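For reference, a minimal sketch of that kind of split with the current scikit-learn API, using toy stand-in arrays since a combined X and y are never defined in this notebook:
from sklearn.model_selection import train_test_split
import numpy as np
# Toy stand-ins for a combined feature matrix and label vector (hypothetical data).
X_demo = np.random.rand(100, 5)
y_demo = np.random.randint(0, 2, size=100)
# 80%-20% split, stratified so both classes appear in each part.
X_tr, X_te, y_tr, y_te = train_test_split(
    X_demo, y_demo, test_size=0.2, random_state=5555, stratify=y_demo)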
In [24]:
twoD6_test = pd.read_csv("data/test2d6.csv", index_col='SID')
twoD6_train = pd.read_csv("data/training2d6.csv", index_col='SID')
In [25]:
twoD6_train.head()
Out[25]:
In [26]:
col_names2D6 = twoD6_train.columns.tolist()
print('Column names:')
print(col_names2D6)
In [27]:
# Isolate response variable
ActivityScore = twoD6_train['ActivityScore']
y_train = np.where(ActivityScore >= 40,1,0)
ActivityScore2 = twoD6_test['ActivityScore']
y_test = np.where(ActivityScore2 >= 40,1,0)
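A quick sanity check on how the ActivityScore >= 40 threshold splits the compounds into classes (np.bincount counts the 0/1 labels):
# Count inactives (0) and actives (1) in each set.
print('train class counts:', np.bincount(y_train))
print('test class counts:', np.bincount(y_test))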
In [28]:
# looks right sized
y_train.shape, y_test.shape
Out[28]:
In [29]:
y_test
Out[29]:
In [30]:
# We don't need this column anymore
to_drop = ['ActivityScore']
inhib_feat_space = twoD6_train.drop(to_drop,axis=1)
inhib_feat_space_test = twoD6_test.drop(to_drop,axis=1)
In [31]:
# Pull out features for future use
features = inhib_feat_space.columns
features_test = inhib_feat_space_test.columns
In [32]:
X_train = inhib_feat_space.to_numpy().astype(np.float64)
X_test = inhib_feat_space_test.to_numpy().astype(np.float64)
In [33]:
X_train.shape, X_test.shape
Out[33]:
In [34]:
n_pos1 = y_test.sum()
n_pos1
Out[34]:
In [35]:
n_pos2 = y_train.sum()
n_pos2
Out[35]:
In [36]:
print('Feature space holds '+repr(X_train.shape[0])+' observations and '+repr(X_train.shape[1])+' features')
print('Unique target labels: '+repr(np.unique(y_train)))
print('Feature space holds '+repr(X_test.shape[0])+' observations and '+repr(X_test.shape[1])+' features')
print('Unique target labels: '+repr(np.unique(y_test)))
In [37]:
X_test.shape[1]
Out[37]:
In [38]:
# Scale features: fit the scaler on the training data only,
# then apply that same transform to the test data to avoid leakage.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
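A common way to keep the scaling step tied to each model fit is a scikit-learn Pipeline; a minimal sketch (not used in the rest of this notebook):
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# The scaler is re-fit on whatever data the pipeline is fit on,
# so each cross-validation fold is scaled without test-fold leakage.
scaled_svc = Pipeline([
    ('scale', StandardScaler()),
    ('svc', SVC()),
])
# scaled_svc.fit(X_train, y_train); scaled_svc.score(X_test, y_test)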
In [39]:
from sklearn.model_selection import KFold
def run_cv(X,y,clf_class,**kwargs):
    # Construct a KFold splitter with 5 shuffled folds
    kf = KFold(n_splits=5, shuffle=True)
    y_pred = y.copy()
    # Iterate through folds, predicting the held-out fold each time
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # Initialize a classifier with keyword arguments
        clf = clf_class(**kwargs)
        clf.fit(X_train, y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred
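scikit-learn's cross_val_predict does essentially the same thing, returning out-of-fold predictions for every sample; a minimal sketch with the k-nearest-neighbors classifier:
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.neighbors import KNeighborsClassifier
# Out-of-fold predictions for every training sample, 5 shuffled folds;
# equivalent in spirit to run_cv above, without the explicit loop.
y_oof = cross_val_predict(KNeighborsClassifier(), X_train, y_train,
                          cv=KFold(n_splits=5, shuffle=True))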
In [40]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN
def accuracy(y_true,y_pred):
    # NumPy interprets True and False as 1. and 0.
    return np.mean(y_true == y_pred)
print("K-nearest-neighbors (training set):")
print("%.3f" % accuracy(y_train, run_cv(X_train,y_train,KNN)))
print("K-nearest-neighbors (test set):")
print("%.3f" % accuracy(y_test, run_cv(X_test,y_test,KNN)))
print('Support vector machines (training set):')
print("%.3f" % accuracy(y_train, run_cv(X_train,y_train,SVC)))
print('Support vector machines (test set):')
print("%.3f" % accuracy(y_test, run_cv(X_test,y_test,SVC)))
print("Random forest (training set):")
print("%.3f" % accuracy(y_train, run_cv(X_train,y_train,RF)))
print("Random forest (test set):")
print("%.3f" % accuracy(y_test, run_cv(X_test,y_test,RF)))
In [41]:
from sklearn.metrics import confusion_matrix
y_train = np.array(y_train)
class_names = np.unique(y_train)
confusion_matrices_training = [
( "K-Nearest-Neighbors training", confusion_matrix(y_train,run_cv(X_train,y_train,KNN)) ),
( "Support Vector Machines training", confusion_matrix(y_train,run_cv(X_train,y_train,SVC)) ),
( "Random Forest taining", confusion_matrix(y_train,run_cv(X_train,y_train,RF)) ),
]
y_test = np.array(y_test)
class_names = np.unique(y_test)
confusion_matrices_test = [
( "K-Nearest-Neighbors test", confusion_matrix(y_test,run_cv(X_test,y_test,KNN)) ),
( "Support Vector Machines test", confusion_matrix(y_test,run_cv(X_test,y_test,SVC)) ),
( "Random Forest test", confusion_matrix(y_test,run_cv(X_test,y_test,RF)) ),
]
#draw_confusion_matrices(confusion_matrices,class_names)
confusion_matrices_training, confusion_matrices_test
Out[41]:
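The draw_confusion_matrices helper referenced in the commented line above is not defined in this notebook; a minimal plotting sketch with matplotlib, assuming each entry is a (title, matrix) pair as built above:
import matplotlib.pyplot as plt

def draw_confusion_matrices(confusion_matrices, class_names):
    # One annotated heatmap per (title, matrix) pair.
    for title, cm in confusion_matrices:
        fig, ax = plt.subplots()
        im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
        ax.set_title(title)
        ax.set_xticks(range(len(class_names)))
        ax.set_yticks(range(len(class_names)))
        ax.set_xticklabels(class_names)
        ax.set_yticklabels(class_names)
        ax.set_xlabel('Predicted label')
        ax.set_ylabel('True label')
        # Annotate each cell with its count.
        for i in range(cm.shape[0]):
            for j in range(cm.shape[1]):
                ax.text(j, i, cm[i, j], ha='center', va='center')
        fig.colorbar(im)
        plt.show()

# draw_confusion_matrices(confusion_matrices_training, class_names)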
In [46]:
from sklearn.metrics import roc_auc_score
# ROC AUC for the cross-validated random forest predictions on the test set
roc_auc_score(y_test, run_cv(X_test, y_test, RF))
Out[46]:
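ROC AUC is more informative when computed from predicted probabilities rather than hard 0/1 labels; a minimal sketch using out-of-fold probabilities from the random forest:
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.metrics import roc_auc_score
# Out-of-fold probability of the active class for each test compound.
proba = cross_val_predict(RF(n_estimators=100), X_test, y_test,
                          cv=KFold(n_splits=5, shuffle=True),
                          method='predict_proba')[:, 1]
roc_auc_score(y_test, proba)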
In [47]: