In [1]:
from __future__ import division
import pandas as pd
import numpy as np
churn_df = pd.read_csv('\\\\svrau085ssm01.oceania.corp.anz.com\\torriea$\\My Documents\\bokeh_data\\churn.csv')
col_names = churn_df.columns.tolist()
print "Column names:"
print col_names
to_show = col_names[:6] + col_names[-6:]
print "\nSample data:"
churn_df[to_show].head(6)
Out[1]:
In [2]:
# isolate target data
# by converting True. -> 1, and False. -> 0
churn_result = churn_df['Churn?']
y = np.where(churn_result == 'True.',1,0)
In [3]:
# we don't need these columns
to_drop = ['State', 'Area Code', 'Phone', 'Churn?']
churn_feat_space = churn_df.drop(to_drop, axis = 1)
In [4]:
# 'yes'/'no' has to be converted to boolean values
# numpy converts these from boolean to 1. and 0. later
yes_no_cols = ["Int'l Plan", "VMail Plan"]
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == 'yes'
In [5]:
# pull out features for future use
features = churn_feat_space.columns
X = churn_feat_space.as_matrix().astype(np.float)
In [6]:
# Why is scaling important?
# Many predictors care about the relative size of different features, even though those scales might be arbitrary.
# For instance: the number of points a basketball team scores per game will naturally be a couple of orders of magnitude
# larger than its win percentage, but that doesn't mean the latter is 100 times less significant. StandardScaler
# fixes this by standardising each feature to zero mean and unit variance.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)
print "Feature space holds %d observations and %d features" % X.shape
print "Unique target labels:", np.unique(y)
In [7]:
# cross validation
from sklearn.cross_validation import KFold
def run_cv(X, y, clf_class, **kwargs):
    # construct a KFold object
    kf = KFold(len(y), n_folds=5, shuffle=True)
    y_pred = y.copy()
    # iterate through the folds
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # initialise a classifier with keyword arguments
        clf = clf_class(**kwargs)
        # fit on the training fold, predict on the held-out fold
        y_pred[test_index] = clf.fit(X_train, y_train).predict(X_test)
    return y_pred
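# Aside: sklearn.cross_validation is deprecated in newer scikit-learn releases. A minimal
# sketch of the same out-of-fold prediction using the model_selection API (assuming
# scikit-learn >= 0.18 is available; otherwise stick with run_cv above):
def run_cv_alt(X, y, clf_class, **kwargs):
    from sklearn.model_selection import KFold, cross_val_predict
    kf = KFold(n_splits=5, shuffle=True)
    return cross_val_predict(clf_class(**kwargs), X, y, cv=kf)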
In [8]:
# compare three quite different algorithms: support vector machines, random forests, and k-nearest-neighbours.
# Nothing fancy here, just passing each to cross validation and measuring how often the classifier
# predicted the correct class
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.metrics import accuracy_score
print "Support vector machines:"
print "%.3f" % accuracy_score(y, run_cv(X, y, SVC))
print "\nRandom forest:"
print "%.3f" % accuracy_score(y, run_cv(X, y, RF))
print "\nK-nearest-neighbours:"
print "%.3f" % accuracy_score(y, run_cv(X, y, KNN))
In [9]:
# confusion matrix
# The y-axis indicates the true class of each observation (whether a customer churned or not)
# while the x-axis corresponds to the class predicted by the model (whether the classifier said the customer would churn or not).
from sklearn.metrics import confusion_matrix
y = np.array(y)
class_names = np.unique(y)
confusion_matrices = [
("Support Vector Machine", confusion_matrix(y, run_cv(X, y, SVC))),
("Random Forest", confusion_matrix(y, run_cv(X, y, RF))),
("K-Nearest-Neighbour", confusion_matrix(y, run_cv(X, y, KNN))),
]
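# A tiny illustrative example (not from the churn data) of how sklearn lays out the
# confusion matrix: rows correspond to the actual class, columns to the predicted class.
toy_actual = [0, 1, 1, 0]
toy_predicted = [0, 1, 0, 0]
print confusion_matrix(toy_actual, toy_predicted)
# prints [[2 0]   <- 2 true negatives, 0 false positives
#         [1 1]]  <- 1 false negative, 1 true positive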
In [10]:
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
%matplotlib inline
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    tick_marks = np.arange(len(class_names))
    outcome_annotation = dict(fc='w', boxstyle='round, pad=1')
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar(fraction=0.045)
    plt.xticks(tick_marks, class_names)
    plt.yticks(tick_marks, class_names)
    plt.tight_layout()
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    # outcomes
    tn = cm[0, 0]; fp = cm[0, 1]; fn = cm[1, 0]; tp = cm[1, 1]; nobs = cm.sum()
    # true negative annotation
    plt.text(0, 0, 'TN: %d' % tn, va='center', ha='center', bbox=outcome_annotation)
    # false negative annotation
    plt.text(0, 1, 'FN: %d' % fn, va='center', ha='center', bbox=outcome_annotation)
    # false positive annotation
    plt.text(1, 0, 'FP: %d' % fp, va='center', ha='center', bbox=outcome_annotation)
    # true positive annotation
    plt.text(1, 1, 'TP: %d' % tp, va='center', ha='center', bbox=outcome_annotation)

def draw_confusion_matrices(confusion_matrices, class_names):
    fig = plt.figure(num=1, figsize=(14, 8))
    for index, cm in enumerate(confusion_matrices):
        fig.add_subplot(130 + index + 1)
        plot_confusion_matrix(cm[1], cm[0])
    plt.show()
In [11]:
draw_confusion_matrices(confusion_matrices, class_names)
for index, item in enumerate(confusion_matrices):
    name = item[0]
    cm = item[1]
    # outcomes
    tn = cm[0, 0]; fp = cm[0, 1]; fn = cm[1, 0]; tp = cm[1, 1]; nobs = cm.sum()
    # performance metrics
    actual_positive = tp + fn + 0.
    actual_negative = fp + tn + 0.
    predicted_positive = tp + fp + 0.
    accuracy = (tp + tn + 0.) / nobs
    error_rate = 1. - accuracy
    recall = tp / actual_positive
    false_positive_rate = fp / actual_negative
    false_negative_rate = fn / actual_positive
    specificity = 1. - false_positive_rate
    precision = tp / predicted_positive
    prevalence = actual_positive / nobs
    positive_likelihood_ratio = recall / false_positive_rate
    negative_likelihood_ratio = false_negative_rate / specificity
    diagnostic_odds_ratio = positive_likelihood_ratio / negative_likelihood_ratio
    print "\n" + name
    print "=" * len(name)
    print "\naccuracy\t\t\t%.2f%%" % (accuracy * 100) + "\t\tOverall, how often is the classifier correct?"
    print "\nerror_rate\t\t\t%.2f%%" % (error_rate * 100) + "\t\tOverall, how often is it wrong?"
    print "\nrecall\t\t\t\t%.2f%%" % (recall * 100) + "\t\tWhen it's actually yes, how often does it predict yes?"
    print "\nfalse_positive_rate\t\t%.2f%%" % (false_positive_rate * 100) + "\t\tWhen it's actually no, how often does it predict yes? (Type I error)"
    print "\nfalse_negative_rate\t\t%.2f%%" % (false_negative_rate * 100) + "\t\tWhen it's actually yes, how often does it predict no? (Type II error)"
    print "\nspecificity\t\t\t%.2f%%" % (specificity * 100) + "\t\tWhen it's actually no, how often does it predict no?"
    print "\nprecision\t\t\t%.2f%%" % (precision * 100) + "\t\tWhen it predicts yes, how often is it correct?"
    print "\nprevalence\t\t\t%.2f%%" % (prevalence * 100) + "\t\tHow often does the yes condition actually occur in our sample?"
    print "\npositive_likelihood_ratio\t%.2f" % positive_likelihood_ratio + "\t\tHow much more likely is a yes prediction for an actual yes than for an actual no?"
    print "\nnegative_likelihood_ratio\t%.2f" % negative_likelihood_ratio + "\t\tHow much more likely is a no prediction for an actual yes than for an actual no?"
    print "\ndiagnostic_odds_ratio\t\t%.2f" % diagnostic_odds_ratio + "\t\tHigher diagnostic odds ratios indicate better test performance"
    print "\n"