In [1]:
from __future__ import division
import pandas as pd
import numpy as np

churn_df = pd.read_csv('\\\\svrau085ssm01.oceania.corp.anz.com\\torriea$\\My Documents\\bokeh_data\\churn.csv')
col_names = churn_df.columns.tolist()

print "Column names:"
print col_names

to_show = col_names[:6] + col_names[-6:]

print "\nSample data:"
churn_df[to_show].head(6)


Column names:
['State', 'Account Length', 'Area Code', 'Phone', "Int'l Plan", 'VMail Plan', 'VMail Message', 'Day Mins', 'Day Calls', 'Day Charge', 'Eve Mins', 'Eve Calls', 'Eve Charge', 'Night Mins', 'Night Calls', 'Night Charge', 'Intl Mins', 'Intl Calls', 'Intl Charge', 'CustServ Calls', 'Churn?']

Sample data:
Out[1]:
State Account Length Area Code Phone Int'l Plan VMail Plan Night Charge Intl Mins Intl Calls Intl Charge CustServ Calls Churn?
0 KS 128 415 382-4657 no yes 11.01 10.0 3 2.70 1 False.
1 OH 107 415 371-7191 no yes 11.45 13.7 3 3.70 1 False.
2 NJ 137 415 358-1921 no no 7.32 12.2 5 3.29 0 False.
3 OH 84 408 375-9999 yes no 8.86 6.6 7 1.78 2 False.
4 OK 75 415 330-6626 yes no 8.41 10.1 3 2.73 3 False.
5 AL 118 510 391-8027 yes no 9.18 6.3 6 1.70 0 False.

In [2]:
# isolate target data
# by converting True. -> 1, and False. -> 0
churn_result = churn_df['Churn?']
y = np.where(churn_result == 'True.',1,0)

In [3]:
# we don't need these columns
to_drop = ['State', 'Area Code', 'Phone', 'Churn?']
churn_feat_space = churn_df.drop(to_drop, axis = 1)

In [4]:
# 'yes'/'no' has to be converted to boolean values
# numpy converts these from boolean to 1. and 0. later
yes_no_cols = ["Int'l Plan", "VMail Plan"]
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == 'yes'

In [5]:
# pull out features for future use
features = churn_feat_space.columns
X = churn_feat_space.as_matrix().astype(np.float)

In [6]:
# this is important (why??)
# Many predictors care about the relative size of different features even though those scales might be arbitrary. 
# For instance: the number of points a basketball team scores per game will naturally be a couple orders of magnitude 
# larger than their win percentage. But this doesn't mean that the latter is 100 times less significant. StandardScaler 
# fixes this by normalizing each feature to a range of around 1.0 to -1.0
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

print "Feature space holds %d observations and %d features" % X.shape
print "Unique target labels:", np.unique(y)


Feature space holds 3333 observations and 17 features
Unique target labels: [0 1]

In [7]:
# cross validation
from sklearn.cross_validation import KFold

def run_cv(X, y, clf_class, **kwargs):
    # construct a kfolds object
    kf = KFold(len(y), n_folds = 5, shuffle = True)
    y_pred = y.copy()
    
    # iterate through folds
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        
        # initialise a classifier with key word arguments
        clf = clf_class(**kwargs)
        #clf.fit(X_train, y_train)
        y_pred[test_index] = clf.fit(X_train, y_train).predict(X_test)
        
    return y_pred

In [8]:
# compare three fairly unique algorithms support vector machines, random forest, and k-nearest-neighbors. 
# Nothing fancy here, just passing each to cross validation and determining how often the classifier 
# predicted the correct class
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.metrics import accuracy_score

print "Support vector machines:"
print "%.3f" % accuracy_score(y, run_cv(X, y, SVC))
print "\nRandom forest:"
print "%.3f" % accuracy_score(y, run_cv(X, y, RF))
print "\nK-nearest-neighbours:"
print "%.3f" % accuracy_score(y, run_cv(X, y, KNN))


Support vector machines:
0.918

Random forest:
0.944

K-nearest-neighbours:
0.893

In [9]:
# confusion matrix
# The x-axis indicates the true class of each observation (if a customer churned or not) 
# while the y-axis corresponds to the class predicted by the model (if my classifier said a customer would churned or not).
from sklearn.metrics import confusion_matrix

y = np.array(y)
class_names = np.unique(y)

confusion_matrices = [
    ("Support Vector Machine", confusion_matrix(y, run_cv(X, y, SVC))),
    ("Random Forest", confusion_matrix(y, run_cv(X, y, RF))),
    ("K-Nearest-Neighbour", confusion_matrix(y, run_cv(X, y, KNN))),
]

In [10]:
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
%matplotlib inline


def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    tick_marks = np.arange(len(class_names))
    outcome_annotation = dict(fc='w',boxstyle='round, pad=1')
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar(fraction=0.045)
    plt.xticks(tick_marks, class_names)
    plt.yticks(tick_marks, class_names)
    plt.tight_layout()
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    
    # outcomes
    tn = cm[0, 0]; fp = cm[0, 1]; fn = cm[1, 0]; tp = cm[1, 1]; nobs = cm.sum()
    
    # true negative annotation
    plt.text(0, 0, 'TN: %d' % tn, va='center', ha='center', bbox=outcome_annotation)
    
    # false negative annotation
    plt.text(0, 1, 'FN: %d' % fn, va='center', ha='center', bbox=outcome_annotation)
    
    # false positive annotation
    plt.text(1, 0, 'FP: %d' % fp, va='center', ha='center', bbox=outcome_annotation)
    
    # true positive annotation
    plt.text(1, 1, 'TP: %d' % tp, va='center', ha='center', bbox=outcome_annotation)


    
def draw_confusion_matrices(confusion_matrices, class_names):
    fig = plt.figure(num = 1, figsize=(14, 8))

    for index, cm in enumerate(confusion_matrices):
        fig.add_subplot(130 + index + 1)
        plot_confusion_matrix(cm[1], cm[0])
        
    plt.show()

In [11]:
draw_confusion_matrices(confusion_matrices, class_names)

for index, item in enumerate(confusion_matrices):
    name = item[0]
    cm = item[1]
    
    # outcomes
    tn = cm[0, 0]; fp = cm[0, 1]; fn = cm[1, 0]; tp = cm[1, 1]; nobs = cm.sum()

    # performance metrics
    actual_positive = tp + fn + 0.
    actual_negative = fp + tn + 0.
    predicted_positive = tp + fp + 0.
    accuracy = (tp + tn + 0.) / nobs
    error_rate = 1. - accuracy
    recall = tp / actual_positive
    false_positive_rate = fp / actual_negative
    false_negative_rate = fn / actual_positive
    specificity = 1. - false_positive_rate
    precision = tp / predicted_positive
    prevelance = actual_positive / nobs
    positive_likelihood_ratio = recall / false_positive_rate
    negative_likelihood_ratio = false_negative_rate / specificity
    diagnostic_odds_ratio = positive_likelihood_ratio / negative_likelihood_ratio

    print "\n" + name
    print "=" * len(name)
    print "\naccuracy\t\t\t%.2f%%" % (accuracy * 100) + "\t\tOverall, how often is the classifier correct?"
    print "\nerror_rate\t\t\t%.2f%%" % (error_rate * 100) + "\t\tOverall, how often is it wrong?"
    print "\nrecall\t\t\t\t%.2f%%" % (recall * 100) + "\t\tWhen it's actually yes, how often does it predict yes?"
    print "\nfalse_positive_rate\t\t%.2f%%" % (false_positive_rate * 100) + "\t\tWhen it's actually no, how often does it predict yes? (Type I error)"
    print "\nfalse_negative_rate\t\t%.2f%%" % (false_negative_rate * 100) + "\t\tWhen it's actually yes, how often does it predict no? (Type II error)"
    print "\nspecificity\t\t\t%.2f%%" % (specificity * 100) + "\t\tWhen it's actually no, how often does it predict no?"
    print "\nprecision\t\t\t%.2f%%" % (precision * 100) + "\t\tWhen it predicts yes, how often is it correct?"
    print "\nprevelance\t\t\t%.2f%%" % (prevelance * 100) + "\t\tHow often does the yes condition actually occur in our sample?"
    print "\npositive_likelihood_ratio\t%.2f" % positive_likelihood_ratio + "\t\t"
    print "\nnegative_likelihood_ratio\t%.2f" % negative_likelihood_ratio + "\t\t"
    print "\ndiagnostic_odds_ratio\t\t%.2f" % (diagnostic_odds_ratio) + "\t\tHigher diagnostic odds ratios are indicative of better test performance"
    print "\n"


Support Vector Machine
======================

accuracy			91.75%		Overall, how often is the classifier correct?

error_rate			8.25%		Overall, how often is it wrong?

recall				50.72%		When it's actually yes, how often does it predict yes?

false_positive_rate		1.30%		When it's actually no, how often does it predict yes? (Type I error)

false_negative_rate		49.28%		When it's actually yes, how often does it predict no? (Type II error)

specificity			98.70%		When it's actually no, how often does it predict no?

precision			86.88%		When it predicts yes, how often is it correct?

prevelance			14.49%		How often does the yes condition actually occur in our sample?

positive_likelihood_ratio	39.07		

negative_likelihood_ratio	0.50		

diagnostic_odds_ratio		78.26		Higher diagnostic odds ratios are indicative of better test performance



Random Forest
=============

accuracy			94.33%		Overall, how often is the classifier correct?

error_rate			5.67%		Overall, how often is it wrong?

recall				66.25%		When it's actually yes, how often does it predict yes?

false_positive_rate		0.91%		When it's actually no, how often does it predict yes? (Type I error)

false_negative_rate		33.75%		When it's actually yes, how often does it predict no? (Type II error)

specificity			99.09%		When it's actually no, how often does it predict no?

precision			92.49%		When it predicts yes, how often is it correct?

prevelance			14.49%		How often does the yes condition actually occur in our sample?

positive_likelihood_ratio	72.62		

negative_likelihood_ratio	0.34		

diagnostic_odds_ratio		213.23		Higher diagnostic odds ratios are indicative of better test performance



K-Nearest-Neighbour
===================

accuracy			89.62%		Overall, how often is the classifier correct?

error_rate			10.38%		Overall, how often is it wrong?

recall				38.51%		When it's actually yes, how often does it predict yes?

false_positive_rate		1.72%		When it's actually no, how often does it predict yes? (Type I error)

false_negative_rate		61.49%		When it's actually yes, how often does it predict no? (Type II error)

specificity			98.28%		When it's actually no, how often does it predict no?

precision			79.15%		When it predicts yes, how often is it correct?

prevelance			14.49%		How often does the yes condition actually occur in our sample?

positive_likelihood_ratio	22.40		

negative_likelihood_ratio	0.63		

diagnostic_odds_ratio		35.80		Higher diagnostic odds ratios are indicative of better test performance