Churn Prediction (RandomForest classification)


In [1]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import KFold
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
pd.set_option('display.max_columns', None)

In [3]:
# Read the raw churn records and keep the column labels as a plain Python list.
churn_df = pd.read_csv('churn.csv')
col_names = list(churn_df.columns)

Columns


In [4]:
# Display the list of column labels taken from churn_df.
col_names


Out[4]:
['State',
 'Account Length',
 'Area Code',
 'Phone',
 "Int'l Plan",
 'VMail Plan',
 'VMail Message',
 'Day Mins',
 'Day Calls',
 'Day Charge',
 'Eve Mins',
 'Eve Calls',
 'Eve Charge',
 'Night Mins',
 'Night Calls',
 'Night Charge',
 'Intl Mins',
 'Intl Calls',
 'Intl Charge',
 'CustServ Calls',
 'Churn?']

In [5]:
# Preview the first rows of the raw data frame.
churn_df.head()


Out[5]:
State Account Length Area Code Phone Int'l Plan VMail Plan VMail Message Day Mins Day Calls Day Charge Eve Mins Eve Calls Eve Charge Night Mins Night Calls Night Charge Intl Mins Intl Calls Intl Charge CustServ Calls Churn?
0 KS 128 415 382-4657 no yes 25 265.1 110 45.07 197.4 99 16.78 244.7 91 11.01 10.0 3 2.70 1 False.
1 OH 107 415 371-7191 no yes 26 161.6 123 27.47 195.5 103 16.62 254.4 103 11.45 13.7 3 3.70 1 False.
2 NJ 137 415 358-1921 no no 0 243.4 114 41.38 121.2 110 10.30 162.6 104 7.32 12.2 5 3.29 0 False.
3 OH 84 408 375-9999 yes no 0 299.4 71 50.90 61.9 88 5.26 196.9 89 8.86 6.6 7 1.78 2 False.
4 OK 75 415 330-6626 yes no 0 166.7 113 28.34 148.3 122 12.61 186.9 121 8.41 10.1 3 2.73 3 False.

5 rows × 21 columns


In [6]:
# Target vector: 1 where the 'Churn?' column reads 'True.', 0 otherwise.
churn_result = churn_df['Churn?']
y = (churn_result == 'True.').values.astype(int)

# Columns left out of the feature matrix (the label itself included).
to_drop = ['State', 'Area Code', 'Phone', 'Churn?']
churn_feat_space = churn_df.drop(to_drop, axis=1)

# Turn the 'yes'/'no' plan flags into booleans; they become 1.0 / 0.0 once
# the frame is cast to a float array.
yes_no_cols = ["Int'l Plan", "VMail Plan"]
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols].eq('yes')

In [7]:
features = churn_feat_space.columns

# `.values` and the builtin `float` replace DataFrame.as_matrix() and the
# np.float alias, both deprecated and removed from current pandas / NumPy.
# The resulting float array is the same (booleans become 1.0 / 0.0).
X = churn_feat_space.values.astype(float)

# Standardise every feature to zero mean and unit variance.
# StandardScaler is already imported in the import cell, so it is not
# re-imported here.
# NOTE(review): the scaler is fitted on all rows before the cross-validation
# split, so held-out rows contribute to the scaling statistics. Consider
# fitting it inside each fold if a scale-sensitive model is evaluated.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Parenthesised single-argument prints produce the same text on Python 2
# and also run on Python 3.
print("Feature space holds %d observations and %d features" % X.shape)
print("Unique target labels: %s" % (np.unique(y),))


Feature space holds 3333 observations and 17 features
Unique target labels: [0 1]

In [8]:
# Fixed seed so that the shuffled fold assignment and the forests -- and
# therefore the downstream metrics -- are reproducible from a fresh kernel.
RANDOM_STATE = 42

kf = KFold(len(y), n_folds=5, shuffle=True, random_state=RANDOM_STATE)

# Out-of-fold predictions. Start from a -1 sentinel instead of a copy of the
# true labels, so a row that never receives a prediction cannot silently be
# scored as correct.
y_pred = -np.ones_like(y)
for train_index, test_index in kf:
    X_train, X_test = X[train_index], X[test_index]
    y_train = y[train_index]
    # `clf` is rebound on every iteration; after the loop it is the forest
    # fitted on the last fold's training split.
    clf = RandomForestClassifier(n_jobs=3, random_state=RANDOM_STATE)
    clf.fit(X_train, y_train)
    y_pred[test_index] = clf.predict(X_test)

# Every row must have been predicted exactly via some test fold.
assert (y_pred >= 0).all(), "some rows were never assigned to a test fold"

def accuracy(y_true,y_pred):
    """Return the fraction of positions at which y_pred equals y_true.

    Parameters
    ----------
    y_true, y_pred : array-like
        Label sequences of identical shape. Plain lists and tuples are
        accepted; they are converted to arrays so the comparison is done
        element by element rather than on the containers as a whole.

    Returns
    -------
    float
        Mean of the element-wise equality mask.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got %s and %s"
            % (y_true.shape, y_pred.shape)
        )
    return np.mean(y_true == y_pred)

# Overall accuracy of the out-of-fold predictions.
print(accuracy(y, y_pred))


0.946594659466

In [9]:
# `clf` is whatever the cross-validation loop assigned last, i.e. the forest
# fitted on the final fold's training split -- not a model fitted on all rows.
fet_imp = clf.feature_importances_

In [10]:
# Bar chart of the feature importances, one unit-wide bar per feature column.
fig, ax = plt.subplots(figsize=(8, 4))
positions = np.arange(len(fet_imp))

# align='edge' pins bar i to the interval [i, i + 1]. The tick offset (+.5)
# and the x-limits below rely on that layout, so state it explicitly instead
# of depending on the matplotlib version's default alignment.
ax.bar(positions, fet_imp, width=1, lw=2, align='edge')
ax.grid(False)

# One tick in the middle of each bar, labelled with the feature name.
ax.set_xticks(positions + .5)
ax.set_xticklabels(churn_feat_space.columns, rotation='vertical', fontsize=14)

# Label the figure so it can be read on its own.
ax.set_ylabel('Importance')
ax.set_title('Random forest feature importances')
ax.set_xlim(0, len(fet_imp))


Out[10]:
(0, 17)

In [11]:
def draw_cfm(y_true,y_pred):
    """Plot the confusion matrix of y_pred against y_true as an image.

    The matrix is drawn with plt.matshow, given a title and axis labels,
    and accompanied by a colour bar. Nothing is returned.
    """
    matrix = confusion_matrix(y_true, y_pred)
    plt.matshow(matrix)
    # matshow leaves its axes current, so label them through gca().
    axes = plt.gca()
    axes.set_title('Confusion Matrix')
    axes.set_ylabel('True label')
    axes.set_xlabel('Predicted label')
    plt.colorbar()

def cfm(y_true,y_pred):
    """Return the result of confusion_matrix for the given true and predicted labels."""
    matrix = confusion_matrix(y_true, y_pred)
    return matrix

def report(y_true,y_pred):
    """Return the result of classification_report for the given true and predicted labels."""
    summary = classification_report(y_true, y_pred)
    return summary

# Text report first, then the raw confusion-matrix counts, then the plot.
print(report(y, y_pred))
print(cfm(y, y_pred))
draw_cfm(y, y_pred)


             precision    recall  f1-score   support

          0       0.95      0.99      0.97      2850
          1       0.95      0.67      0.78       483

avg / total       0.95      0.95      0.94      3333

[[2832   18]
 [ 160  323]]