Churn Prediction (RandomForest classification)



In [1]:

    
%pylab inline









    



Populating the interactive namespace from numpy and matplotlib



In [2]:

    
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import KFold
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
pd.set_option('display.max_columns', None)



In [3]:

    
# Read the raw customer table and keep its column labels as a plain list.
churn_df = pd.read_csv('churn.csv')
col_names = list(churn_df.columns)

Columns



In [4]:

    
# Display the column labels of the table loaded from churn.csv
col_names









    Out[4]:





['State',
 'Account Length',
 'Area Code',
 'Phone',
 "Int'l Plan",
 'VMail Plan',
 'VMail Message',
 'Day Mins',
 'Day Calls',
 'Day Charge',
 'Eve Mins',
 'Eve Calls',
 'Eve Charge',
 'Night Mins',
 'Night Calls',
 'Night Charge',
 'Intl Mins',
 'Intl Calls',
 'Intl Charge',
 'CustServ Calls',
 'Churn?']



In [5]:

    
# Preview the first rows of the raw table
churn_df.head()









    Out[5]:






  
    
      
      State
      Account Length
      Area Code
      Phone
      Int'l Plan
      VMail Plan
      VMail Message
      Day Mins
      Day Calls
      Day Charge
      Eve Mins
      Eve Calls
      Eve Charge
      Night Mins
      Night Calls
      Night Charge
      Intl Mins
      Intl Calls
      Intl Charge
      CustServ Calls
      Churn?
    
  
  
    
      0
       KS
       128
       415
       382-4657
        no
       yes
       25
       265.1
       110
       45.07
       197.4
        99
       16.78
       244.7
        91
       11.01
       10.0
       3
       2.70
       1
       False.
    
    
      1
       OH
       107
       415
       371-7191
        no
       yes
       26
       161.6
       123
       27.47
       195.5
       103
       16.62
       254.4
       103
       11.45
       13.7
       3
       3.70
       1
       False.
    
    
      2
       NJ
       137
       415
       358-1921
        no
        no
        0
       243.4
       114
       41.38
       121.2
       110
       10.30
       162.6
       104
        7.32
       12.2
       5
       3.29
       0
       False.
    
    
      3
       OH
        84
       408
       375-9999
       yes
        no
        0
       299.4
        71
       50.90
        61.9
        88
        5.26
       196.9
        89
        8.86
        6.6
       7
       1.78
       2
       False.
    
    
      4
       OK
        75
       415
       330-6626
       yes
        no
        0
       166.7
       113
       28.34
       148.3
       122
       12.61
       186.9
       121
        8.41
       10.1
       3
       2.73
       3
       False.
    
  

5 rows × 21 columns



In [6]:

    
# Binary target: 1 where the 'Churn?' column reads 'True.', 0 everywhere else
churn_result = churn_df['Churn?']
y = (churn_result == 'True.').values.astype(int)

# Columns left out of the feature matrix (the last one is the label itself)
to_drop = ['State', 'Area Code', 'Phone', 'Churn?']
churn_feat_space = churn_df.drop(labels=to_drop, axis=1)

# Turn the 'yes'/'no' plan flags into booleans; they become 1.0 / 0.0 once the
# frame is cast to a float array further down
yes_no_cols = ["Int'l Plan", "VMail Plan"]
is_yes = churn_feat_space[yes_no_cols].eq('yes')
churn_feat_space[yes_no_cols] = is_yes



In [7]:

    
features = churn_feat_space.columns

# Cast the feature frame to a float array (boolean plan flags become 1.0 / 0.0).
# `.values` and the builtin `float` replace `DataFrame.as_matrix()` and
# `np.float`: both are deprecated aliases that newer pandas / NumPy releases
# no longer provide, while the spellings used here work on old and new versions.
X = churn_feat_space.values.astype(float)

# Standardise every feature column (StandardScaler is already imported in the
# import cell at the top, so it is not re-imported here).
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Single-argument print(...) produces the same text whether print is a
# statement (Python 2) or a function (Python 3).
print("Feature space holds %d observations and %d features" % X.shape)
print("Unique target labels: %s" % (np.unique(y),))









    



Feature space holds 3333 observations and 17 features
Unique target labels: [0 1]



In [8]:

    
# Fixed seed so that the shuffled fold assignment and the forests -- and hence
# every score derived from y_pred -- are the same on each run of this cell.
SEED = 42

# KFold is the sklearn.cross_validation class imported at the top: it takes the
# number of samples plus n_folds and is iterated directly for index pairs.
# NOTE: newer scikit-learn moved KFold to sklearn.model_selection with a
# different call shape; this cell follows the import used by this notebook.
kf = KFold(len(y), n_folds=5, shuffle=True, random_state=SEED)

# Start from -1, which is not a class label (y holds only 0 and 1), so that an
# observation no fold predicted would be scored as wrong instead of silently
# inheriting its true label, as it would when starting from y.copy().
y_pred = -np.ones_like(y)
for train_index, test_index in kf:
    X_train, X_test = X[train_index], X[test_index]
    y_train = y[train_index]
    # A fresh forest per fold; predictions are written only for held-out rows.
    clf = RandomForestClassifier(n_jobs=3, random_state=SEED)
    clf.fit(X_train, y_train)
    y_pred[test_index] = clf.predict(X_test)

def accuracy(y_true, y_pred):
    """Return the fraction of positions at which ``y_pred`` equals ``y_true``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Label sequences of identical shape (arrays, lists, tuples, ...).

    Returns
    -------
    float
        Mean of the element-wise equality, a value in [0, 1].

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Convert first: comparing two plain lists with == yields a single bool,
    # which would make the "accuracy" either 0.0 or 1.0.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Reject mismatched shapes explicitly rather than letting the comparison
    # broadcast or collapse to a scalar.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got %s and %s"
            % (y_true.shape, y_pred.shape)
        )
    return np.mean(y_true == y_pred)

# Overall cross-validated accuracy; the single-argument print(...) form works
# whether print is a statement (Python 2) or a function (Python 3).
print(accuracy(y, y_pred))









    



0.946594659466



In [9]:

    
# Fit a dedicated forest on the complete data set for the importance plot.
# Reading `clf.feature_importances_` instead would depend on whichever model
# the cross-validation loop happened to leave behind (the last fold only).
importance_clf = RandomForestClassifier(n_jobs=3, random_state=42)
importance_clf.fit(X, y)
fet_imp = importance_clf.feature_importances_



In [10]:

    
# Bar chart of the feature importances, one width-1 bar per feature column.
n_feats = len(fet_imp)
fig, ax = plt.subplots(figsize=(8, 4))
# align='edge' is spelled out because the tick positions below (+0.5) are the
# bar centres only when every bar starts at its integer position; Matplotlib
# releases that default to centre-aligned bars would otherwise shift the labels
# half a bar to the right.
ax.bar(np.arange(n_feats), fet_imp, width=1, lw=2, align='edge')
ax.grid(False)
ax.set_xticks(np.arange(n_feats) + .5)
ax.set_xticklabels(churn_feat_space.columns)
plt.setp(ax.get_xticklabels(), rotation='vertical', fontsize=14)
ax.set_ylabel('Importance')
ax.set_title('Random forest feature importances')
# Trailing semicolon keeps the (0, n) limits tuple out of the cell output.
ax.set_xlim(0, n_feats);









    Out[10]:





(0, 17)



In [11]:

    
def draw_cfm(y_true, y_pred):
    """Plot the confusion matrix of ``y_pred`` against ``y_true`` as an image.

    The matrix is computed with ``confusion_matrix`` and drawn with
    ``plt.matshow``; a title, axis labels and a colour bar are added to it.
    """
    counts = confusion_matrix(y_true, y_pred)
    plt.matshow(counts)
    # Label the axes that matshow has just made current.
    axes = plt.gca()
    axes.set_title('Confusion Matrix')
    axes.set_ylabel('True label')
    axes.set_xlabel('Predicted label')
    plt.colorbar()

def cfm(y_true, y_pred):
    """Return the result of ``confusion_matrix`` for the given true and predicted labels."""
    matrix = confusion_matrix(y_true=y_true, y_pred=y_pred)
    return matrix

def report(y_true, y_pred):
    """Return the result of ``classification_report`` for the given true and predicted labels."""
    summary = classification_report(y_true=y_true, y_pred=y_pred)
    return summary

# Text report, raw confusion matrix, then the matrix as an image.
# Single-argument print(...) works whether print is a statement (Python 2)
# or a function (Python 3), and emits the same text in both.
print(report(y, y_pred))
print(cfm(y, y_pred))
draw_cfm(y, y_pred)









    



             precision    recall  f1-score   support

          0       0.95      0.99      0.97      2850
          1       0.95      0.67      0.78       483

avg / total       0.95      0.95      0.94      3333

[[2832   18]
 [ 160  323]]

	State	Account Length	Area Code	Phone	Int'l Plan	VMail Plan	VMail Message	Day Mins	Day Calls	Day Charge	Eve Mins	Eve Calls	Eve Charge	Night Mins	Night Calls	Night Charge	Intl Mins	Intl Calls	Intl Charge	CustServ Calls	Churn?
0	KS	128	415	382-4657	no	yes	25	265.1	110	45.07	197.4	99	16.78	244.7	91	11.01	10.0	3	2.70	1	False.
1	OH	107	415	371-7191	no	yes	26	161.6	123	27.47	195.5	103	16.62	254.4	103	11.45	13.7	3	3.70	1	False.
2	NJ	137	415	358-1921	no	no	0	243.4	114	41.38	121.2	110	10.30	162.6	104	7.32	12.2	5	3.29	0	False.
3	OH	84	408	375-9999	yes	no	0	299.4	71	50.90	61.9	88	5.26	196.9	89	8.86	6.6	7	1.78	2	False.
4	OK	75	415	330-6626	yes	no	0	166.7	113	28.34	148.3	122	12.61	186.9	121	8.41	10.1	3	2.73	3	False.