In [1]:
# Read and show the sample data
from __future__ import division
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')


churn_df = pd.read_csv('churn.csv')
col_names = churn_df.columns.tolist()

print "Column names:"
print col_names

to_show = col_names[:6] + col_names[-6:]

print "\nSample data:"
churn_df[to_show].head(6)


Column names:
['State', 'Account Length', 'Area Code', 'Phone', "Int'l Plan", 'VMail Plan', 'VMail Message', 'Day Mins', 'Day Calls', 'Day Charge', 'Eve Mins', 'Eve Calls', 'Eve Charge', 'Night Mins', 'Night Calls', 'Night Charge', 'Intl Mins', 'Intl Calls', 'Intl Charge', 'CustServ Calls', 'Churn?']

Sample data:
Out[1]:
State Account Length Area Code Phone Int'l Plan VMail Plan Night Charge Intl Mins Intl Calls Intl Charge CustServ Calls Churn?
0 KS 128 415 382-4657 no yes 11.01 10.0 3 2.70 1 False.
1 OH 107 415 371-7191 no yes 11.45 13.7 3 3.70 1 False.
2 NJ 137 415 358-1921 no no 7.32 12.2 5 3.29 0 False.
3 OH 84 408 375-9999 yes no 8.86 6.6 7 1.78 2 False.
4 OK 75 415 330-6626 yes no 8.41 10.1 3 2.73 3 False.
5 AL 118 510 391-8027 yes no 9.18 6.3 6 1.70 0 False.

In [2]:
# Isolate target data
# The 'Churn?' column holds the strings 'True.' and 'False.' (note the trailing period)
churn_result = churn_df['Churn?']
y = np.where(churn_result == 'True.',1,0)

# We don't need these columns
to_drop = ['State','Area Code','Phone','Churn?']
churn_feat_space = churn_df.drop(to_drop,axis=1)

# 'yes'/'no' values have to be converted to booleans
# NumPy converts these from boolean to 1. and 0. later
yes_no_cols = ["Int'l Plan","VMail Plan"]
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == 'yes'

# Pull out features for future use
features = churn_feat_space.columns
X = churn_feat_space.as_matrix().astype(np.float)

# Feature scaling matters for distance-based models such as SVM and KNN
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Standardize features by removing the mean and scaling to unit variance
X = scaler.fit_transform(X)

print "Feature space holds %d observations and %d features" % X.shape
print "Unique target labels:", np.unique(y)


Feature space holds 3333 observations and 17 features
Unique target labels: [0 1]
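
A quick sanity check on the scaling step: after StandardScaler, every column of X should have (approximately) zero mean and unit variance. A minimal check, reusing the X and np already defined above:

In [ ]:
# Sanity check: each scaled column should have mean ~0 and std ~1
print "Column means (should be ~0):", np.round(X.mean(axis=0), 6)
print "Column stds  (should be ~1):", np.round(X.std(axis=0), 6)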

In [3]:
from sklearn.cross_validation import KFold
# Run k-fold cross-validation and return the out-of-fold predictions
def run_cv(X,y,clf_class,**kwargs):
    # Construct a kfolds object
    kf = KFold(len(y),n_folds=5,shuffle=True)
    y_pred = y.copy()

    # Iterate through folds
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # Initialize a classifier with keyword arguments
        clf = clf_class(**kwargs)
        clf.fit(X_train,y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred


/home/menikhilpandey/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [4]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN

def accuracy(y_true,y_pred):
    # NumPy interprets True and False as 1. and 0.
    return np.mean(y_true == y_pred)
# Compare cross-validated accuracy across the three classifiers
print "Support vector machines:"
print "%.3f" % accuracy(y, run_cv(X,y,SVC))
print "Random forest:"
print "%.3f" % accuracy(y, run_cv(X,y,RF))
print "K-nearest-neighbors:"
print "%.3f" % accuracy(y, run_cv(X,y,KNN))


Support vector machines:
0.918
Random forest:
0.946
K-nearest-neighbors:
0.892
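
Accuracy alone can be flattering here because the classes are imbalanced: most customers do not churn, so a model that always predicts "no churn" already scores well. A quick baseline for comparison, reusing y from above:

In [ ]:
# Baseline: always predict the majority class (no churn)
print "Always predict no churn: %.3f" % np.mean(y == 0)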

In [5]:
from sklearn.metrics import confusion_matrix
y = np.array(y)
class_names = np.unique(y)

confusion_matrices = [
    ( "Support Vector Machines", confusion_matrix(y,run_cv(X,y,SVC)) ),
    ( "Random Forest", confusion_matrix(y,run_cv(X,y,RF)) ),
    ( "K-Nearest-Neighbors", confusion_matrix(y,run_cv(X,y,KNN)) ),
]

for matrix in confusion_matrices:
    print matrix[0]
    print
    # Rows = actual class (A0 no churn, A1 churn); columns = predicted class (P0, P1)
    print '\t','P0','\t','P1'
    print 'A0\t',matrix[1][0][0],'\t',matrix[1][0][1]
    print 'A1\t',matrix[1][1][0],'\t',matrix[1][1][1]
    print
    # Recall and precision for the churn class (class 1)
    print "Recall = %.2f"%(matrix[1][1][1]/sum(matrix[1][1]))
    print "Precision = %.2f"%(matrix[1][1][1]/(matrix[1][1][1]+matrix[1][0][1]))
    print


Support Vector Machines

	P0 	P1
A0	2814 	36
A1	238 	245

Recall = 0.51
Precision = 0.87

Random Forest

	P0 	P1
A0	2827 	23
A1	152 	331

Recall = 0.69
Precision = 0.94

K-Nearest-Neighbors

	P0 	P1
A0	2803 	47
A1	313 	170

Recall = 0.35
Precision = 0.78
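
The recall and precision above are computed by hand from the matrix cells. As a cross-check, scikit-learn's own metrics should agree for the churn class; the exact numbers will wobble slightly because run_cv reshuffles the folds on every call. A sketch:

In [ ]:
# Cross-check the hand-computed metrics with sklearn's recall/precision for class 1
from sklearn.metrics import precision_score, recall_score

y_pred_rf = run_cv(X, y, RF)
print "Recall    = %.2f" % recall_score(y, y_pred_rf)
print "Precision = %.2f" % precision_score(y, y_pred_rf)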


In [6]:
def run_prob_cv(X, y, clf_class, **kwargs):
    kf = KFold(len(y), n_folds=5, shuffle=True)
    y_prob = np.zeros((len(y),2))
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        clf = clf_class(**kwargs)
        clf.fit(X_train,y_train)
        # Predict probabilities, not classes
        y_prob[test_index] = clf.predict_proba(X_test)
    return y_prob
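
predict_proba returns one column per class, ordered like clf.classes_, so with labels {0, 1} column 1 is the churn probability, which is the column the next cell pulls out. A quick (slightly wasteful, since it reruns the cross-validation) check that the rows behave like probabilities:

In [ ]:
# Each row of the returned matrix should sum to 1; column 1 is P(churn)
probs = run_prob_cv(X, y, RF, n_estimators=10)
print "Rows sum to 1:", np.allclose(probs.sum(axis=1), 1.0)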

In [7]:
# Use 10 trees so each predicted probability is a vote fraction k/10, i.e. a multiple of 0.1
pred_prob = run_prob_cv(X, y, RF, n_estimators=10)
pred_churn = pred_prob[:,1]
is_churn = y == 1

# Number of times a predicted probability is assigned to an observation
counts = pd.value_counts(pred_churn)

# Calculate the observed churn rate for each predicted probability
true_prob = {}
for prob in counts.index:
    true_prob[prob] = np.mean(is_churn[pred_churn == prob])
true_prob = pd.Series(true_prob)

# Combine the counts and observed churn rates into one table
counts = pd.concat([counts,true_prob], axis=1).reset_index()
counts.columns = ['pred_prob', 'count', 'true_prob']
counts


Out[7]:
pred_prob count true_prob
0 0.0 1721 0.028472
1 0.1 742 0.022911
2 0.2 251 0.043825
3 0.3 126 0.166667
4 0.4 85 0.341176
5 0.6 80 0.825000
6 0.8 79 0.974684
7 0.9 72 0.972222
8 0.5 64 0.562500
9 0.7 61 0.901639
10 1.0 52 1.000000
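
The table suggests the forest's probabilities are not perfectly calibrated (for example, observations given a 0.3 churn probability actually churned only about 17% of the time). Plotting predicted against observed churn rates makes the comparison to the ideal diagonal easier to see; a minimal matplotlib sketch using the counts DataFrame from the cell above:

In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt

# Calibration check: predicted churn probability vs. observed churn rate
plt.scatter(counts['pred_prob'], counts['true_prob'], s=counts['count'] / 5.0)
plt.plot([0, 1], [0, 1], 'k--', label='perfect calibration')
plt.xlabel('Predicted churn probability')
plt.ylabel('Observed churn rate')
plt.legend(loc='upper left')
plt.show()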

In [8]:
from sklearn.cross_validation import train_test_split

# Hold out a test set, fit an SVC with probability estimates enabled, and save the test rows
train_index,test_index = train_test_split(churn_df.index)
clf = SVC(probability=True)
clf.fit(X[train_index],y[train_index])
test_churn_df = churn_df.ix[test_index]
test_churn_df.to_csv("binary_churn.csv")
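
The cell above only saves the held-out rows; it does not report how the freshly fitted SVC does on them. A one-line check (the value depends on the random split, so no output is shown here):

In [ ]:
# Accuracy of the SVC on the held-out test rows
print "Held-out accuracy: %.3f" % clf.score(X[test_index], y[test_index])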

In [9]:
class ChurnModel:
    def execute(self,data):
        # Collect customer meta data (copy to avoid pandas' SettingWithCopy warning)
        response = data[['Area Code','Phone']].copy()
        charges = ['Day Charge','Eve Charge','Night Charge','Intl Charge']
        response['customer_worth'] = data[charges].sum(axis=1)
        data[yes_no_cols] = data[yes_no_cols] == 'yes'
        # Create feature space
        X = data[features].as_matrix().astype(float)
        X = scaler.transform(X)
        # Make prediction
        churn_prob = clf.predict_proba(X)
        response['churn_prob'] = churn_prob[:,1]
        # Calculate expected loss by churn
        response['expected_loss'] = response['churn_prob'] * response['customer_worth']
        # Return response DataFrame
        return response

model_output = ChurnModel().execute(churn_df)
# Rank customers by expected loss, highest first
processed_data = model_output.sort_values('expected_loss',ascending=False)
processed_data.to_csv("probability_churn.csv",index=False)
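
To see which customers the model flags as the most expensive to lose, the ranked output can be inspected directly; a small usage sketch (the values depend on the fitted model and the random split above):

In [ ]:
# Top customers by expected loss from churn
processed_data.head(10)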