In [1]:
# Read and show the sample data
from __future__ import division
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')


churn_df = pd.read_csv('churn.csv')
col_names = churn_df.columns.tolist()

print "Column names:"
print col_names

to_show = col_names[:6] + col_names[-6:]

print "\nSample data:"
churn_df[to_show].head(6)


Column names:
['State', 'Account Length', 'Area Code', 'Phone', "Int'l Plan", 'VMail Plan', 'VMail Message', 'Day Mins', 'Day Calls', 'Day Charge', 'Eve Mins', 'Eve Calls', 'Eve Charge', 'Night Mins', 'Night Calls', 'Night Charge', 'Intl Mins', 'Intl Calls', 'Intl Charge', 'CustServ Calls', 'Churn?']

Sample data:
Out[1]:
State Account Length Area Code Phone Int'l Plan VMail Plan Night Charge Intl Mins Intl Calls Intl Charge CustServ Calls Churn?
0 KS 128 415 382-4657 no yes 11.01 10.0 3 2.70 1 False.
1 OH 107 415 371-7191 no yes 11.45 13.7 3 3.70 1 False.
2 NJ 137 415 358-1921 no no 7.32 12.2 5 3.29 0 False.
3 OH 84 408 375-9999 yes no 8.86 6.6 7 1.78 2 False.
4 OK 75 415 330-6626 yes no 8.41 10.1 3 2.73 3 False.
5 AL 118 510 391-8027 yes no 9.18 6.3 6 1.70 0 False.

In [2]:
# Isolate target data
# The 'Churn?' column holds the strings 'True.' and 'False.' (note the trailing period)
churn_result = churn_df['Churn?']
y = np.where(churn_result == 'True.',1,0)

# We don't need these columns
to_drop = ['State','Area Code','Phone','Churn?']
churn_feat_space = churn_df.drop(to_drop,axis=1)

# 'yes'/'no' values have to be converted to booleans
# NumPy converts these from boolean to 1. and 0. later
yes_no_cols = ["Int'l Plan","VMail Plan"]
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == 'yes'

# Pull out features for future use
features = churn_feat_space.columns
X = churn_feat_space.as_matrix().astype(np.float)

# Feature scaling matters for distance-based models such as SVM and KNN
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Standardize features by removing the mean and scaling to unit variance
X = scaler.fit_transform(X)

print "Feature space holds %d observations and %d features" % X.shape
print "Unique target labels:", np.unique(y)


Feature space holds 3333 observations and 17 features
Unique target labels: [0 1]
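
A quick sanity check on the scaling step: after StandardScaler, every column of X should have (approximately) zero mean and unit variance. A minimal check, reusing the X and np already defined above:

In [ ]:
# Sanity check: each scaled column should have mean ~0 and std ~1
print "Column means (should be ~0):", np.round(X.mean(axis=0), 6)
print "Column stds  (should be ~1):", np.round(X.std(axis=0), 6)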

In [3]:
from sklearn.cross_validation import KFold
# Run k-fold cross-validation and return the out-of-fold predictions
def run_cv(X,y,clf_class,**kwargs):
    # Construct a kfolds object
    kf = KFold(len(y),n_folds=5,shuffle=True)
    y_pred = y.copy()

    # Iterate through folds
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # Initialize a classifier with keyword arguments
        clf = clf_class(**kwargs)
        clf.fit(X_train,y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred


/home/menikhilpandey/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [4]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN

def accuracy(y_true,y_pred):
    # NumPy interprets True and False as 1. and 0.
    return np.mean(y_true == y_pred)
# Compare cross-validated accuracy across the three classifiers
print "Support vector machines:"
print "%.3f" % accuracy(y, run_cv(X,y,SVC))
print "Random forest:"
print "%.3f" % accuracy(y, run_cv(X,y,RF))
print "K-nearest-neighbors:"
print "%.3f" % accuracy(y, run_cv(X,y,KNN))


Support vector machines:
0.918
Random forest:
0.946
K-nearest-neighbors:
0.892
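
Accuracy alone can be flattering here because the classes are imbalanced: most customers do not churn, so a model that always predicts "no churn" already scores well. A quick baseline for comparison, reusing y from above:

In [ ]:
# Baseline: always predict the majority class (no churn)
print "Always predict no churn: %.3f" % np.mean(y == 0)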

In [5]:
from sklearn.metrics import confusion_matrix
y = np.array(y)
class_names = np.unique(y)

confusion_matrices = [
    ( "Support Vector Machines", confusion_matrix(y,run_cv(X,y,SVC)) ),
    ( "Random Forest", confusion_matrix(y,run_cv(X,y,RF)) ),
    ( "K-Nearest-Neighbors", confusion_matrix(y,run_cv(X,y,KNN)) ),
]

for matrix in confusion_matrices:
    print matrix[0]
    print
    # Rows = actual class (A0 no churn, A1 churn); columns = predicted class (P0, P1)
    print '\t','P0','\t','P1'
    print 'A0\t',matrix[1][0][0],'\t',matrix[1][0][1]
    print 'A1\t',matrix[1][1][0],'\t',matrix[1][1][1]
    print
    # Recall and precision for the churn class (class 1)
    print "Recall = %.2f"%(matrix[1][1][1]/sum(matrix[1][1]))
    print "Precision = %.2f"%(matrix[1][1][1]/(matrix[1][1][1]+matrix[1][0][1]))
    print


Support Vector Machines

	P0 	P1
A0	2814 	36
A1	238 	245

Recall = 0.51
Precision = 0.87

Random Forest

	P0 	P1
A0	2827 	23
A1	152 	331

Recall = 0.69
Precision = 0.94

K-Nearest-Neighbors

	P0 	P1
A0	2803 	47
A1	313 	170

Recall = 0.35
Precision = 0.78
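
The recall and precision above are computed by hand from the matrix cells. As a cross-check, scikit-learn's own metrics should agree for the churn class; the exact numbers will wobble slightly because run_cv reshuffles the folds on every call. A sketch:

In [ ]:
# Cross-check the hand-computed metrics with sklearn's recall/precision for class 1
from sklearn.metrics import precision_score, recall_score

y_pred_rf = run_cv(X, y, RF)
print "Recall    = %.2f" % recall_score(y, y_pred_rf)
print "Precision = %.2f" % precision_score(y, y_pred_rf)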


In [6]:
def run_prob_cv(X, y, clf_class, **kwargs):
    kf = KFold(len(y), n_folds=5, shuffle=True)
    y_prob = np.zeros((len(y),2))
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        clf = clf_class(**kwargs)
        clf.fit(X_train,y_train)
        # Predict probabilities, not classes
        y_prob[test_index] = clf.predict_proba(X_test)
    return y_prob
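
predict_proba returns one column per class, ordered like clf.classes_, so with labels {0, 1} column 1 is the churn probability, which is the column the next cell pulls out. A quick (slightly wasteful, since it reruns the cross-validation) check that the rows behave like probabilities:

In [ ]:
# Each row of the returned matrix should sum to 1; column 1 is P(churn)
probs = run_prob_cv(X, y, RF, n_estimators=10)
print "Rows sum to 1:", np.allclose(probs.sum(axis=1), 1.0)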

In [7]:
# Use 10 trees so each predicted probability is a vote fraction k/10, i.e. a multiple of 0.1
pred_prob = run_prob_cv(X, y, RF, n_estimators=10)
pred_churn = pred_prob[:,1]
is_churn = y == 1

# Number of times a predicted probability is assigned to an observation
counts = pd.value_counts(pred_churn)

# Calculate the observed churn rate for each predicted probability
true_prob = {}
for prob in counts.index:
    true_prob[prob] = np.mean(is_churn[pred_churn == prob])
true_prob = pd.Series(true_prob)

# Combine the counts and observed churn rates into one table
counts = pd.concat([counts,true_prob], axis=1).reset_index()
counts.columns = ['pred_prob', 'count', 'true_prob']
counts


Out[7]:
pred_prob count true_prob
0 0.0 1721 0.028472
1 0.1 742 0.022911
2 0.2 251 0.043825
3 0.3 126 0.166667
4 0.4 85 0.341176
5 0.6 80 0.825000
6 0.8 79 0.974684
7 0.9 72 0.972222
8 0.5 64 0.562500
9 0.7 61 0.901639
10 1.0 52 1.000000
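
The table suggests the forest's probabilities are not perfectly calibrated (for example, observations given a 0.3 churn probability actually churned only about 17% of the time). Plotting predicted against observed churn rates makes the comparison to the ideal diagonal easier to see; a minimal matplotlib sketch using the counts DataFrame from the cell above:

In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt

# Calibration check: predicted churn probability vs. observed churn rate
plt.scatter(counts['pred_prob'], counts['true_prob'], s=counts['count'] / 5.0)
plt.plot([0, 1], [0, 1], 'k--', label='perfect calibration')
plt.xlabel('Predicted churn probability')
plt.ylabel('Observed churn rate')
plt.legend(loc='upper left')
plt.show()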

In [8]:
from sklearn.cross_validation import train_test_split

# Hold out a test set, fit an SVC with probability estimates enabled, and save the test rows
train_index,test_index = train_test_split(churn_df.index)
clf = SVC(probability=True)
clf.fit(X[train_index],y[train_index])
test_churn_df = churn_df.ix[test_index]
test_churn_df.to_csv("binary_churn.csv")
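
The cell above only saves the held-out rows; it does not report how the freshly fitted SVC does on them. A one-line check (the value depends on the random split, so no output is shown here):

In [ ]:
# Accuracy of the SVC on the held-out test rows
print "Held-out accuracy: %.3f" % clf.score(X[test_index], y[test_index])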

In [9]:
class ChurnModel:
    def execute(self,data):
        # Collect customer meta data (copy to avoid pandas' SettingWithCopy warning)
        response = data[['Area Code','Phone']].copy()
        charges = ['Day Charge','Eve Charge','Night Charge','Intl Charge']
        response['customer_worth'] = data[charges].sum(axis=1)
        data[yes_no_cols] = data[yes_no_cols] == 'yes'
        # Create feature space
        X = data[features].as_matrix().astype(float)
        X = scaler.transform(X)
        # Make prediction
        churn_prob = clf.predict_proba(X)
        response['churn_prob'] = churn_prob[:,1]
        # Calculate expected loss by churn
        response['expected_loss'] = response['churn_prob'] * response['customer_worth']
        # Return response DataFrame
        return response

model_output = ChurnModel().execute(churn_df)
# Rank customers by expected loss, highest first
processed_data = model_output.sort_values('expected_loss',ascending=False)
processed_data.to_csv("probability_churn.csv",index=False)
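
To see which customers the model flags as the most expensive to lose, the ranked output can be inspected directly; a small usage sketch (the values depend on the fitted model and the random split above):

In [ ]:
# Top customers by expected loss from churn
processed_data.head(10)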