In [1]:
#Read and show the sample data
from __future__ import division
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
churn_df = pd.read_csv('churn.csv')
col_names = churn_df.columns.tolist()
print "Column names:"
print col_names
to_show = col_names[:6] + col_names[-6:]
print "\nSample data:"
churn_df[to_show].head(6)
Out[1]:
In [2]:
# Isolate target data
churn_result = churn_df['Churn?']
y = np.where(churn_result == 'True.',1,0)
# We don't need these columns
to_drop = ['State','Area Code','Phone','Churn?']
churn_feat_space = churn_df.drop(to_drop,axis=1)
# 'yes'/'no' has to be converted to boolean values
# NumPy converts these from boolean to 1. and 0. later
yes_no_cols = ["Int'l Plan","VMail Plan"]
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == 'yes'
# Pull out features for future use
features = churn_feat_space.columns
X = churn_feat_space.as_matrix().astype(np.float)
# This is important
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
#standardize features by removing mean and scaling to unit variance
X = scaler.fit_transform(X)
print "Feature space holds %d observations and %d features" % X.shape
print "Unique target labels:", np.unique(y)
In [3]:
from sklearn.cross_validation import KFold
# Helper that produces out-of-fold predictions with 5-fold cross-validation
def run_cv(X,y,clf_class,**kwargs):
    """Return class predictions gathered over 5 shuffled cross-validation folds.

    X         -- feature array, indexed by row with the fold index arrays
    y         -- label array; must support .copy() and index-array indexing
    clf_class -- classifier class; a fresh instance is built for every fold
    kwargs    -- keyword arguments forwarded to clf_class(...)

    The result starts as a copy of y and has each test fold's positions
    replaced by the predictions of a model fitted on that fold's training rows.
    Folds are shuffled without a fixed seed, so results vary between calls.
    """
    folds = KFold(len(y), n_folds=5, shuffle=True)
    predictions = y.copy()
    for fit_rows, held_out_rows in folds:
        # Initialize a classifier with key word arguments
        model = clf_class(**kwargs)
        model.fit(X[fit_rows], y[fit_rows])
        predictions[held_out_rows] = model.predict(X[held_out_rows])
    return predictions
In [4]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN
def accuracy(y_true,y_pred):
    """Return the fraction of positions where y_pred equals y_true.

    y_true -- true labels (NumPy array or any array-like such as a list)
    y_pred -- predicted labels, same shape as y_true

    Raises ValueError when the two inputs do not have the same shape.
    """
    # Coerce to arrays so that `==` is always element-wise; comparing two
    # plain lists would yield a single bool and a meaningless 0.0 / 1.0 score.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        # Mismatched shapes would otherwise be compared wholesale or broadcast.
        raise ValueError(
            "y_true and y_pred must have the same shape, got %s and %s"
            % (y_true.shape, y_pred.shape)
        )
    # NumPy interprets True and False as 1. and 0.
    return np.mean(y_true == y_pred)
#to determine accuracy
print "Support vector machines:"
print "%.3f" % accuracy(y, run_cv(X,y,SVC))
print "Random forest:"
print "%.3f" % accuracy(y, run_cv(X,y,RF))
print "K-nearest-neighbors:"
print "%.3f" % accuracy(y, run_cv(X,y,KNN))
In [5]:
from sklearn.metrics import confusion_matrix
y = np.array(y)
class_names = np.unique(y)
confusion_matrices = [
( "Support Vector Machines", confusion_matrix(y,run_cv(X,y,SVC)) ),
( "Random Forest", confusion_matrix(y,run_cv(X,y,RF)) ),
( "K-Nearest-Neighbors", confusion_matrix(y,run_cv(X,y,KNN)) ),
]
for matrix in confusion_matrices:
print matrix[0]
print
print '\t','P0','\t','P1'
print 'A0\t',matrix[1][0][0],'\t',matrix[1][0][1]
print 'A1\t',matrix[1][1][0],'\t',matrix[1][1][1]
print
print "Recall = %.2f"%(matrix[1][1][1]/sum(matrix[1][1]))
print "Precision = %.2f"%(matrix[1][1][1]/(matrix[1][1][1]+matrix[1][0][1]))
print
In [6]:
def run_prob_cv(X, y, clf_class, **kwargs):
    """Return predicted probabilities gathered over 5 shuffled cross-validation folds.

    X         -- feature array, indexed by row with the fold index arrays
    y         -- label array
    clf_class -- classifier class whose instances provide predict_proba
    kwargs    -- keyword arguments forwarded to clf_class(...)

    The result is a float array of shape (len(y), 2). Each test fold's rows
    are filled with predict_proba output from a model fitted on that fold's
    training rows. The column count is fixed at two.
    """
    splitter = KFold(len(y), n_folds=5, shuffle=True)
    probabilities = np.zeros((len(y), 2))
    for fit_rows, held_out_rows in splitter:
        estimator = clf_class(**kwargs)
        estimator.fit(X[fit_rows], y[fit_rows])
        # Predict probabilities, not classes
        probabilities[held_out_rows] = estimator.predict_proba(X[held_out_rows])
    return probabilities
In [7]:
# Use 10 estimators so predictions are all multiples of 0.1
pred_prob = run_prob_cv(X, y, RF, n_estimators=10)
# Column 1 of the probability array is treated as the churn probability
pred_churn = pred_prob[:, 1]
is_churn = (y == 1)

# Number of times a predicted probability is assigned to an observation
counts = pd.value_counts(pred_churn)

# Observed churn rate among the observations that share each predicted value
true_prob = pd.Series({
    prob: np.mean(is_churn[pred_churn == prob])
    for prob in counts.index
})

# Put the two series side by side and turn the shared index into a column
counts = pd.concat([counts, true_prob], axis=1).reset_index()
counts.columns = ['pred_prob', 'count', 'true_prob']
counts
Out[7]:
In [8]:
from sklearn.cross_validation import train_test_split
# Split the row labels of churn_df into a training part and a held-out part.
# No random_state is passed, so the split differs from run to run.
train_index, test_index = train_test_split(churn_df.index)

# Fit an SVM that can emit class probabilities.
# NOTE(review): X and y are indexed positionally with index *labels* here;
# that matches only while churn_df keeps its default 0..n-1 index -- confirm.
clf = SVC(probability=True)
clf.fit(X[train_index], y[train_index])

# `.loc` selects by label, which is what the deprecated `.ix` did for these
# index labels; the selected rows are unchanged.
test_churn_df = churn_df.loc[test_index]
test_churn_df.to_csv("binary_churn.csv")
In [9]:
class ChurnModel:
    """Scores customers with the notebook's fitted scaler and classifier.

    Relies on the module-level names `yes_no_cols`, `features`, `scaler`
    and `clf` being defined before `execute` is called.
    """

    def execute(self,data):
        """Return churn probability and expected loss for every row of `data`.

        data -- DataFrame holding 'Area Code', 'Phone', the four charge
                columns summed below, and every column named in
                `yes_no_cols` and `features`. It is not modified.

        Returns a new DataFrame with the columns 'Area Code', 'Phone',
        'customer_worth' (sum of the charge columns), 'churn_prob' (column 1
        of clf.predict_proba) and 'expected_loss' (their product).
        """
        # Work on a private copy: converting the yes/no columns in the
        # caller's frame would make a second call compare booleans with
        # 'yes' and silently score every plan flag as False.
        data = data.copy()

        # Collect customer meta data (copied so that adding columns does not
        # write into a slice of `data`)
        response = data[['Area Code','Phone']].copy()
        charges = ['Day Charge','Eve Charge','Night Charge','Intl Charge']
        response['customer_worth'] = data[charges].sum(axis=1)

        # Same yes/no -> boolean conversion that was applied to the training features
        data[yes_no_cols] = data[yes_no_cols] == 'yes'

        # Create feature space; `.values` replaces the deprecated `as_matrix()`
        X = data[features].values.astype(float)
        X = scaler.transform(X)

        # Make prediction
        churn_prob = clf.predict_proba(X)
        response['churn_prob'] = churn_prob[:,1]

        # Calculate expected loss by churn
        response['expected_loss'] = response['churn_prob'] * response['customer_worth']

        # Return response DataFrame
        return response
# Score every row of churn_df with the fitted model.
# NOTE(review): this includes the rows clf was fitted on, not only the
# held-out rows written to binary_churn.csv -- confirm this is intended.
model_output = ChurnModel().execute(churn_df)

# Sort ascending by expected loss, then reverse the row order so that the
# largest expected loss comes first.
processed_data = model_output.sort_values(by='expected_loss', ascending=True)
processed_data = processed_data.iloc[::-1]

processed_data.to_csv("probability_churn.csv", index=False)