In [368]:
# Data Wrangling & Visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("white")
%matplotlib inline
# Preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder
# Feature Selection
from sklearn.feature_selection import RFE
# Modeling
from sklearn.linear_model import LogisticRegressionCV, LassoCV
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
# Parameter tuning
from sklearn import grid_search
# Validation
from sklearn.cross_validation import train_test_split, KFold
from sklearn.metrics import confusion_matrix, classification_report
In [362]:
# Download dataset
# ! wget https://raw.githubusercontent.com/EricChiang/churn/master/data/churn.csv
# ! mkdir data
# ! mv churn.csv ./data/churn.csv
In [260]:
# Creating churn dataframe
churn = pd.read_csv('data/churn.csv')
In [261]:
# Column name cleanup
churn.rename(columns = {'Churn?': 'Churn'}, inplace=True)
churn.rename(columns = {"Int'l Plan":"Intl Plan"}, inplace=True)
In [262]:
churn.info()
In [263]:
churn.head(2)
Out[263]:
In [264]:
# Binarizing all columns at once
churn.replace('no', 0, inplace=True)
churn.replace('yes', 1, inplace=True)
churn.replace('False.', 0, inplace=True)
churn.replace('True.', 1, inplace=True)
In [265]:
X_exploratory = churn.copy()
# Dropping customer contact columns
customer_contact = churn[['Area Code', 'Phone', 'State']]
In [266]:
# Encoding State into numerical representation
state_encoder = LabelEncoder()
churn.State = state_encoder.fit_transform(churn.State)
In [267]:
# Dropping customer contact columns
drop_cols = ['Area Code', 'Phone', 'State']
churn.drop(drop_cols, axis=1, inplace=True)
In [268]:
# Setting up for modeling
y = churn.Churn
# Drop the target in place, then point X at the remaining feature frame.
# (The original assigned the return value of an inplace drop to X, which
# is None; it was then silently overwritten by the next line.)
churn.drop("Churn", axis=1, inplace=True)
X = churn
columns = X.columns
In [269]:
X.shape
Out[269]:
In [270]:
y.shape
Out[270]:
In [392]:
# Class Imbalance
X_exploratory.groupby('Churn')['Churn'].count()
Out[392]:
In [403]:
X_exploratory.groupby('Churn')['Churn'].count().plot(kind='bar')
plt.title('Churn Class Imbalance')
sns.despine();
In [369]:
# No extreme churn distribution shift by account length; no outliers
X_exploratory.boxplot(column='Account Length', by='Churn');
In [408]:
# Normal Distribution for Account Lengths
sns.violinplot(x=X_exploratory.Churn, y=X_exploratory['Account Length'])
plt.title('Account Length by Churn')
sns.despine();
In [371]:
# Day Mins is definitely predictive; No outliers to consider filtering
X_exploratory.boxplot(column="Day Mins", by="Churn");
In [409]:
# Interestingly bimodal distribution on churn customers; they either talked a lot or not much
sns.violinplot(x=X_exploratory.Churn, y=X_exploratory['Day Mins'])
plt.title('Day Mins by Churn')
sns.despine();
In [370]:
churn2 = X_exploratory.groupby(['State','Churn'])['Churn'].count().unstack()
churn2.plot(kind='barh',stacked=True, figsize=(16,12), title='Customers Churned By State')
sns.despine();
In [415]:
churn_subset1 = X_exploratory[['Account Length', 'Intl Plan', 'VMail Plan', 'CustServ Calls', 'Churn']]
In [412]:
churn_subset2 = X_exploratory[['Day Mins', 'Day Calls', 'Day Charge', 'Eve Mins', 'Eve Calls', \
'Eve Charge', 'Night Mins', 'Night Calls', 'Night Charge', 'Intl Mins', \
'Intl Calls', 'Intl Charge', 'Churn']]
In [416]:
sns.pairplot(churn_subset1, hue='Churn');
In [413]:
sns.pairplot(churn_subset2, hue='Churn');
Expected perfect linear relationships between minutes and charges
Normalizing scale of numerical features; particularly important for linear models and SVMs
In [169]:
scaler = StandardScaler()
In [170]:
X = scaler.fit_transform(X)
In [70]:
# L1 regularization penalizes and removes unpredictive features
lasso = LassoCV()
In [71]:
lasso.fit(X, y)
Out[71]:
In [72]:
columns
Out[72]:
In [73]:
zip(enumerate(columns),lasso.coef_)
Out[73]:
In [35]:
# zip(enumerate(columns),lasso.coef_)
In [34]:
X_LassoSubset = X[:,[0,1,2,4,5,7,10,12,13,14,15,16]]
Lasso removed some of the features that were perfectly collinear with one another; creating a subset of X for the modeling stage
In [74]:
# Baseline
model_scoring(X, y, LogisticRegressionCV())
In [75]:
model_scoring(X_LassoSubset, y, LogisticRegressionCV())
Performance-wise, Lasso struggles with the binary features and does not identify churn cases very well
In [77]:
logit = LogisticRegressionCV()
In [78]:
# Using sklearn feature selector
feature_selector = RFE(estimator=logit, step=1).fit(X, y)
In [320]:
# Listing all columns for reference
columns
Out[320]:
In [79]:
# Columns with a 1 are predictive
zip(feature_selector.ranking_,columns)
Out[79]:
In [58]:
# zip(feature_selector.ranking_,columns)
In [42]:
X_LogitSubset = X[:,[1,2,4,6,7,14,15,16]]
In [74]:
# Baseline
model_scoring(X, y, LogisticRegressionCV())
Modest improvement with smaller subset of features
In [83]:
model_scoring(X_LogitSubset, y, LogisticRegressionCV())
Overall, logistic regression served as an important baseline and gave a feel for the important features, but it ultimately performed poorly due to the non-linear relationships present in the data and the presence of several binary features
In [280]:
def trees_featureImportances(X, y, model):
"""Returns Feature Importances on Tree-based Classifiers along with visualization"""
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
classifier = model
classifier.fit(X_train, y_train)
classifier_report = model_scoring(X, y, model)
feature_importances = classifier.feature_importances_
feature_ranks = np.argsort(feature_importances)[::-1]
rank_std = np.std([tree.feature_importances_ for tree in classifier.estimators_], axis=0)
plt.figure(figsize=(16,8))
plt.bar(np.arange(X.shape[1]), feature_importances[feature_ranks], yerr= rank_std[feature_ranks], color='r')
plt.xticks(np.arange(X.shape[1]) + 0.25, columns[feature_ranks], rotation=45)
plt.title('Features by Predictive Importance')
plt.ylabel('Relative Feature Importance');
print "{}".format(classifier_report)
print "Feature Importances: "
for f in range(X.shape[1]):
print "Feature {}: {}, Relative Importance {:.3f}, Std: {:.3f}".format(f+1, columns[feature_ranks[f]], feature_importances[feature_ranks[f]], rank_std[feature_ranks[f]])
In [327]:
trees_featureImportances(X_LassoSubset, y, RandomForestClassifier(n_estimators=100))
Note: Dropped state column at this point. 2 out of 3 feature selection methods did not find it important and there are too many factors wrapped up in state (e.g. incomes, coverage infrastructure, etc.) to make it worth keeping.
In [ ]:
# Thinking Logistic Regression, Random Forest / other tree-based model because of mixed feature types, and one-class
# SVMs due to unbalanced class issue
In [20]:
# Printing columns again for reference
columns
Out[20]:
In [25]:
MODELS = {'Random Forest': RandomForestClassifier(), \
'Support Vector Machine': svm.SVC()}
PARAMETERS = {'Random Forest':{'n_estimators': [10,20,50,100]}, \
'Support Vector Machine': {'kernel':['linear', 'rbf'], 'C':[1,3,5,7]}}
def gridsearch_models(models, parameters, scoring_methods):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=0)
for name, model in models.items():
for score in scoring_methods:
print "Tuning {} model for {} score \n".format(name, score)
optimal_model = grid_search.GridSearchCV(model, parameters[name], cv=5, scoring='%s_weighted' % score)
optimal_model.fit(X_train, y_train)
print "Best parameters are: \n {} \n".format(optimal_model.best_params_)
print "Grid scores on test set: \n"
for params, mean_score, scores in optimal_model.grid_scores_:
print("{:.3f}, std: {:.3f} for {}".format(mean_score, scores.std() * 2, params))
print "\n Classification Report: \n {} \n".format(classification_report(y_test, optimal_model.predict(X_test)))
In [26]:
gridsearch_models(MODELS, PARAMETERS, ['f1','precision', 'recall'])
Conscious choice not to artificially balance classes to control for in-sample class imbalance due to assumption that sample is representative of the population; my focus is more on making a model more heavily weighted toward generalizing out-of-sample
In [171]:
def model_scoring(X, y, model):
kf = KFold(len(X), n_folds=10, shuffle=True)
for train_index, test_index in kf:
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
classifier = model
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print "Classification Report (10-fold cv): \n\n {}".format(classification_report(y_test, y_pred))
Random Forest Classifier: Performance on X, X_LassoSubset, X_LogitSubset
In [172]:
model_scoring(X, y, RandomForestClassifier(n_estimators=100))
In [173]:
model_scoring(X_LassoSubset, y, RandomForestClassifier(n_estimators=100))
In [174]:
model_scoring(X_LogitSubset, y, RandomForestClassifier(n_estimators=100))
SVM: Performance on X, X_LassoSubset, X_LogitSubset
In [175]:
model_scoring(X, y, svm.SVC(C=5, kernel='rbf'))
In [176]:
model_scoring(X_LassoSubset, y, svm.SVC(C=5, kernel='rbf'))
In [177]:
model_scoring(X_LogitSubset, y, svm.SVC(C=5, kernel='rbf'))
I selected the Random Forest Classifier because of its high predictive scores, particularly its recall score (proportion of churn cases correctly identified as having churned). While the subset of features produced by Lasso Regression was competitive, I ultimately opted for the inclusion of all non customer contact related features. Also, because of the model's interpretability and ability to parallelize. Ultimately, more data is needed to determine the ability of my model to generalize well
In [249]:
trees_featureImportances(X,y, RandomForestClassifier(n_estimators=100))
In [273]:
# NOTE(review): rf, X_test, and y_test are defined further down in this
# notebook (cells In [217]-[219]); this cell only works because of the
# original out-of-order execution (In [273] ran after In [220]).
# Reorder the cells for a clean Restart & Run All.
rf_cm = confusion_matrix(y_test, rf.predict(X_test))
print rf_cm
In [354]:
def plot_confusion_matrix(cm, labels):
    """Plot a confusion matrix as a heatmap with per-cell counts.

    cm: square array from sklearn.metrics.confusion_matrix
        (rows = true class, columns = predicted class).
    labels: class names in the same order as cm's rows/columns.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(cm, cmap=plt.cm.Blues)
    plt.title('Random Forest Confusion Matrix\n')
    plt.tight_layout()
    fig.colorbar(cax)
    ax.set_xticklabels([''] + labels)
    ax.set_yticklabels([''] + labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('True Class')
    # Generalized from the hard-coded 2x2 (xrange(2)) to any square cm
    n = len(labels)
    for i in range(n):
        for j in range(n):
            # annotate's xy is (x, y) = (column, row), so cm[i][j] goes at
            # column j, row i. The original used xy=(i, j), which drew the
            # off-diagonal counts transposed relative to the heatmap cells.
            ax.annotate(str(cm[i][j]), xy=(j, i), color='red')
    plt.show()
In [355]:
plot_confusion_matrix(rf_cm, ['No_Churn','Churn'])
In [217]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.34, random_state=0)
In [218]:
rf = RandomForestClassifier(n_estimators=100)
In [219]:
rf.fit(X_train, y_train)
Out[219]:
In [220]:
model_scoring(X, y, rf)
In [246]:
# customer_worth = total charges
# sort(expected losses) = customer_worth * churn_probability
customer_LTV = np.zeros((len(y),1))
customer_LTV[:,0] = churn['Day Charge'] + churn['Eve Charge'] + churn['Intl Charge'] + churn['Night Charge']
In [247]:
priorities = pd.DataFrame(data=customer_LTV,columns=['Customer_LTV'])
In [248]:
churn.head(2)
Out[248]:
In [249]:
# Computing predicted probabilities
pred_probs = rf.predict_proba(X)
pred_probs.shape
Out[249]:
In [250]:
priorities.head()
Out[250]:
In [251]:
# Actual Churn
priorities['Churn_Act'] = y.copy()
# Predicted Churn
priorities["Churn_Pred"] = rf.predict(X)
# Predicted Probability of No Churn
priorities['No_Churn_Prob'] = pred_probs[:,0]
# Predicted Probability of Churn
priorities['Churn_Prob'] = pred_probs[:,1]
In [252]:
priorities.head()
Out[252]:
In [253]:
# Expected Loss (Predicted Probability of Churn * Customer_LTV)
priorities['Expected_Loss'] = priorities['Customer_LTV'] * priorities['Churn_Prob']
In [254]:
# Top 10 Customers by Expected Loss
priorities[priorities.Expected_Loss > 0].sort_values(by="Expected_Loss", ascending=False)[:10]
Out[254]:
In [255]:
priorities[priorities.Churn_Act == 1].sum()
Out[255]:
In [256]:
# (False Positives) Customers Predicted to Churn but Haven't... yet
priorities[(priorities.Churn_Act == 0) & (priorities.Churn_Pred == 1)].sort_values(by="Expected_Loss", ascending=False)
Out[256]:
In [257]:
priorities[(priorities.Churn_Act == 0) & (priorities.Churn_Pred == 1)]['Customer_LTV'].sum()
Out[257]:
In [258]:
priorities[(priorities.Churn_Act == 0) & (priorities.Churn_Pred == 1)]['Expected_Loss'].sum()
Out[258]:
In [271]:
# Contact information for churn candidates
customer_contact.iloc[[635,2801,3140]]
Out[271]:
In [272]:
# Churn candidate behavior information for use on call
churn.iloc[[635,2801,3140]]
Out[272]: