In [368]:
# Data Wrangling & Visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's plain "white" style for the figures in this notebook
sns.set_style("white")
# Render matplotlib figures inline in the notebook output
%matplotlib inline

# Preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Feature Selection
from sklearn.feature_selection import RFE

# Modeling
from sklearn.linear_model import LogisticRegressionCV, LassoCV
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier

# Parameter tuning
# NOTE(review): `sklearn.grid_search` was deprecated in scikit-learn 0.18 and
# removed in 0.20; its contents now live in `sklearn.model_selection`.
# This import only works on an old scikit-learn -- confirm the pinned version.
from sklearn import grid_search

# Validation
# NOTE(review): `sklearn.cross_validation` was likewise removed in scikit-learn 0.20
# in favour of `sklearn.model_selection`; note that the newer `KFold` takes
# different constructor arguments, so migrating is not a pure rename.
from sklearn.cross_validation import train_test_split, KFold
from sklearn.metrics import confusion_matrix, classification_report

Predicting Telecom Customer Churn

  1. Data Collection & Processing
  2. Exploratory Data Analysis
  3. Feature Engineering
  4. Feature Selection Using Lasso and Logistic Regression + Random Forest
  5. Modeling
    5.1 Initial Results
    5.2 Tuning (Grid Search)
  6. Validation
  7. Churn Measurements
    7.1 Losses, Expected Losses, High-Probability Churn Candidates

Data Collection & Processing


In [362]:
# Download dataset
# ! wget https://raw.githubusercontent.com/EricChiang/churn/master/data/churn.csv
# ! mkdir data
# ! mv churn.csv ./data/churn.csv

In [260]:
# Creating churn dataframe
# The path is relative to the notebook's working directory; the commented-out
# download cell above shows how data/churn.csv was obtained.
churn = pd.read_csv('data/churn.csv')

In [261]:
# Strip punctuation from the two awkward column names in a single rename
churn.rename(
    columns={
        'Churn?': 'Churn',
        "Int'l Plan": "Intl Plan",
    },
    inplace=True,
)

In [262]:
# Summarize column dtypes and non-null counts for the loaded frame
churn.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 3333 entries, 0 to 3332
Data columns (total 21 columns):
State             3333 non-null object
Account Length    3333 non-null int64
Area Code         3333 non-null int64
Phone             3333 non-null object
Intl Plan         3333 non-null object
VMail Plan        3333 non-null object
VMail Message     3333 non-null int64
Day Mins          3333 non-null float64
Day Calls         3333 non-null int64
Day Charge        3333 non-null float64
Eve Mins          3333 non-null float64
Eve Calls         3333 non-null int64
Eve Charge        3333 non-null float64
Night Mins        3333 non-null float64
Night Calls       3333 non-null int64
Night Charge      3333 non-null float64
Intl Mins         3333 non-null float64
Intl Calls        3333 non-null int64
Intl Charge       3333 non-null float64
CustServ Calls    3333 non-null int64
Churn             3333 non-null object
dtypes: float64(8), int64(8), object(5)
memory usage: 572.9+ KB

In [263]:
# Preview the first two rows to see the raw value formats (e.g. 'yes'/'no', 'False.')
churn.head(2)


Out[263]:
State Account Length Area Code Phone Intl Plan VMail Plan VMail Message Day Mins Day Calls Day Charge ... Eve Calls Eve Charge Night Mins Night Calls Night Charge Intl Mins Intl Calls Intl Charge CustServ Calls Churn
0 KS 128 415 382-4657 no yes 25 265.1 110 45.07 ... 99 16.78 244.7 91 11.01 10.0 3 2.7 1 False.
1 OH 107 415 371-7191 no yes 26 161.6 123 27.47 ... 103 16.62 254.4 103 11.45 13.7 3 3.7 1 False.

2 rows × 21 columns


In [264]:
# Binarize the yes/no plan flags and the 'True.'/'False.' churn label.
# Only the three columns that hold these tokens are touched, so an unrelated
# column that happened to contain 'no' or 'yes' is never rewritten.
# The explicit int64 cast avoids relying on pandas silently downcasting object
# columns after `replace`, and raises if an unexpected token is left behind.
# Re-running the cell is safe: already-numeric columns pass through unchanged.
binary_map = {'no': 0, 'yes': 1, 'False.': 0, 'True.': 1}
for col in ['Intl Plan', 'VMail Plan', 'Churn']:
    churn[col] = churn[col].replace(binary_map).astype('int64')

In [265]:
# Snapshot of the frame (contact columns still present) used for the exploratory plots
X_exploratory = churn.copy()
# Set the customer contact columns aside as their own frame
customer_contact = churn.loc[:, ['Area Code', 'Phone', 'State']]

In [266]:
# Replace the State labels with the integer codes produced by a fitted LabelEncoder
state_encoder = LabelEncoder()
churn['State'] = state_encoder.fit_transform(churn['State'])

In [267]:
# Remove the customer contact columns from the modeling frame (in place)
drop_cols = ['Area Code', 'Phone', 'State']
churn.drop(labels=drop_cols, axis=1, inplace=True)

In [268]:
# Setting up for modeling: separate the target from the feature matrix.
# `DataFrame.drop(..., inplace=True)` returns None, so its result must never be
# bound to X; `pop` removes the column from `churn` and returns it in one step.
# The guard keeps the cell re-runnable after 'Churn' has already been removed
# (y then keeps the value from the earlier run).
if "Churn" in churn.columns:
    y = churn.pop("Churn")
X = churn  # same object as `churn`, which now holds only the feature columns
columns = X.columns

In [269]:
# Sanity check: rows x feature columns of the design matrix
X.shape


Out[269]:
(3333, 17)

In [270]:
# Sanity check: the target should have one entry per row of X
y.shape


Out[270]:
(3333,)

Exploratory Data Analysis


In [392]:
# Class Imbalance
# Number of customers in each churn class (0 = stayed, 1 = churned)
X_exploratory.groupby('Churn')['Churn'].count()


Out[392]:
Churn
0    2850
1     483
Name: Churn, dtype: int64

In [403]:
# Bar chart of the number of customers in each churn class
(
    X_exploratory
    .groupby('Churn')['Churn']
    .count()
    .plot(kind='bar', title='Churn Class Imbalance')
)
sns.despine();



In [369]:
# Account length is distributed similarly for both churn classes, with no outliers to filter
X_exploratory.boxplot(by='Churn', column='Account Length');



In [408]:
# Account Length is approximately normally distributed
sns.violinplot(x='Churn', y='Account Length', data=X_exploratory)
plt.title('Account Length by Churn')
sns.despine();



In [371]:
# Day Mins clearly separates the churn classes; there are no outliers worth filtering
X_exploratory.boxplot(by="Churn", column="Day Mins");



In [409]:
# Churned customers show a bimodal Day Mins distribution: they talked either a lot or very little
sns.violinplot(x='Churn', y='Day Mins', data=X_exploratory)
plt.title('Day Mins by Churn')
sns.despine();



In [370]:
# Count customers per (State, Churn) pair, then pivot the churn classes into columns
churn2 = (
    X_exploratory
    .groupby(['State', 'Churn'])['Churn']
    .count()
    .unstack()
)
# Stacked horizontal bars: one bar per state, split by churn class
churn2.plot(
    kind='barh',
    stacked=True,
    figsize=(16, 12),
    title='Customers Churned By State',
)
sns.despine();



In [415]:
# Account-level features plus the churn label, for the first pair plot
churn_subset1 = X_exploratory[[
    'Account Length',
    'Intl Plan',
    'VMail Plan',
    'CustServ Calls',
    'Churn',
]]

In [412]:
# Usage features (minutes / calls / charges by period) plus the churn label,
# for the second pair plot
churn_subset2 = X_exploratory[[
    'Day Mins', 'Day Calls', 'Day Charge',
    'Eve Mins', 'Eve Calls', 'Eve Charge',
    'Night Mins', 'Night Calls', 'Night Charge',
    'Intl Mins', 'Intl Calls', 'Intl Charge',
    'Churn',
]]

In [416]:
# Pairwise relationships among the account-level features, colored by churn class
sns.pairplot(data=churn_subset1, hue='Churn');



In [413]:
# Pairwise relationships among the usage features, colored by churn class
sns.pairplot(data=churn_subset2, hue='Churn');