In [1]:
# Plotting setup for the notebook. The cells shown below reach NumPy and
# pyplot through the explicit `np` / `plt` imports in the following cell.
%pylab inline
In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import KFold
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
pd.set_option('display.max_columns', None)
In [3]:
# Read the churn data set from 'churn.csv' (path is relative to the working
# directory) and keep its column labels as a plain Python list.
churn_df = pd.read_csv('churn.csv')
col_names = list(churn_df.columns)
Columns
In [4]:
# Display the column labels collected when the CSV was loaded.
col_names
Out[4]:
In [5]:
# Preview the first rows of the raw frame (head() with its default row count).
churn_df.head()
Out[5]:
In [6]:
# Target vector: 1 where the 'Churn?' column reads 'True.', 0 otherwise.
churn_result = churn_df['Churn?']
is_churn = churn_result == 'True.'
y = np.where(is_churn, 1, 0)

# Feature frame: the raw data minus the label and three columns that the
# analysis does not use.
to_drop = ['State', 'Area Code', 'Phone', 'Churn?']
churn_feat_space = churn_df.drop(to_drop, axis=1)

# The two plan columns hold 'yes'/'no' strings; compare against 'yes' to get
# booleans. The float cast in the next cell turns them into 1.0 / 0.0.
yes_no_cols = ["Int'l Plan", "VMail Plan"]
plan_flags = churn_feat_space[yes_no_cols] == 'yes'
churn_feat_space[yes_no_cols] = plan_flags
In [7]:
features = churn_feat_space.columns

# `.values` and the builtin `float` stand in for `as_matrix()` / `np.float`,
# which newer pandas / NumPy releases no longer provide. The resulting float
# array is the same, with the boolean plan columns becoming 1.0 / 0.0.
X = churn_feat_space.values.astype(float)

# Standardize every feature column. StandardScaler comes from the imports
# cell at the top of the notebook, so it is not re-imported here.
# NOTE(review): the scaler is fit on all rows before the cross-validation
# split, so each test fold contributes to the scaling statistics. Move the
# fit inside the fold loop if strictly out-of-sample scaling is required.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Single-argument print calls produce the same text on Python 2 and 3.
print("Feature space holds %d observations and %d features" % X.shape)
print("Unique target labels: %s" % (np.unique(y),))
In [8]:
# 5-fold cross-validation of a random forest.
# Both the shuffled fold assignment and the forest itself are random; fixed
# seeds make the accuracy, report and importances below repeatable on a
# fresh kernel.
RANDOM_SEED = 0
kf = KFold(len(y), n_folds=5, shuffle=True, random_state=RANDOM_SEED)

# Start from a copy of y so the array has the right shape and dtype; each
# fold then overwrites its own test positions with predictions.
y_pred = y.copy()
for train_index, test_index in kf:
    X_train, X_test = X[train_index], X[test_index]
    y_train = y[train_index]
    # A new forest is fit per fold. After the loop, `clf` still refers to
    # the model from the last fold only.
    clf = RandomForestClassifier(n_jobs=3, random_state=RANDOM_SEED)
    clf.fit(X_train, y_train)
    y_pred[test_index] = clf.predict(X_test)
def accuracy(y_true, y_pred):
    """Return the fraction of positions at which the two label sequences agree.

    Parameters
    ----------
    y_true : array-like
        Reference labels.
    y_pred : array-like
        Predicted labels, compared element-wise with ``y_true``.

    Returns
    -------
    float
        Mean of the element-wise equality mask (1.0 = every label matches).
    """
    # Coerce to arrays first: with plain Python lists `==` compares the two
    # lists as a whole and yields a single bool, not an element-wise mask.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.mean(y_true == y_pred)
# Overall accuracy of the predictions collected across the folds.
print(accuracy(y, y_pred))
In [9]:
# `clf` is whichever classifier the cross-validation loop assigned last, so
# these importances come from the forest fit on the final fold's training
# split only.
fet_imp = clf.feature_importances_
In [10]:
# Bar chart of the feature importances, one unit-wide bar per feature column.
fig = plt.figure(figsize=(8, 4))
ax = plt.subplot(111)
bar_left = np.arange(len(fet_imp))
# align='edge' is set explicitly: the +.5 tick offset and the 0..n x-limits
# below assume bar i spans [i, i + 1]. The default alignment differs between
# matplotlib releases, which would otherwise shift the labels off the bars
# and clip the first bar.
ax.bar(bar_left, fet_imp, width=1, lw=2, align='edge')
ax.grid(False)
ax.set_xticks(bar_left + .5)
ax.set_xticklabels(churn_feat_space.columns)
plt.setp(ax.get_xticklabels(), rotation='vertical', fontsize=14)
ax.set_ylabel('Importance')
ax.set_title('Random forest feature importances')
# Trailing semicolon keeps the limits tuple out of the cell output.
ax.set_xlim(0, len(fet_imp));
Out[10]:
In [11]:
def draw_cfm(y_true, y_pred):
    """Plot the confusion matrix of ``y_pred`` against ``y_true``.

    The matrix is computed with ``confusion_matrix`` and drawn with
    ``plt.matshow``; the axes get a title, axis labels and a colour bar.
    Nothing is returned.
    """
    plt.matshow(confusion_matrix(y_true, y_pred))
    # Label through the current axes, i.e. the ones matshow just drew on.
    axes = plt.gca()
    axes.set_title('Confusion Matrix')
    axes.set_ylabel('True label')
    axes.set_xlabel('Predicted label')
    plt.colorbar()
def cfm(y_true, y_pred):
    """Return the confusion matrix of ``y_pred`` against ``y_true``.

    Thin wrapper that forwards both arguments to ``confusion_matrix``.
    """
    matrix = confusion_matrix(y_true=y_true, y_pred=y_pred)
    return matrix
def report(y_true, y_pred):
    """Return the classification report of ``y_pred`` against ``y_true``.

    Thin wrapper that forwards both arguments to ``classification_report``.
    """
    summary = classification_report(y_true=y_true, y_pred=y_pred)
    return summary
# Evaluate the cross-validated predictions: classification report,
# confusion matrix as numbers, then the same matrix as a plot.
print(report(y, y_pred))
print(cfm(y, y_pred))
draw_cfm(y, y_pred)