Get your data here. The data relates to direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact with the same client was required in order to assess whether the product (a bank term deposit) would be subscribed ('yes') or not ('no'). There are four datasets:
1) bank-additional-full.csv with all examples (41188) and 20 inputs, ordered by date (from May 2008 to November 2010)
2) bank-additional.csv with 10% of the examples (4119), randomly selected from 1), and 20 inputs.
3) bank-full.csv with all examples and 17 inputs, ordered by date (older version of this dataset with fewer inputs).
4) bank.csv with 10% of the examples and 17 inputs, randomly selected from 3) (older version of this dataset with fewer inputs).
The smaller datasets are provided to test more computationally demanding machine learning algorithms (e.g., SVM).
The classification goal is to predict whether the client will subscribe to a term deposit (variable y: yes/no).
(Hint: sklearn's LabelEncoder will be useful for the categorical features.)
In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.cross_validation as cv
import sys
import re
import os
import pprint
import random
from scipy import stats
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, matthews_corrcoef
from sklearn.grid_search import GridSearchCV
from sklearn.svm import LinearSVC, SVC
from sklearn import preprocessing
from collections import Counter
from datetime import datetime
from fuzzywuzzy import fuzz
from sklearn.neighbors import KNeighborsClassifier
from sklearn.learning_curve import learning_curve
from sklearn.ensemble import RandomForestClassifier
pd.set_option('display.max_rows', 25)
pd.set_option('display.precision', 4)
np.set_printoptions(precision = 4)
%matplotlib inline
print 'Python version ' + sys.version
print 'Pandas version ' + pd.__version__
print 'Numpy version ' + np.__version__
In [2]:
bank = pd.read_csv('./bank-additional/bank-additional-full.csv', delimiter = ';')
In [3]:
bank.head()
Out[3]:
In [4]:
bank.info()
In [5]:
# Missing values - none?! Maybe they are coded with a placeholder. Let's check the unique values per column.
bank.isnull().sum()
Out[5]:
In [6]:
# Get column names.
colNames = bank.columns
In [7]:
# Print unique values per column. Missing values seem coded with 'unknown'.
for col in colNames:
    print col, set(bank[col])
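Before imputing, it is worth quantifying how many 'unknown' entries each column carries, to see how much the imputation will change the data. A minimal sketch (select_dtypes assumes a reasonably recent pandas):
In [ ]:
# Count 'unknown' entries per categorical (object-dtype) column.
for col in bank.select_dtypes(include=['object']).columns:
    n_unknown = (bank[col] == 'unknown').sum()
    print col, n_unknown, '(%.1f%%)' % (100.0 * n_unknown / len(bank))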
In [8]:
# Impute missing values.
col_dist = {}
def get_col_dist(col_name):
    excl_null_mask = bank[col_name] != 'unknown'
    row_count = bank[excl_null_mask][col_name].size
    value_freqs = bank[excl_null_mask][col_name].value_counts() / row_count
    col_data = {}
    col_data['prob'] = value_freqs.values
    col_data['values'] = value_freqs.index.values
    return col_data
In [9]:
col_dist['job'] = get_col_dist('job')
col_dist['marital'] = get_col_dist('marital')
col_dist['education'] = get_col_dist('education')
col_dist['default'] = get_col_dist('default')
col_dist['housing'] = get_col_dist('housing')
col_dist['loan'] = get_col_dist('loan')
In [10]:
print col_dist
In [11]:
def impute_cols(val, options):
    # Replace 'unknown' with a random draw from the observed value distribution.
    if val == 'unknown':
        return np.random.choice(options['values'], p=options['prob'])
    return val
In [12]:
def impute_job(val):
    return impute_cols(val, col_dist['job'])
def impute_marital(val):
    return impute_cols(val, col_dist['marital'])
def impute_edu(val):
    return impute_cols(val, col_dist['education'])
def impute_default(val):
    return impute_cols(val, col_dist['default'])
def impute_housing(val):
    return impute_cols(val, col_dist['housing'])
def impute_loan(val):
    return impute_cols(val, col_dist['loan'])
In [13]:
bank.job = bank.job.map(impute_job)
bank.marital = bank.marital.map(impute_marital)
bank.education = bank.education.map(impute_edu)
bank.default = bank.default.map(impute_default)
bank.housing = bank.housing.map(impute_housing)
bank.loan = bank.loan.map(impute_loan)
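A quick sanity check, as a sketch, that the imputation removed every 'unknown':
In [ ]:
# Confirm no 'unknown' values remain in any object-dtype column.
for col in bank.select_dtypes(include=['object']).columns:
    assert 'unknown' not in set(bank[col]), col
print 'No unknown values left.'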
In [14]:
bank.head()
Out[14]:
In [15]:
# Numeric features.
numFeats = ['age', 'duration', 'campaign', 'pdays', 'previous', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
# Numeric features without pdays (strongly related to target).
numFeatsR = ['age', 'duration', 'campaign', 'previous', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
# Some descriptives.
bank[numFeats].describe()
Out[15]:
In [16]:
# Feature pdays looks weird. Checked the documentation: 999 means the client was not previously contacted.
# Recode into bins stating whether and when someone was contacted before (e.g., never, same day, etc.).
In [18]:
# Create artificial bins for pdays.
def recode_pdays(val):
    if val == 999:
        return 'never contacted'
    elif val == 0:
        return 'same day'
    elif 1 <= val <= 7:
        return 'within 1 week'
    elif 8 <= val <= 14:
        return 'between 1 and 2 weeks'
    elif 15 <= val <= 21:
        return 'between 2 and 3 weeks'
    else:
        return 'more than 3 weeks'
# Recode.
bank['pdays_cat'] = bank.pdays.map(recode_pdays)
# Drop pdays.
bank.drop('pdays', axis = 1, inplace = True)
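The same binning could be written more compactly with pandas' cut. A sketch of that alternative, assuming a hypothetical copy of the original column (pdays_orig) taken before the recode above:
In [ ]:
# Hypothetical alternative: bin pdays with pd.cut instead of a per-value function.
# pdays_orig: a copy of bank.pdays made before the recode above (hypothetical).
days = pdays_orig.replace(999, np.nan)
bins = [-1, 0, 7, 14, 21, np.inf]
labels = ['same day', 'within 1 week', 'between 1 and 2 weeks',
          'between 2 and 3 weeks', 'more than 3 weeks']
pdays_cat = pd.cut(days, bins=bins, labels=labels).astype(object)
pdays_cat[days.isnull()] = 'never contacted'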
In [19]:
# Preprocessing - label encoder.
# Categorical features.
catFeats = ['y', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome',
'pdays_cat']
# Categorical features without target.
catFeatsR = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome',
'pdays_cat']
In [20]:
# A single shared LabelEncoder does not work here: each feature needs its own
# encoder, otherwise the fitted classes get overwritten and the encodings
# cannot be inverted later.
label_encoders = {}
for cat in catFeats:
    label_encoders[cat] = preprocessing.LabelEncoder()
    bank[cat] = label_encoders[cat].fit_transform(bank[cat])
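Keeping one encoder per column means the original labels stay recoverable. Note also that label encoding imposes an arbitrary order on the codes, so a distance-based model like KNN may read structure into them that isn't there. A sketch of both points (the get_dummies line is a hypothetical alternative, not run here):
In [ ]:
# Recover the original labels for a column from its dedicated encoder.
print label_encoders['job'].inverse_transform(bank['job'].values[:5])
# Hypothetical alternative: one-hot encode instead of label-encode, so KNN
# does not treat the integer codes as ordered.
# bank_ohe = pd.get_dummies(bank, columns=catFeatsR)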
In [18]:
bank.head()
Out[18]:
In [24]:
# Calls by month. Some months see more calls than others.
plot = bank['month'].value_counts().order(ascending = False).plot(kind = 'bar', figsize = (10, 10), color = 'r')
plot.set_title('Last contact with customer by month')
plot.set_xlabel('Month')
plot.set_ylabel('Number of customers contacted');
In [25]:
# Calls by day of week: no apparent difference in call volume by day of week.
plot = bank['day_of_week'].value_counts().order(ascending = False).plot(kind = 'bar', figsize = (10, 10), color = 'grey')
plot.set_title('Last customer contact by day of week')
plot.set_xlabel('Day of week')
plot.set_ylabel('Number of customers')
Out[25]:
In [26]:
bank.age.hist(bins = 10) # Age is a bit skewed, which makes sense, as more people probably apply for loans when they're younger.
Out[26]:
In [27]:
bank.duration.hist(bins = 10) # Interesting - call duration is skewed, and most calls are very short.
print 'Median duration of call:', bank.duration.median() # Duration is in seconds, so the median is about 3 minutes.
In [28]:
print 'Spearman correlation age, call duration', stats.spearmanr(bank.age, bank.duration) # No association between age and call duration.
In [29]:
bank.campaign.hist(bins = 10) # Limited number of contacts.
print bank.campaign.median() # Median number of contacts is 2.
In [30]:
print 'Spearman corr age, campaign', stats.spearmanr(bank.age, bank.campaign)
print 'Spearman corr duration, campaign', stats.spearmanr(bank.duration, bank.campaign)
In [31]:
# Scatter plot of call duration, campaign.
plt.scatter(bank.duration, bank.campaign)
Out[31]:
In [32]:
# Correlations between numeric non-binary features. Exclude pdays, as the 999 sentinel could bias the estimates.
# Some look significant; double-check their p values below.
bank[numFeatsR].corr(method = 'spearman')
Out[32]:
In [33]:
print 'p value of Spearman correlation between'
print 'age, consumer confidence idx:', stats.spearmanr(bank['age'], bank['cons.conf.idx'])[1]
print 'campaign, number of employees:', stats.spearmanr(bank.campaign, bank['nr.employed'])[1]
print 'number of previous contacts, consumer confidence idx:', stats.spearmanr(bank.previous, bank['cons.conf.idx'])[1]
print 'number of previous contacts, consumer price idx:', stats.spearmanr(bank.previous, bank['cons.price.idx'])[1]
print 'number of previous contacts, Euribor rate:', stats.spearmanr(bank.previous, bank['euribor3m'])[1]
print 'number of previous contacts, number of employees:', stats.spearmanr(bank.previous, bank['nr.employed'])[1]
print 'consumer price idx, consumer confidence idx:', stats.spearmanr(bank['cons.price.idx'], bank['cons.conf.idx'])[1]
print 'consumer price idx, number of employees:', stats.spearmanr(bank['cons.price.idx'], bank['nr.employed'])[1]
print 'Euribor, consumer confidence index:', stats.spearmanr(bank.euribor3m, bank['cons.conf.idx'])[1]
print 'Euribor, number of employees:', stats.spearmanr(bank.euribor3m, bank['nr.employed'])[1]
In [34]:
# Correlations by target: Differ for some features.
bank[bank.y == 0][numFeatsR].corr(method = 'spearman')
Out[34]:
In [35]:
bank[bank.y == 1][numFeatsR].corr(method = 'spearman')
Out[35]:
In [36]:
# Factorplot by target.
import seaborn as sns
sns.factorplot('y', data = bank, palette = 'Greens') # Target is mostly 0 - the class imbalance could bias classification.
Out[36]:
In [37]:
sns.factorplot('y', data = bank, palette = 'Blues', hue = 'loan')
Out[37]:
In [38]:
sns.factorplot('y', data = bank, palette = 'Reds', hue = 'default')
Out[38]:
In [39]:
sns.factorplot('y', data = bank, palette = 'Greens', hue = 'marital')
Out[39]:
In [40]:
# Get target variable.
target = bank['y']
In [41]:
target.shape
Out[41]:
In [42]:
# Drop the target.
# Drop duration as the documentation suggests: it is only known after the call, so it leaks the outcome.
bank.drop(['y', 'duration'], axis = 1, inplace = True)
In [43]:
features = bank.as_matrix()
In [44]:
features.shape
Out[44]:
In [45]:
# Split.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.2, random_state = 12)
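Given the class imbalance seen above, a stratified split would keep the minority-class share equal in both folds. A sketch using the era-appropriate StratifiedShuffleSplit (newer scikit-learn versions accept a stratify argument on train_test_split directly):
In [ ]:
from sklearn.cross_validation import StratifiedShuffleSplit
# Stratified variant of the split above: both folds keep the same yes/no ratio.
sss = StratifiedShuffleSplit(target, n_iter=1, test_size=0.2, random_state=12)
for train_idx, test_idx in sss:
    X_train_s, X_test_s = features[train_idx], features[test_idx]
    y_train_s, y_test_s = target.values[train_idx], target.values[test_idx]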
In [46]:
knn = KNeighborsClassifier()
In [47]:
# Grid Search for KNN parameters to be used.
param_knn = {'n_neighbors' : np.arange(10, 101, 5)}
knn_gs = GridSearchCV(knn, param_grid = param_knn)
knn_gs.fit(X_train, y_train)
print 'Best parameters:', knn_gs.best_params_
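KNN is distance based, so features on large scales (e.g. nr.employed, around 5000) dominate the Euclidean distance. A common fix, sketched here, is to standardize inside a Pipeline so the grid search rescales on each CV fold rather than leaking test-fold statistics:
In [ ]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Scale features before KNN; the pipeline refits the scaler per CV fold.
knn_pipe = Pipeline([('scale', StandardScaler()), ('knn', KNeighborsClassifier())])
param_pipe = {'knn__n_neighbors': np.arange(10, 101, 5)}
knn_pipe_gs = GridSearchCV(knn_pipe, param_grid=param_pipe)
# knn_pipe_gs.fit(X_train, y_train)  # sketch only, not run here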
In [48]:
# Run knn with best parameters.
knn = KNeighborsClassifier(n_neighbors = 50)
knn.fit(X_train, y_train)
Out[48]:
In [49]:
# Score.
knn.score(X_test, y_test)
Out[49]:
In [50]:
# Predicted y.
y_pred_knn = knn.predict(X_test)
In [51]:
# Confusion matrix.
print 'Confusion matrix:\n', confusion_matrix(y_test, y_pred_knn)
In [52]:
# Plot confusion matrix.
plt.matshow(confusion_matrix(y_test, y_pred_knn))
plt.title('KNN Confusion matrix')
plt.colorbar()
plt.ylabel('True')
plt.xlabel('Predicted')
Out[52]:
In [53]:
# Classification report.
print 'classification report:\n', classification_report(y_test, y_pred_knn) # Works well for the 0 class, not so well for the 1 class.
In [54]:
# Learning curve. Thanks to Chad for the code.
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.
    title : string
        Title for the chart.
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.
    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum y-values plotted.
    cv : integer, cross-validation generator, optional
        If an integer is passed, it is the number of folds (defaults to 3).
        Specific cross-validation objects can be passed, see the
        sklearn.cross_validation module for the list of possible objects.
    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt
In [55]:
# Plot the learning curve to visualize the bias-variance trade-off in terms of training samples.
plot_learning_curve(KNeighborsClassifier(n_neighbors = 50), 'KNN', X_train, y_train, ylim=None, cv=None, n_jobs=-1)
Out[55]:
In [56]:
# Another plot. This time visualizing best parameter in terms of n_neighbors.
train_scores = []
test_scores = []
ks = range(10, 101, 5)
for i in ks:
    knn = KNeighborsClassifier(n_neighbors = i)
    knn.fit(X_train, y_train)
    train_scores.append(knn.score(X_train, y_train))
    test_scores.append(knn.score(X_test, y_test))
plt.plot(ks, train_scores, label="training scores")
plt.plot(ks, test_scores, label="validation scores")
plt.legend(loc="best")
plt.tight_layout()
Out[56]:
In [57]:
# Look at probabilities.
bank_y_pred_df = pd.DataFrame(knn.predict_proba(X_test))
bank_y_pred_df['Predicted'] = y_pred_knn
bank_y_pred_df['True'] = y_test
# Show only FP/ FN examples.
print 'FP, FN examples:\n', bank_y_pred_df[bank_y_pred_df['Predicted'] != bank_y_pred_df['True']].head(20)
print 'Total FP, FN:\n', len(bank_y_pred_df[bank_y_pred_df['Predicted'] != bank_y_pred_df['True']])
In [58]:
# The Matthews coefficient is a measure of the quality of a binary classification. Not very good here - maybe because of the class imbalance?
matthews_corrcoef(y_test, y_pred_knn)
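For reference, the coefficient can be computed directly from the confusion-matrix cells; this sketch should reproduce matthews_corrcoef up to floating-point error:
In [ ]:
# MCC = (TP*TN - FP*FN) / sqrt((TP+FP)(TP+FN)(TN+FP)(TN+FN))
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_knn).ravel().astype(float)
print 'MCC by hand:', (tp * tn - fp * fn) / np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))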
Out[58]:
In [59]:
# Random forest.
rf = RandomForestClassifier(n_jobs = -1)
In [60]:
# Run GridSearch for best parameters.
param_rf = {'n_estimators': np.arange(10, 101, 10), 'criterion': ['gini', 'entropy']}
In [61]:
rf_gs = GridSearchCV(rf, param_grid=param_rf)
In [62]:
rf_gs.fit(features, target) # Note: grid search fit on the full data here, not just X_train.
Out[62]:
In [63]:
rf_gs.grid_scores_
Out[63]:
In [64]:
# Use these parameters in the RF model below.
rf_gs.best_params_
Out[64]:
In [65]:
rf = RandomForestClassifier(n_jobs=-1, n_estimators=30, criterion='entropy')
In [66]:
rf.fit(features, target)
Out[66]:
In [67]:
rf.feature_importances_
Out[67]:
In [68]:
# 10 most important features. Could build a reduced model with just these - see the sketch below.
print 'Most important features:\n', sorted(zip(rf.feature_importances_, bank.columns), reverse = True)[:10]
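A sketch of that reduced model: keep only the top-10 columns by importance and cross-validate a fresh forest on them (an illustration, not a tuned model):
In [ ]:
# Reduced feature matrix from the 10 most important columns.
top10 = [name for score, name in
         sorted(zip(rf.feature_importances_, bank.columns), reverse=True)[:10]]
features_top = bank[top10].as_matrix()
rf_small = RandomForestClassifier(n_jobs=-1, n_estimators=30, criterion='entropy')
print 'CV accuracy, top-10 features:', cross_val_score(rf_small, features_top, target, cv=5).mean()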
In [69]:
# Confusion matrix
y_pred_rf = rf.predict(features)
In [70]:
print 'cf matrix:\n', confusion_matrix(target, y_pred_rf)
In [71]:
# Plot confusion matrix.
plt.matshow(confusion_matrix(target, y_pred_rf))
plt.title('RF Confusion matrix')
plt.colorbar()
plt.ylabel('True')
plt.xlabel('Predicted')
Out[71]:
In [72]:
# Classification report. Note this scores the forest on its own training data, so the numbers are optimistic.
print 'classification report:\n', classification_report(target, y_pred_rf) # Works well for both classes here.
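A fairer read comes from held-out data; a sketch via cross-validation, optionally re-weighting classes to counter the imbalance (the class_weight option assumes a reasonably recent scikit-learn):
In [ ]:
# Held-out estimate instead of scoring the training fit.
print 'RF 5-fold CV accuracy:', cross_val_score(rf, features, target, cv=5).mean()
# Hypothetical re-weighting to counter the class imbalance.
rf_bal = RandomForestClassifier(n_jobs=-1, n_estimators=30, criterion='entropy',
                                class_weight='balanced')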
In [73]:
# Look at probabilities.
bank_y_pred_df_rf = pd.DataFrame(rf.predict_proba(features))
bank_y_pred_df_rf['Predicted'] = y_pred_rf
bank_y_pred_df_rf['True'] = target
# Show only FP/FN examples.
print 'FP, FN examples:\n', bank_y_pred_df_rf[bank_y_pred_df_rf['Predicted'] != bank_y_pred_df_rf['True']].head(20)
print 'Total FP, FN:\n', len(bank_y_pred_df_rf[bank_y_pred_df_rf['Predicted'] != bank_y_pred_df_rf['True']])
In [75]:
# Learning curve. Interestingly, more training samples actually bring the CV score down ...
plot_learning_curve(RandomForestClassifier(n_jobs=-1, n_estimators=30, criterion='entropy'),
'RF', features, target)
Out[75]:
In [76]:
# The Matthews coefficient again. Pretty good - though computed on the training fit, so inflated.
matthews_corrcoef(target, y_pred_rf)
Out[76]:
In [78]:
# Plot visualizing the best params in terms of n_estimators: gini first.
train_scores = []
test_scores = []
ks = range(10, 101, 10)
for i in ks:
    rf_i = RandomForestClassifier(n_estimators = i, criterion = 'gini')
    rf_i.fit(X_train, y_train)
    train_scores.append(rf_i.score(X_train, y_train))
    test_scores.append(rf_i.score(X_test, y_test))
plt.plot(ks, train_scores, label="training scores")
plt.plot(ks, test_scores, label="validation scores")
plt.legend(loc="best")
plt.tight_layout()
Out[78]:
In [79]:
# Plot visualizing the best params in terms of n_estimators: entropy.
train_scores = []
test_scores = []
ks = range(10, 101, 10)
for i in ks:
    rf_i = RandomForestClassifier(n_estimators = i, criterion = 'entropy')
    rf_i.fit(X_train, y_train)
    train_scores.append(rf_i.score(X_train, y_train))
    test_scores.append(rf_i.score(X_test, y_test))
plt.plot(ks, train_scores, label="training scores")
plt.plot(ks, test_scores, label="validation scores")
plt.legend(loc="best")
plt.tight_layout()
Out[79]:
In [ ]: