Get your data here. The data is related to direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact with the same client was required in order to assess whether the product (a bank term deposit) would be subscribed ('yes') or not ('no'). There are four datasets:
1) bank-additional-full.csv with all examples (41188) and 20 inputs, ordered by date (from May 2008 to November 2010).
2) bank-additional.csv with 10% of the examples (4119), randomly selected from 1), and 20 inputs.
3) bank-full.csv with all examples and 17 inputs, ordered by date (an older version of this dataset with fewer inputs).
4) bank.csv with 10% of the examples and 17 inputs, randomly selected from 3) (an older version of this dataset with fewer inputs).
The smaller datasets are provided for testing the more computationally demanding machine learning algorithms (e.g., SVM).
The classification goal is to predict whether the client will subscribe to a term deposit (variable y: yes/no).
In [2]:
# Download and extract the data (the CSVs are read below from the extracted
# bank-additional/ directory, so the zips need unpacking first):
! wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip"
! wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip"
! unzip -o bank.zip
! unzip -o bank-additional.zip
In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split  # pre-0.18 sklearn API; now in sklearn.model_selection
%matplotlib inline
In [2]:
ldata = pd.read_csv("./bank-additional/bank-additional-full.csv", sep=';') # ldata means large data
sdata = pd.read_csv("./bank-additional/bank-additional.csv", sep=';') # sdata means... yup, small.
In [3]:
sdata.head()
Out[3]:
In [4]:
sdata.describe()
Out[4]:
In [5]:
test = sdata.copy()
test.drop('y', axis=1)  # NB: drop returns a new frame; `test` itself is unchanged
Out[5]:
In [6]:
# 1. Preprocessing!
# Instead of label encoding, I'm just going to use get_dummies.
# Then we'll do some scaling of the numerical (non-categorical) values.
def dummies_and_scale(df):
    # separate the target and map yes/no to 1/0 (without mutating the input frame)
    target = df['y'].replace({'no': 0, 'yes': 1})
    df = df.drop('y', axis=1)
    # scale the numerical values; describe() only covers the numeric columns
    numeric_cols = df.describe().columns
    df[numeric_cols] = pd.DataFrame(preprocessing.scale(df[numeric_cols]),
                                    index=df.index, columns=numeric_cols)
    # one-hot encode what's left (the categorical columns)
    dummies = pd.get_dummies(df)
    return dummies, target
In [7]:
data, target = dummies_and_scale(sdata)
In [8]:
target.value_counts()
Out[8]:
In [9]:
sdata['y'].value_counts()
Out[9]:
In [10]:
sdata.describe()
Out[10]:
In [11]:
data.describe()
Out[11]:
In [12]:
# Just double checking: for the scaled columns, the mean is basically 0 and the
# std is close to 1 (except for the dummies obviously, which aren't standardized).
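In [ ]:
# A quick programmatic version of that check -- a sketch, assuming the numeric
# column names from sdata.describe() still identify the scaled columns in `data`:
numeric_cols = sdata.describe().columns
data[numeric_cols].describe().loc[['mean', 'std']]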
In [13]:
from sklearn.learning_curve import learning_curve  # pre-0.18 sklearn API
from sklearn import cross_validation
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 10)):
    """
    Generate a simple plot of the test and training learning curve.
    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.
    title : string
        Title for the chart.
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.
    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum y values plotted.
    cv : integer, cross-validation generator, optional
        If an integer is passed, it is the number of folds (defaults to 3).
        Specific cross-validation objects can be passed; see the
        sklearn.cross_validation module for the list of possible objects.
    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt
In [14]:
# Above from: http://scikit-learn.org/stable/auto_examples/plot_learning_curve.html
In [15]:
cv = cross_validation.ShuffleSplit(data.shape[0], n_iter=2,
test_size=0.2, random_state=0)
estimator=KNeighborsClassifier()
plot_learning_curve(estimator, "KNN", data, target, cv=cv, n_jobs=4)
plt.show()
In [16]:
cv = cross_validation.ShuffleSplit(data.shape[0], n_iter=2,
test_size=0.2, random_state=0)
estimator=RandomForestClassifier()
plot_learning_curve(estimator, "Random Forest", data, target, cv=cv, n_jobs=4)
plt.show()
In [17]:
# Let's try again with a different cross validation value:
cv = 2
estimator=KNeighborsClassifier()
plot_learning_curve(estimator, "KNN", data, target, cv=cv, n_jobs=4)
plt.show()
In [20]:
# That is looking better! I'll try the random forest too...
cv = 2
estimator=RandomForestClassifier()
plot_learning_curve(estimator, "Random Forest", data, target, cv=cv)
plt.show()
In [21]:
# OK, after running the larger set as well, it looks like a few features may be dominating the model.
In [22]:
from sklearn.cross_validation import train_test_split
In [25]:
# Peek at the signature first (IPython's ? help):
train_test_split?
In [26]:
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=.2)
In [27]:
estimator = RandomForestClassifier()
estimator.fit(x_train, y_train)
y_pred = estimator.predict(x_test)
In [28]:
from sklearn.metrics import confusion_matrix
In [29]:
# NB: sklearn's convention is confusion_matrix(y_true, y_pred); with the
# arguments reversed like this, the matrix comes out transposed.
confusion_matrix(y_pred, y_test)
Out[29]:
In [31]:
estimator.feature_importances_.max()
Out[31]:
In [32]:
estimator.feature_importances_.sum()
Out[32]:
In [ ]:
# So for the random forest, the most important single feature carries only ~23% of the total importance.
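In [ ]:
# A sketch of naming that feature: rank the importances against the columns of
# the frame the forest was actually fit on (`data`, the dummy-encoded inputs),
# so names and values line up.
importances = sorted(zip(estimator.feature_importances_, data.columns), reverse=True)
importances[:10]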
In [33]:
# Let's try the KNN too
estimator = KNeighborsClassifier()
estimator.fit(x_train, y_train)
y_pred = estimator.predict(x_test)
confusion_matrix(y_pred, y_test)
Out[33]:
In [36]:
# So the random forest is better, but only slightly.
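In [ ]:
# To put a number on "only slightly" -- a sketch comparing the two models on the
# same held-out split, with F1 as well as accuracy since the classes are imbalanced:
from sklearn.metrics import accuracy_score, f1_score
for clf in [RandomForestClassifier(), KNeighborsClassifier()]:
    pred = clf.fit(x_train, y_train).predict(x_test)
    print("%s: acc=%.3f, f1=%.3f" % (clf.__class__.__name__,
                                     accuracy_score(y_test, pred),
                                     f1_score(y_test, pred)))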
In [38]:
from sklearn.grid_search import GridSearchCV as gs
rf = RandomForestClassifier()
rf_parameters = {'n_estimators': range(10, 50, 10)}
In [40]:
rf.fit(x_train, y_train)
rfg = gs(rf, rf_parameters)
rfg.fit(x_train, y_train)
Out[40]:
In [41]:
rfg.best_params_
Out[41]:
In [42]:
rf = RandomForestClassifier()
rf_parameters = {'n_estimators': range(25, 50, 1)}
rf.fit(x_train, y_train)
rfg = gs(rf, rf_parameters)
rfg.fit(x_train, y_train)
rfg.best_params_
Out[42]:
In [43]:
rfg.best_score_
Out[43]:
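In [ ]:
# best_score_ is the cross-validated score on the training split; a sketch of
# checking the tuned model against the held-out test split instead:
rfg.best_estimator_.score(x_test, y_test)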
In [44]:
# That seems too good? Note the classes are heavily imbalanced (most clients
# don't subscribe), so a high plain-accuracy score isn't saying much by itself.
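In [ ]:
# Sanity check -- the majority-class baseline (a sketch): always predicting the
# most common class already scores this accuracy, so any model should beat it.
1.0 * target.value_counts().max() / len(target)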
In [45]:
from sklearn.metrics import classification_report
In [46]:
classification_report?
In [47]:
classification_report(y_test, y_pred)
Out[47]:
In [48]:
import pprint
In [49]:
print(classification_report(y_test, y_pred))
In [50]:
rf = RandomForestClassifier(n_estimators=32).fit(x_train, y_train)
y_pred = rf.predict(x_test)
print(confusion_matrix(y_test, y_pred))
In [51]:
print classification_report(y_test, y_pred)
In [56]:
# NB: rf was trained on the dummy-encoded `data`, so feature_importances_ lines
# up with data.columns, not sdata.columns -- this zip silently truncates to the
# 21 original names and mislabels the features. Kept as originally run.
features = list(zip(rf.feature_importances_, sdata.columns))
In [61]:
features.sort(reverse=True)
In [62]:
features
Out[62]:
In [69]:
seven = [
(0.23787866999090784, 'job'),
(0.092757495516714999, 'month'),
(0.074543934592155192, 'age'),
(0.058628621282640972, 'day_of_week'),
(0.032383812326831657, 'marital'),
(0.03207739641329347, 'contact'),
(0.030651974427814175, 'education')
]
In [72]:
f7 = [i for i,j in seven]
In [74]:
sum(f7)
Out[74]:
In [ ]:
# So the top 7 cover about 56% of the importances...
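In [ ]:
# A sketch of the cumulative view: sort the importances and cumsum them, which
# makes picking a cutoff easier than eyeballing individual values.
sorted_imps = np.sort(rf.feature_importances_)[::-1]
np.cumsum(sorted_imps)[:10]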
In [1]:
ten = [
(0.23787866999090784, 'job'),
(0.092757495516714999, 'month'),
(0.074543934592155192, 'age'),
(0.058628621282640972, 'day_of_week'),
(0.032383812326831657, 'marital'),
(0.03207739641329347, 'contact'),
(0.030651974427814175, 'education'),
(0.028920355419687741, 'housing'),
(0.027574574885258604, 'loan'),
(0.015489395304804142, 'default'),
(0.014177318492882254, 'duration')
]
f10 = [i for i,j in ten]
sum(f10)
Out[1]:
In [3]:
# So these ten (well, eleven) features add another ~10% or so... it seems hard to know how many I'd actually want to drop.
all_features = [i for i,j in [(0.23787866999090784, 'job'),
(0.092757495516714999, 'month'),
(0.074543934592155192, 'age'),
(0.058628621282640972, 'day_of_week'),
(0.032383812326831657, 'marital'),
(0.03207739641329347, 'contact'),
(0.030651974427814175, 'education'),
(0.028920355419687741, 'housing'),
(0.027574574885258604, 'loan'),
(0.015489395304804142, 'default'),
(0.014177318492882254, 'duration'),
(0.0098731345417975187, 'nr.employed'),
(0.0082567916457971686, 'campaign'),
(0.0072101629135299236, 'emp.var.rate'),
(0.0070700707499528944, 'poutcome'),
(0.0054013289400914599, 'cons.conf.idx'),
(0.0043193754372234001, 'y'),
(0.0037179746568354031, 'euribor3m'),
(0.0033291093715349398, 'cons.price.idx'),
(0.0030211469603017029, 'pdays'),
(0.00207814438331417, 'previous')]]
sum(all_features)
Out[3]:
In [4]:
# So all 21 listed pairs sum to only ~.699 -- consistent with the truncated zip
# noted above: the importances total 1.0 across all the dummy columns, and the
# remaining mass sits in columns beyond the first 21.
In [75]:
# How might we use certain features? It seems that if we can reduce the complexity of the model, it would be good.
# Yet, all those little features add up...
# One idea is to do more grid search on the variables that matter most,
# then see how close you can get to the whole feature list
# Basically, if we can simplify the model by getting rid of irrelevant features, that is ideal.
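In [ ]:
# One way to act on that idea -- a sketch, assuming sklearn >= 0.17 for
# SelectFromModel: keep only the features above mean importance, then re-score
# a forest on the reduced matrix to see what the pruning costs.
from sklearn.feature_selection import SelectFromModel
selector = SelectFromModel(RandomForestClassifier(n_estimators=32), threshold='mean')
x_train_small = selector.fit_transform(x_train, y_train)
x_test_small = selector.transform(x_test)
print(x_train_small.shape)  # how many features survived the cut
small_rf = RandomForestClassifier(n_estimators=32).fit(x_train_small, y_train)
print(classification_report(y_test, small_rf.predict(x_test_small)))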
In [ ]: