Get your data here. The data is related to direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact to the same client was required in order to assess whether the product (a bank term deposit) would be subscribed ('yes') or not ('no'). There are four datasets:
1) bank-additional-full.csv with all examples (41188) and 20 inputs, ordered by date (from May 2008 to November 2010)
2) bank-additional.csv with 10% of the examples (4119), randomly selected from 1), and 20 inputs.
3) bank-full.csv with all examples and 17 inputs, ordered by date (older version of this dataset with fewer inputs).
4) bank.csv with 10% of the examples and 17 inputs, randomly selected from 3) (older version of this dataset with fewer inputs).
The smallest datasets are provided to test more computationally demanding machine learning algorithms (e.g., SVM).
The classification goal is to predict whether the client will subscribe to a term deposit (variable y: yes/no).
(Hint: sklearn's LabelEncoder may be useful.)
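The analysis below uses pd.get_dummies for encoding; as a minimal sketch of the hinted alternative, LabelEncoder maps one categorical column to integer codes (the example values here are purely illustrative):
In [ ]:
#hypothetical illustration of LabelEncoder on a single categorical column;
#the notebook below uses pd.get_dummies instead
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit_transform(['married', 'single', 'divorced', 'married'])
#classes are sorted alphabetically, so this returns array([1, 2, 0, 1])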
In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
%matplotlib inline
from sklearn.model_selection import learning_curve, cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
In [7]:
df = pd.read_csv("../data/bank-additional-full.csv", sep = ';')
In [25]:
#encode the target column y as 0/1
df['y'].replace('no', 0, inplace = True)
df['y'].replace('yes', 1, inplace = True)
df['y'].value_counts()
In [29]:
#one-hot encode the categorical columns
dfdummy = pd.get_dummies(df)
In [46]:
x_train, x_test, y_train, y_test = train_test_split(dfdummy.drop('y', axis = 1), dfdummy['y'])
In [41]:
knn = KNeighborsClassifier()
rdf = RandomForestClassifier()
In [48]:
knn.fit(x_train, y_train)
rdf.fit(x_train, y_train)
In [50]:
print(cross_val_score(knn, x_train, y_train).mean())
In [51]:
print(cross_val_score(rdf, x_train, y_train).mean())
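The 'yes' class is a small minority in this dataset, so raw accuracy needs a reference point. A sketch of a majority-class baseline using sklearn's DummyClassifier (not part of the original homework):
In [ ]:
#hypothetical baseline: always predict the majority class ('no')
from sklearn.dummy import DummyClassifier
baseline = DummyClassifier(strategy = 'most_frequent')
print(cross_val_score(baseline, x_train, y_train).mean())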
In [58]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.
    title : string
        Title for the chart.
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.
    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum y values plotted.
    cv : integer, cross-validation generator, optional
        If an integer is passed, it is the number of folds.
        Specific cross-validation objects can be passed; see the
        sklearn.model_selection module for the list of possible objects.
    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt
In [75]:
%%time
#knn learning curve
num_neighbors = range(1, 50, 5)
knn_results = []
for x in num_neighbors:
    knn = KNeighborsClassifier(n_neighbors = x)
    plot_learning_curve(knn, "KNN, n_neighbors = %d" % x, x_train, y_train)
    knn_results.append([cross_val_score(knn, x_train, y_train).mean(), x, knn])
In [77]:
%%time
num_estimators = range(1, 100, 5)
rdf_results = []
for x in num_estimators:
    rdf = RandomForestClassifier(n_estimators = x)
    plot_learning_curve(rdf, "Random forest, n_estimators = %d" % x, x_train, y_train)
    rdf_results.append([cross_val_score(rdf, x_train, y_train).mean(), x, rdf])
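As an aside, sklearn's validation_curve handles this sweep-one-hyperparameter pattern directly; a minimal sketch for the forest (the parameter range here is illustrative, not the one used above):
In [ ]:
#hypothetical: sweep n_estimators with validation_curve instead of a manual loop
from sklearn.model_selection import validation_curve
train_scores, valid_scores = validation_curve(
    RandomForestClassifier(), x_train, y_train,
    param_name = "n_estimators", param_range = [11, 51, 91], cv = 3)
print(valid_scores.mean(axis = 1))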
In [87]:
#find the best classifier for knn and rdf by getting the one with the highest score
knn_results.sort(key = lambda r: r[0], reverse = True)  #sort by mean CV score, best first
best_knn = knn_results[0][2]
In [91]:
best_knn.fit(x_train, y_train)
y_pred = best_knn.predict(x_test)
confusion_matrix(y_test, y_pred)
In [92]:
print(classification_report(y_test, y_pred))
In [93]:
#do it again for rdf
rdf_results.sort(key = lambda r: r[0], reverse = True)  #sort by mean CV score, best first
best_rdf = rdf_results[0][2]
In [94]:
best_rdf.fit(x_train, y_train)
y_pred = best_rdf.predict(x_test)
confusion_matrix(y_test, y_pred)
In [95]:
print(classification_report(y_test, y_pred))
In [113]:
#I would use these importances to weight or prune the features.
#Since the forest was fit on x_train, which came from dfdummy.drop('y', axis = 1),
#feature_importances_ lines up one-to-one with those columns, so zipping them together is safe.
sorted(zip(best_rdf.feature_importances_, dfdummy.drop('y', axis = 1).columns.values), reverse = True)
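A labeled pandas Series shows the same pairing in a more readable form (an equivalent view of the zip above, not a different result):
In [ ]:
#same information as the zip above, as a labeled Series sorted by importance
importances = pd.Series(best_rdf.feature_importances_,
                        index = dfdummy.drop('y', axis = 1).columns)
importances.sort_values(ascending = False).head(10)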
In [124]:
#this doesn't impress me. Can I do better?
#Everything above satisfies the homework, the below is my testing
In [119]:
%%time
#is multicore faster? if yes, how much?
randtree = RandomForestClassifier(n_estimators = 91, n_jobs=-1)
rtree = RandomForestClassifier(n_estimators = 91)
In [122]:
%%time
randtree.fit(x_train, y_train)
In [123]:
%%time
rtree.fit(x_train, y_train)
In [125]:
#success. now for grid search
from sklearn.model_selection import GridSearchCV
In [140]:
param_grid = {
    "n_estimators": [291, 391, 491],
    "max_depth": [30, 50, 70, None],
}
gs = GridSearchCV(RandomForestClassifier(n_jobs = -1), param_grid)
In [142]:
%%time
gs.fit(x_train, y_train)
print(gs.best_estimator_)
print(cross_val_score(gs.best_estimator_, x_train, y_train).mean())
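GridSearchCV already records the winning parameter combination and its mean cross-validated score, so a separate cross_val_score call isn't strictly needed; a small sketch:
In [ ]:
#best parameter combination and its cross-validated score from the search itself
print(gs.best_params_)
print(gs.best_score_)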
In [143]:
#ok this isn't doing great. let's nuke some of those low-importance features
features = sorted(zip(best_rdf.feature_importances_, dfdummy.drop('y', axis = 1).columns.values), reverse = True)
In [144]:
to_remove = [x[1] for x in features if x[0] < .01]
In [145]:
print(to_remove)
In [151]:
df_simple = dfdummy.drop(to_remove, axis = 1)
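sklearn's SelectFromModel does the same kind of thresholded pruning; a sketch assuming the already-fitted best_rdf (the manual drop above remains the approach actually used here):
In [ ]:
#hypothetical equivalent of the manual importance threshold, via SelectFromModel
from sklearn.feature_selection import SelectFromModel
selector = SelectFromModel(best_rdf, threshold = 0.01, prefit = True)
x_train_reduced = selector.transform(x_train)
print(x_train_reduced.shape)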
In [152]:
x_train2, x_test2, y_train2, y_test2 = train_test_split(df_simple.drop('y', axis = 1), df_simple['y'])
In [153]:
simple_rfc = RandomForestClassifier(n_estimators = 291, n_jobs = -1)
In [154]:
simple_rfc.fit(x_train2, y_train2)
In [155]:
cross_val_score(simple_rfc, x_train2, y_train2).mean()
In [161]:
#Ok let's try that again but with even fewer features
to_remove = [x[1] for x in features if x[0] < .02]
df_simple = dfdummy.drop(to_remove, axis = 1)
x_train2, x_test2, y_train2, y_test2 = train_test_split(df_simple.drop('y', axis = 1), df_simple['y'])
simple_rfc = RandomForestClassifier(n_estimators = 291, n_jobs = -1)
simple_rfc.fit(x_train2, y_train2)
cross_val_score(simple_rfc, x_train2, y_train2).mean()
In [162]:
param_grid = {
    "n_estimators": [291, 391, 491],
    "max_features": ['sqrt', 'log2', None],
}
gs = GridSearchCV(RandomForestClassifier(n_jobs = -1), param_grid)
In [163]:
gs.fit(x_train,y_train)
In [166]:
gs.best_estimator_.score(x_test, y_test)
In [167]:
#can't beat that .91 wall. let's try gradient boosting
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
In [170]:
%%time
gbc.fit(x_train, y_train)
In [172]:
%%time
cross_val_score(gbc, x_train, y_train)
In [174]:
from sklearn.svm import SVC
In [175]:
linsvm = SVC()  #note: despite the name, SVC defaults to an RBF kernel, not a linear one
In [176]:
linsvm.fit(x_train, y_train)
In [177]:
cross_val_score(linsvm, x_train, y_train)
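SVMs are sensitive to feature scale, and the dummy-encoded frame mixes 0/1 indicators with unscaled numeric columns; a sketch of a scaled pipeline that might behave better (an assumption, not a result from this run):
In [ ]:
#hypothetical: standardize features before the SVM via a Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
scaled_svm = make_pipeline(StandardScaler(), SVC())
cross_val_score(scaled_svm, x_train, y_train)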