Get your data here. The data is related to direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact with the same client was required in order to assess whether the product (bank term deposit) would be subscribed ('yes') or not ('no'). There are four datasets:
1) bank-additional-full.csv with all examples (41188) and 20 inputs, ordered by date (from May 2008 to November 2010)
2) bank-additional.csv with 10% of the examples (4119), randomly selected from 1), and 20 inputs.
3) bank-full.csv with all examples and 17 inputs, ordered by date (older version of this dataset with fewer inputs).
4) bank.csv with 10% of the examples and 17 inputs, randomly selected from 3) (older version of this dataset with fewer inputs).
The smallest datasets are provided to test more computationally demanding machine learning algorithms (e.g., SVM).
The classification goal is to predict whether the client will subscribe (yes/no) to a term deposit (variable y).
(LabelEncoder is useful here.)
In [48]:
## Life happened real hard this week so apologies for missing pieces and messes
In [109]:
#import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn import neighbors
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn import preprocessing
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn import svm, grid_search, datasets
%matplotlib inline
In [83]:
# Older dataset version: all examples, 17 inputs, ';'-separated.
bank_full_df = pd.read_csv('bank/bank-full.csv', sep=';')
In [84]:
# Older dataset version: 10% sample, 17 inputs, ';'-separated.
bank_df = pd.read_csv('bank/bank.csv', sep=';')
In [85]:
# Newer dataset version: all 41188 examples, 20 inputs, ';'-separated.
bank_additional_full_df = pd.read_csv(
    'bank-additional/bank-additional-full.csv', sep=';')
In [86]:
# Newer dataset version: 10% sample (4119 examples), 20 inputs, ';'-separated.
bank_additional_df = pd.read_csv(
    'bank-additional/bank-additional.csv', sep=';')
In [87]:
# Columns holding string categories (the target 'y' included); these are the
# columns that get label-encoded.
cat_columns = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
    'day_of_week',
    'poutcome',
    'y',
]
In [88]:
# Integer-encode every categorical column of bank_additional_full_df in place.
# A fresh encoder is fitted per column and kept in `label_encoders`, so the
# integer codes can later be mapped back to the original labels with
# `label_encoders[col].inverse_transform(...)`.  Re-fitting one shared encoder
# for every column would retain only the mapping of the last column.
# After the loop `le` is still bound to the encoder of the last column.
label_encoders = {}
for col in cat_columns:
    le = preprocessing.LabelEncoder()
    bank_additional_full_df[col] = le.fit_transform(bank_additional_full_df[col])
    label_encoders[col] = le
In [89]:
# Show the first five rows to eyeball the encoded frame.
bank_additional_full_df.head(n=5)
Out[89]:
In [90]:
%%time
from sklearn.ensemble import RandomForestClassifier as RF
# Random Forest
def preprocess(df):
    """Fit a 50-tree random forest on a 70/30 split of `df` and print test metrics.

    Parameters
    ----------
    df : DataFrame containing the target column 'y'; every other column is
        used as a feature.

    Prints, in order: the shape of the predictions, the estimator's `score`
    on the held-out rows, the confusion matrix and the classification
    report.  Returns None.
    """
    # Split the frame into target and features.
    target = df.y.values
    features = df.drop('y', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=.3)

    # Train the forest, then predict the held-out 30%.
    forest_model = RF(n_estimators=50)
    forest_model.fit(X_train, y_train)
    y_pred = forest_model.predict(X_test)

    # Report.
    print(y_pred.shape)
    print(forest_model.score(X_test, y_test))
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))


preprocess(bank_additional_full_df)
In [91]:
%%time
def preprocess(df):
    """Fit a 5-nearest-neighbours classifier on a 70/30 split of `df` and print test metrics.

    Parameters
    ----------
    df : DataFrame containing the target column 'y'; every other column is
        used as a feature.

    Prints, in order: the shape of the predictions, the estimator's `score`
    on the held-out rows, the confusion matrix and the classification
    report.  Returns None.
    """
    # Split the frame into target and features.
    target = df.y.values
    features = df.drop('y', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=.3)

    # Train the kd-tree backed KNN, then predict the held-out 30%.
    knn_model = KNN(n_neighbors=5, algorithm="kd_tree")
    knn_model.fit(X_train, y_train)
    y_pred = knn_model.predict(X_test)

    # Report.
    print(y_pred.shape)
    print(knn_model.score(X_test, y_test))
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))


preprocess(bank_additional_full_df)
In [92]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.learning_curve import learning_curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : integer, cross-validation generator, optional
        If an integer is passed, it is the number of folds (defaults to 3).
        Specific cross-validation objects can be passed, see
        sklearn.cross_validation module for the list of possible objects

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).

    train_sizes : array-like, optional
        Training-set sizes forwarded to `learning_curve`
        (default: five evenly spaced values from 0.1 to 1.0).

    Returns
    -------
    The `matplotlib.pyplot` module, with the new figure as current figure.
    """
    # Figure scaffolding.
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # Per training size: mean and standard deviation across the folds,
    # bundled with the colour and legend label of each curve.
    curves = [
        (np.mean(scores, axis=1), np.std(scores, axis=1), colour, label)
        for scores, colour, label in (
            (fit_scores, "r", "Training score"),
            (cv_scores, "g", "Cross-validation score"),
        )
    ]

    plt.grid()
    # Shaded +/- one standard deviation bands first, then the mean lines,
    # so the artists are added in the same order as before.
    for mean, std, colour, _label in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1,
                         color=colour)
    for mean, _std, colour, label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt
In [93]:
%%time
def training_examples(df):
    """Draw learning curves for random forests of 10, 20 and 50 trees.

    `df` must contain the target column 'y'; all other columns are features.
    Only the training part (70%) of a 70/30 split is handed to
    `plot_learning_curve`.  Each figure is titled 'test'.  Returns None.
    """
    features = df.drop('y', axis=1)
    target = df.y.values
    # The held-out portions of the split are not needed here.
    X_train, _, y_train, _ = train_test_split(features, target, test_size=.3)

    # learning curves, one figure per forest size
    for n_trees in (10, 20, 50):
        plot_learning_curve(RF(n_estimators=n_trees), 'test', X_train, y_train)


training_examples(bank_additional_full_df)
In [100]:
# Rank the features by random-forest importance and keep the ten highest.
# The training split and the feature-name list are built here because
# X_train, y_train and xcols are otherwise only locals of the functions in
# this file, so this cell would fail with a NameError when run on its own.
xcols = [col for col in bank_additional_full_df.columns if col != 'y']
X = bank_additional_full_df[xcols]
y = bank_additional_full_df['y'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)

clf = RF(n_estimators=20)
clf.fit(X_train, y_train)
# (importance, column name) pairs, largest importance first.
imp = sorted(zip(clf.feature_importances_, xcols), reverse=True)[:10]
imp
Out[100]:
In [102]:
# Column names of the ranked features, in the same order as `imp`
# (each entry of `imp` is an (importance, name) pair).
X_import = [feature for _importance, feature in imp]
X_import
Out[102]:
In [107]:
#random forest with top 10
def forest(df):
    """Grid-search a random forest on the `X_import` columns of `df` and print test metrics.

    Parameters
    ----------
    df : DataFrame containing the target column 'y' and every column named
        in the module-level list `X_import`.

    Prints the confusion matrix and the classification report for the
    held-out 30% of a 70/30 split.  Returns None.
    """
    # feature - target (only the pre-selected columns are used as features)
    X = df[X_import].values
    y = df['y'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)

    # run
    # NOTE(review): each list holds the exact candidate values, so the search
    # tries only 1 and 100 trees and depths 1 and 1000 - confirm whether a
    # wider range of values was intended.
    parameters = {'n_estimators': [1, 100], 'max_depth': [1, 1000]}
    clf = grid_search.GridSearchCV(RF(), parameters)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # results
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))


# `bank_data` is not defined anywhere in this file; the label-encoded frame
# is the one that carries the numeric columns this function needs.
forest(bank_additional_full_df)
In [110]:
#knn with top 10
def knn(df):
    """Grid-search a k-nearest-neighbours classifier on `df` and print test metrics.

    Parameters
    ----------
    df : DataFrame containing the target column 'y'; every other column is
        used as a feature.

    Prints, in order: the shape of the predictions, the search object's
    `score` on the held-out 30%, the confusion matrix and the classification
    report.  Returns None.
    """
    # NOTE(review): the cell title says "top 10", but all columns except 'y'
    # are used here - confirm whether `X_import` should restrict the features.
    X = df.drop('y', axis=1)
    y = df.y.values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)

    # NOTE(review): the lists are exact candidate values (e.g. k is 1 or 100),
    # not ranges - confirm this is the intended grid.
    parameters = {
        'n_neighbors': [1, 100],
        'weights': ('distance', 'uniform'),
        'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'),
        'leaf_size': [1, 100],
    }
    # Named `estimator` so the enclosing function's own name is not shadowed.
    estimator = neighbors.KNeighborsClassifier()
    clf = grid_search.GridSearchCV(estimator, parameters)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print(y_pred.shape)
    print(clf.score(X_test, y_test))
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))


# `bank_data` is not defined anywhere in this file; the label-encoded frame
# is the one that carries the numeric columns this function needs.
knn(bank_additional_full_df)
In [94]:
%%time
def plot_KNN_recall(df):
    """Plot test-set recall of a kd-tree KNN classifier for k = 1 .. 49.

    `df` must contain a 'y' column holding "yes"/"no"; the remaining columns
    are one-hot encoded with `pd.get_dummies`.  A single 70/30 split is
    reused for every k.  Returns None.
    """
    features = pd.get_dummies(df.drop('y', axis=1))

    # Turn the "yes"/"no" target into integer labels.
    encoder = preprocessing.LabelEncoder()
    encoder.fit(["yes", "no"])
    target = encoder.transform(df.y.values)

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=.3)

    def recall_for(k):
        # Fit with k neighbours and score recall on the held-out rows.
        model = KNN(n_neighbors=k, algorithm="kd_tree")
        model.fit(X_train, y_train)
        return metrics.recall_score(y_test, model.predict(X_test))

    re_scores = [recall_for(k) for k in range(1, 50)]
    pd.DataFrame(re_scores).plot()


plot_KNN_recall(bank_additional_df)
In [95]:
%%time
def plot_KNN_recall(df):
    """Plot test-set mean squared error of a kd-tree KNN classifier for k = 1 .. 49.

    Despite its name, this version records `metrics.mean_squared_error`, not
    recall.  `df` must contain a 'y' column holding "yes"/"no"; the remaining
    columns are one-hot encoded with `pd.get_dummies`.  A single 70/30 split
    is reused for every k.  Returns None.
    """
    features = pd.get_dummies(df.drop('y', axis=1))

    # Turn the "yes"/"no" target into integer labels.
    encoder = preprocessing.LabelEncoder()
    encoder.fit(["yes", "no"])
    target = encoder.transform(df.y.values)

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=.3)

    def error_for(k):
        # Fit with k neighbours and score the error on the held-out rows.
        model = KNN(n_neighbors=k, algorithm="kd_tree")
        model.fit(X_train, y_train)
        return metrics.mean_squared_error(y_test, model.predict(X_test))

    re_scores = [error_for(k) for k in range(1, 50)]
    pd.DataFrame(re_scores).plot()


plot_KNN_recall(bank_additional_df)
In [96]:
%%time
def plot_KNN_precision(df):
    """Plot test-set precision of a kd-tree KNN classifier for k = 1 .. 49.

    `df` must contain a 'y' column holding "yes"/"no"; the remaining columns
    are one-hot encoded with `pd.get_dummies`.  A single 70/30 split is
    reused for every k.  Returns None.
    """
    features = pd.get_dummies(df.drop('y', axis=1))

    # Turn the "yes"/"no" target into integer labels.
    encoder = preprocessing.LabelEncoder()
    encoder.fit(["yes", "no"])
    target = encoder.transform(df.y.values)

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=.3)

    def precision_for(k):
        # Fit with k neighbours and score precision on the held-out rows.
        model = KNN(n_neighbors=k, algorithm="kd_tree")
        model.fit(X_train, y_train)
        return metrics.precision_score(y_test, model.predict(X_test))

    pre_scores = [precision_for(k) for k in range(1, 50)]
    pd.DataFrame(pre_scores).plot()


plot_KNN_precision(bank_additional_df)
In [97]:
%%time
def plot_KNN_recall(df):
    """Plot the held-out mean squared error of kd-tree KNN models, k = 1 .. 49.

    The name says recall, but the metric collected here is
    `metrics.mean_squared_error`.  `df` needs a "yes"/"no" column 'y'; all
    other columns go through `pd.get_dummies`.  One 70/30 split serves every
    value of k.  Returns None.
    """
    X = pd.get_dummies(df.drop('y', axis=1))

    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(["yes", "no"])
    y = label_encoder.transform(df.y.values)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)

    re_scores = []
    for k in range(1, 50):
        classifier = KNN(n_neighbors=k, algorithm="kd_tree")
        classifier.fit(X_train, y_train)
        predictions = classifier.predict(X_test)
        re_scores.append(metrics.mean_squared_error(y_test, predictions))

    pd.DataFrame(re_scores).plot()


plot_KNN_recall(bank_additional_df)
In [98]:
%%time
def plot_KNN_precision(df):
    """Print the classification report of a 42-neighbour kd-tree KNN classifier.

    Despite its name, this version draws no plot: it fits a single model with
    n_neighbors=42 on the training part of a 70/30 split and prints
    `classification_report` for the held-out rows.

    Parameters
    ----------
    df : DataFrame with a 'y' column holding "yes"/"no"; the remaining
        columns are one-hot encoded with `pd.get_dummies`.

    Returns None.
    """
    X = pd.get_dummies(df.drop('y', axis=1))

    # Turn the "yes"/"no" target into integer labels.
    le = preprocessing.LabelEncoder()
    le.fit(["yes", "no"])
    y = le.transform(df.y.values)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)

    clf = KNN(n_neighbors=42, algorithm="kd_tree")
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))


plot_KNN_precision(bank_additional_df)
In [99]:
%%time
def plot_KNN_recall(df):
    """Chart mean squared error on the test split for kd-tree KNN with k from 1 to 49.

    Note that `metrics.mean_squared_error` is what gets recorded, although
    the function is named after recall.  `df` must provide a "yes"/"no"
    target column 'y'; every other column is expanded by `pd.get_dummies`.
    The same 70/30 split is used for all k.  Returns None.
    """
    dummies = pd.get_dummies(df.drop('y', axis=1))

    yes_no = preprocessing.LabelEncoder()
    yes_no.fit(["yes", "no"])
    labels = yes_no.transform(df.y.values)

    X_train, X_test, y_train, y_test = train_test_split(
        dummies, labels, test_size=.3)

    def fit_and_score(k):
        # One model per neighbour count, evaluated on the held-out rows.
        knn_model = KNN(n_neighbors=k, algorithm="kd_tree")
        knn_model.fit(X_train, y_train)
        y_pred = knn_model.predict(X_test)
        return metrics.mean_squared_error(y_test, y_pred)

    re_scores = [fit_and_score(k) for k in range(1, 50)]
    pd.DataFrame(re_scores).plot()


plot_KNN_recall(bank_additional_df)
In [71]:
In [ ]: