Get your data here. The data relates to direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact with the same client was required in order to assess whether the product (a bank term deposit) would be subscribed ('yes') or not ('no'). There are four datasets:
1) bank-additional-full.csv with all examples (41188) and 20 inputs, ordered by date (from May 2008 to November 2010)
2) bank-additional.csv with 10% of the examples (4119), randomly selected from 1), and 20 inputs.
3) bank-full.csv with all examples and 17 inputs, ordered by date (older version of this dataset with fewer inputs).
4) bank.csv with 10% of the examples and 17 inputs, randomly selected from 3) (older version of this dataset with fewer inputs).
The smaller datasets are provided for testing more computationally demanding machine learning algorithms (e.g., SVM; see the sketch below).
The classification goal is to predict whether the client will subscribe to a term deposit (variable y: yes/no).
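Since the 10% sample exists specifically so that expensive models stay tractable, here is a minimal sketch of fitting an RBF-kernel SVM to bank-additional.csv. The file path and the quick pd.get_dummies encoding are assumptions for illustration; the rest of this notebook uses LabelEncoder instead.
In [ ]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.cross_validation import train_test_split

# Load the 10% sample and one-hot encode the string columns
small = pd.read_csv('../hw3/bank-additional.csv', delimiter=';')
x_small = pd.get_dummies(small.drop('y', axis=1))
y_small = (small.y == 'yes').astype(int)
x_tr, x_te, y_tr, y_te = train_test_split(x_small, y_small, test_size=0.2)
# SVC training scales poorly with the number of rows, which is why the
# full 41188-row file would be slow here
print SVC().fit(x_tr, y_tr).score(x_te, y_te)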
In [2]:
# Standard imports for data analysis packages in Python
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# This enables inline plots
%matplotlib inline
# Limit the number of rows displayed when printing a DataFrame
pd.set_option('display.max_rows', 10)
In [3]:
# Create data frames from the bank marketing datasets
bank_additional = pd.read_csv('../hw3/bank-additional.csv', delimiter=";", header=0)
bank_additional_full = pd.read_csv('../hw3/bank-additional-full.csv', delimiter=";", header=0)
In [4]:
# Check the first few rows of the data
print bank_additional_full.head()
In [5]:
# Check for missing values
print bank_additional_full.info()
In [6]:
# There are no null values in the datasets, so no imputation is needed
# (but note the 'unknown' category check below)
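One caveat: in this dataset the categorical columns encode missing data as the string 'unknown' rather than as nulls, so .info() alone does not reveal them. A quick check:
In [ ]:
# Count 'unknown' entries in each string column (this dataset's
# convention for missing categorical values)
for col in bank_additional_full.columns:
    if bank_additional_full[col].dtype == 'object':
        n_unknown = (bank_additional_full[col] == 'unknown').sum()
        if n_unknown > 0:
            print col, n_unknown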
In [7]:
# Check the values in the target variable (y)
print bank_additional_full.y.value_counts()
In [8]:
# Import the preprocessing module, which contains LabelEncoder
from sklearn import preprocessing
In [9]:
# Create the feature dataset, using LabelEncoder to turn string
# columns into integer codes
cols = list(bank_additional_full.columns)
cols.remove('y')
x = pd.DataFrame()
for col in cols:
    if bank_additional_full[col].dtype != 'object':
        x[col] = bank_additional_full[col]
    else:
        le = preprocessing.LabelEncoder()
        x[col] = le.fit_transform(bank_additional_full[col])
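LabelEncoder maps categories to arbitrary integers, which distance-based models such as KNN can misinterpret as an ordering. A hedged alternative, not used in the rest of this notebook, is one-hot encoding:
In [ ]:
# pd.get_dummies one-hot encodes the object columns and passes the
# numeric columns through unchanged
x_onehot = pd.get_dummies(bank_additional_full[cols])
print x_onehot.shape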
In [10]:
# Create the target vector, encoding the y column with a fresh LabelEncoder
le = preprocessing.LabelEncoder()
y = le.fit_transform(bank_additional_full.y)
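LabelEncoder sorts the classes alphabetically, so this encodes 'no' as 0 and 'yes' as 1; worth confirming before interpreting precision and recall later:
In [ ]:
# Confirm the class-to-integer mapping
print le.classes_  # expected: ['no' 'yes'], i.e. 0 = no, 1 = yes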
In [11]:
# Split the data into test and training
from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
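Only about 11% of clients subscribe, so a plain random split can leave the test set with a skewed class ratio. A sketch of a stratified alternative using StratifiedShuffleSplit from the same module:
In [ ]:
# Preserve the yes/no proportions in both splits
from sklearn.cross_validation import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(y, n_iter=1, test_size=0.2, random_state=0)
for train_idx, test_idx in sss:
    x_train_s, x_test_s = x.iloc[train_idx], x.iloc[test_idx]
    y_train_s, y_test_s = y[train_idx], y[test_idx]
# The positive rate should now be nearly identical in both splits
print y_train_s.mean(), y_test_s.mean()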
In [12]:
# Import the KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
# Instantiate the estimator
kn_model = KNeighborsClassifier()
# Fit the estimator to the training data
kn_model.fit(x_train, y_train)
# Print the mean accuracy on the training data
print kn_model.score(x_train, y_train)
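Note that this score is computed on the same rows the model was fit on, so it is optimistic. A fairer estimate via cross-validation on the training split:
In [ ]:
# Cross-validated score for the same (default) KNN model
from sklearn.cross_validation import cross_val_score
print cross_val_score(kn_model, x_train, y_train).mean()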
In [13]:
# Import cross_val_score and the tree-based classifiers from scikit-learn
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
# Use the RandomForestClassifier to fit the data
rf_model = RandomForestClassifier()
rf_model.fit(x_train, y_train)
# Print the mean cross-validation score
print cross_val_score(rf_model, x_train, y_train).mean()
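Since roughly 89% of the examples are 'no', any score should be compared against a majority-class baseline; a sketch using scikit-learn's DummyClassifier:
In [ ]:
# Baseline: always predict the majority class ('no')
from sklearn.dummy import DummyClassifier

dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(x_train, y_train)
print dummy.score(x_test, y_test)  # ~0.89 just from class imbalance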
In [14]:
# Define a learning-curve plotting helper (adapted from the scikit-learn documentation)
from sklearn.learning_curve import learning_curve
def plot_learning_curve(estimator, title, x, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.
    title : string
        Title for the chart.
    x : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to x for classification or regression;
        None for unsupervised learning.
    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum y-values plotted.
    cv : integer, cross-validation generator, optional
        If an integer is passed, it is the number of folds (defaults to 3).
        Specific cross-validation objects can be passed; see the
        sklearn.cross_validation module for the list of possible objects.
    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, x, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt
In [14]:
# Plot the learning curve for KNN using the training data
_ = plot_learning_curve(KNeighborsClassifier(n_neighbors=100), 'KNeighborsClassifier', x_train, y_train)
In [15]:
# Plot the learning curve for the random forest using the training data
_ = plot_learning_curve(RandomForestClassifier(n_estimators=100), 'RandomForestClassifier', x_train, y_train)
In [25]:
# Import GridSearchCV from scikit-learn
from sklearn.grid_search import GridSearchCV
# Establish the search space for the n_neighbors parameter
param = {'n_neighbors': range(1, 100)}
# Set up the grid search
gs = GridSearchCV(KNeighborsClassifier(),param)
# Run the grid search on our model
gs.fit(x_train, y_train)
Out[25]:
In [26]:
# Display the parameters and score for the best fitting model
gs.best_params_,gs.best_score_
Out[26]:
In [ ]:
# GridSearch suggests that the cross-validation score is best with 99 neighbors
# In fact the score keeps rising across the whole search range, so the true
# optimum may lie beyond it; the plot below shows the trend
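To visualize that trend, a sketch plotting the mean cross-validation score against n_neighbors, using the grid_scores_ attribute this version of GridSearchCV exposes:
In [ ]:
# Mean CV score for each n_neighbors value tried by the grid search
scores = [s.mean_validation_score for s in gs.grid_scores_]
plt.plot(range(1, 100), scores)
plt.xlabel('n_neighbors')
plt.ylabel('mean cross-validation score')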
In [16]:
# Import GridSearchCV from scikit-learn
from sklearn.grid_search import GridSearchCV
# Establish the search space for the n_estimators parameter
param = {'n_estimators':range(1,100)}
# Set up the grid search
gs = GridSearchCV(RandomForestClassifier(),param)
# Run the grid search on our model
gs.fit(x_train, y_train)
Out[16]:
In [17]:
# Display the parameters and score for the best fitting model
gs.best_params_,gs.best_score_
Out[17]:
In [18]:
# GridSearch suggests that the cross-validation score is best with 66 estimators
# Unlike n_neighbors, the score levels off and then declines past a certain number of trees
In [27]:
# Import the metrics package from scikit-learn
from sklearn import metrics
In [29]:
# Refit the KNeighborsClassifier with the best n_neighbors and predict on the test data
kn_model = KNeighborsClassifier(n_neighbors=99)
kn_model.fit(x_train, y_train)
y_pred = kn_model.predict(x_test)
# Run the classification report
print metrics.classification_report(y_test, y_pred)
In [30]:
# Print out the accuracy, precision, recall, etc.
print "accuracy:", metrics.accuracy_score(y_test, y_pred)
print "precision:", metrics.precision_score(y_test, y_pred)
print "recall:", metrics.recall_score(y_test, y_pred)
print "f1 score:", metrics.f1_score(y_test, y_pred)
In [31]:
# The accuracy of the model is 0.917 while the precision is 0.687 and the recall is 0.50
# Because ~89% of clients do not subscribe, overall accuracy is dominated by the easy
# negative class; a recall of 0.50 means the model finds only about half of the actual subscribers
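A confusion matrix makes this concrete; the second row shows how many actual subscribers are missed versus found:
In [ ]:
# Rows are true classes (no, yes); columns are predicted classes
print metrics.confusion_matrix(y_test, y_pred)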
In [35]:
# Refit the RandomForestClassifier with the best n_estimators and predict on the test data
rf_model = RandomForestClassifier(n_estimators=66)
rf_model.fit(x_train, y_train)
y_pred = rf_model.predict(x_test)
# Run the classification report
print metrics.classification_report(y_test, y_pred)
In [36]:
# Print out the accuracy, precision, recall, etc.
print "accuracy:", metrics.accuracy_score(y_test, y_pred)
print "precision:", metrics.precision_score(y_test, y_pred)
print "recall:", metrics.recall_score(y_test, y_pred)
print "f1 score:", metrics.f1_score(y_test, y_pred)
In [37]:
# The accuracy of the Random Forest model is 0.915 while the precision is 0.67 and the recall is 0.51
# As with the KNeighbors model, the high accuracy largely reflects the dominant negative class,
# and the model still misses about half of the actual subscribers
In [39]:
# Print the ten most important features found by the RandomForestClassifier
rf_model.fit(x_train, y_train)
sorted(zip(rf_model.feature_importances_, cols), reverse=True)[:10]
Out[39]:
In [ ]:
# The most important features are listed above, but the top 5 are significantly better predictors than the rest:
# duration
# euribor3m
# age
# nr.employed
# job
# Knowing this, it may be possible to simplify the model without hurting precision by keeping
# these highly predictive features and dropping the less predictive ones, as sketched below
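A hedged way to test that idea: refit the forest on just those five columns and compare the cross-validated score. The feature names are taken from the ranking above, and the exact numbers will vary from run to run.
In [ ]:
# Score the RandomForestClassifier using only the top-5 features
from sklearn.cross_validation import cross_val_score
top5 = ['duration', 'euribor3m', 'age', 'nr.employed', 'job']
print cross_val_score(RandomForestClassifier(n_estimators=66),
                      x[top5], y).mean()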