In [2]:
# %load nbinit.py
# Notebook-wide CSS tweak: let the page container use the full browser width
# and give stderr output a yellow background so it stands out.
from IPython.display import HTML
HTML("""
<style>
.container { width: 100% !important; padding-left: 1em; padding-right: 2em; }
div.output_stderr { background: #FFA; }
</style>
""")
Out[2]:
HW6.ipynb
and save it in MSA8010F16/HW6
We use a data set from the UCI Machine Learning Repository https://archive.ics.uci.edu/ml/datasets/Bank+Marketing to experiment with a Decision Tree classifier http://www.saedsayad.com/decision_tree.htm
Scikit-Learn: http://scikit-learn.org/stable/modules/tree.html#tree
Book slides:
The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).
Data Set Information:
The data is related with direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact to the same client was required, in order to assess if the product (bank term deposit) would be ('yes') or not ('no') subscribed.
There are four datasets: 1) bank-additional-full.csv with all examples (41188) and 20 inputs, ordered by date (from May 2008 to November 2010), very close to the data analyzed in [Moro et al., 2014] 2) bank-additional.csv with 10% of the examples (4119), randomly selected from 1), and 20 inputs. 3) bank-full.csv with all examples and 17 inputs, ordered by date (older version of this dataset with less inputs). 4) bank.csv with 10% of the examples and 17 inputs, randomly selected from 3 (older version of this dataset with less inputs). The smallest datasets are provided to test more computationally demanding machine learning algorithms (e.g., SVM).
The classification goal is to predict if the client will subscribe (yes/no) a term deposit (variable y).
Attribute Information:
Input variables:
1 age (numeric)
2 job : type of job (categorical: 'admin.','blue-collar','entrepreneur','housemaid','management','retired','self-employed','services','student','technician','unemployed','unknown')
3 marital : marital status (categorical: 'divorced','married','single','unknown'; note: 'divorced' means divorced or widowed)
4 education (categorical: 'basic.4y','basic.6y','basic.9y','high.school','illiterate','professional.course','university.degree','unknown')
5 default: has credit in default? (categorical: 'no','yes','unknown')
6 housing: has housing loan? (categorical: 'no','yes','unknown')
7 loan: has personal loan? (categorical: 'no','yes','unknown')
8 contact: contact communication type (categorical: 'cellular','telephone')
9 month: last contact month of year (categorical: 'jan', 'feb', 'mar', ..., 'nov', 'dec')
10 day_of_week: last contact day of the week (categorical: 'mon','tue','wed','thu','fri')
11 duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.
12 campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
13 pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
14 previous: number of contacts performed before this campaign and for this client (numeric)
15 poutcome: outcome of the previous marketing campaign (categorical: 'failure','nonexistent','success')
16 emp.var.rate: employment variation rate - quarterly indicator (numeric)
17 cons.price.idx: consumer price index - monthly indicator (numeric)
18 cons.conf.idx: consumer confidence index - monthly indicator (numeric)
19 euribor3m: euribor 3 month rate - daily indicator (numeric)
20 nr.employed: number of employees - quarterly indicator (numeric)
Output variable (desired target): 21 y - has the client subscribed a term deposit? (binary: 'yes','no')
In [3]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [4]:
### Candidate locations of the data set, in order of preference:
### the shared server copy first, then a copy next to the notebook (using locally).
DATAFILE_CANDIDATES = [
    '/home/data/archive.ics.uci.edu/BankMarketing/bank.csv',
    'data/bank.csv',
]
### Use the first candidate that exists, so nobody has to (un)comment lines when
### switching machines. If none exists we keep the first one, and read_csv will
### then report that path in its error message.
DATAFILE = next(
    (path for path in DATAFILE_CANDIDATES if os.path.exists(path)),
    DATAFILE_CANDIDATES[0],
)
In [5]:
### The file is semicolon-separated; load it and list the column labels.
df = pd.read_csv(DATAFILE, delimiter=';')
df.columns.tolist()
Out[5]:
Let's first look at columns (i.e. variables) with continuous values. We can get a sense of the distribution from aggregate functions like mean, standard deviation, quantiles, as well as minimum and maximum values.
The Pandas method describe
creates a table view of those metrics. (The method can also be used to identify numeric features in the data frame.)
In [6]:
### Use sets and the '-' difference operation 'A-B'. (There is also a symmetric difference, '^'.)
### Everything except the target column 'y' is a feature.
all_features = set(df.columns) - {'y'}
### The columns of the describe() table are taken as the numeric features.
num_features = set(df.describe().columns)
cat_features = all_features - num_features
### Sets have no defined order, so sort the names for printing; otherwise the
### output can change from one kernel session to the next.
print("All features: ", ", ".join(sorted(all_features)), "\nNumerical features: ", ", ".join(sorted(num_features)), "\nCategorical features: ", ", ".join(sorted(cat_features)))
In [26]:
### The categorical features in a single expression: all columns, minus the
### describe() columns, minus the target.
### Note {'y'} rather than set('y'): set() of a string yields its individual
### characters, which only happens to work for a one-letter column name.
set(df.columns) - set(df.describe().columns) - {'y'}
Out[26]:
In [7]:
### Describe Columns: print the built-in documentation of DataFrame.describe
help(pd.DataFrame.describe)
In [8]:
### Let's get the description of the numeric data for each of the target values separately.
### We need to rename the columns before we can properly join the tables. The column names may look strange...
### (rename(columns=...) applies the function to every column label. rename_axis with a
### function did that only in old pandas; in current versions it sets the axis *name* instead.)
desc_yes = df[df.y == 'yes'].describe().rename(columns=lambda c: "%s|A" % c)
desc_no = df[df.y == 'no'].describe().rename(columns=lambda c: "%s|B" % c)
In [ ]:
### ...but this way we can get them in the desired order...
### Sort the labels of the *joined* table so each feature's "|A" (yes) column is
### directly followed by its "|B" (no) column. Sorting only desc_yes.columns
### would silently drop every "|B" column.
desc = desc_yes.join(desc_no)
desc = desc.reindex(columns=sorted(desc.columns))
### ...because we're changing them anyway:
In [12]:
#desc.set_axis(1, [sorted(list(num_features)*2), ['yes', 'no']*len(num_features)])
#desc
Let's look at the distribution of numerical features...
In [13]:
%matplotlib inline
fig = plt.figure(figsize=(32, 8))
for i in range(len(num_features)):
f = list(num_features)[i]
plt.subplot(2, 4, i+1)
hst = plt.hist(df[f], alpha=0.5)
plt.title(f)
plt.suptitle('Distribution of Numeric Values', fontsize=20)
None
Now, let's look at the categorical variables and their distribution...
In [14]:
### For every categorical feature print one line: "<feature>:<TAB>level(count), level(count), ..."
for feature in cat_features:
    counts = df[feature].value_counts()
    summary = ', '.join(
        "%s(%d)" % (level, count)
        for level, count in zip(counts.index, counts.values)
    )
    print('%s:\t%s' % (feature, summary))
Results in a data frame:
In [15]:
### Concatenate the per-feature counts under a two-level (feature, level) index.
### This avoids the wide intermediate table that is padded with NaN for every
### level a feature does not have: that padding turns the counts into floats and
### relies on stack() dropping the NaN cells again.
mat = pd.concat({f: df[f].value_counts() for f in cat_features})
pd.DataFrame(mat.values, index=mat.index)
Out[15]:
In [ ]:
The ML algorithms in Scikit-Learn use Matrices (with numeric values). We need to convert our data-frame into a feature matrix X
and a target vector y
.
Many algorithms also require the features to be in the same range. Decision trees don't need this because they don't perform any operations across features.
Use the pd.DataFrame.as_matrix
method to convert a DataFrame into a matrix.
In [ ]:
### DataFrame.as_matrix was removed in pandas 1.0, so looking it up raises an
### AttributeError there; DataFrame.to_numpy is its replacement.
help(pd.DataFrame.to_numpy)
In [16]:
## We copy our original dataframe into a new one, and then perform replacements on categorical levels.
## We also keep track of our replacements: feature name -> {level: integer code}
level_substitution = {}
def levels2index(levels):
    """Map every level to its position in ``levels``.

    Parameters
    ----------
    levels : iterable of hashable
        The distinct category levels, e.g. the result of ``Series.unique()``.
        Any iterable works; it does not need to support ``len()`` or indexing.

    Returns
    -------
    dict
        ``{level: index}``. If a level occurs more than once, its last
        position wins.
    """
    return {level: index for index, level in enumerate(levels)}
df_num = df.copy()
for c in cat_features:
    level_substitution[c] = levels2index(df[c].unique())
    ## Assign the encoded column back explicitly. An in-place replace() on the
    ## selection df_num[c] may act on a temporary copy (pandas copy-on-write)
    ## and then leaves df_num unchanged.
    ## Every value has a code because the mapping was built from unique().
    df_num[c] = df[c].map(level_substitution[c])
## same for target
df_num['y'] = df['y'].map({'no': 0, 'yes': 1})
## show only the first rows instead of the whole table
df_num.head()
Out[16]:
In [17]:
## Show the recorded mapping: feature name -> {level: integer code}
level_substitution
Out[17]:
In [ ]:
In [ ]:
In [18]:
## Feature matrix X (one column per entry of list(all_features)) and target
## vector y as NumPy arrays. to_numpy() replaces as_matrix(), which was
## removed in pandas 1.0.
X = df_num[list(all_features)].to_numpy()
y = df_num.y.to_numpy()
X, y
Out[18]:
In [19]:
### Scikit-learn provides us with a nice function to split the data into a training and a test part.
### test_size=.4 holds out 40% of the samples for testing; the fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)
In [20]:
from sklearn.tree import DecisionTreeClassifier
### Fix random_state so that fitting is repeatable: scikit-learn permutes the
### features randomly at every split (even with the default 'best' splitter),
### so ties between equally good splits could otherwise be resolved differently
### on each run. 42 matches the seed used for the train/test split.
clf = DecisionTreeClassifier(max_depth=5, random_state=42)
In [21]:
### Train the tree on the training part only...
clf.fit(X_train, y_train)
### ...then measure the mean accuracy on data it has seen and on data it has not.
score_train = clf.score(X_train, y_train)
score_test = clf.score(X_test, y_test)
report = 'Ratio of correctly classified samples for:\n\tTraining-set:\t%f\n\tTest-set:\t%f'
print(report % (score_train, score_test))
score
returns the mean accuracy on the given test data and labels. In multi-label classification, this is the subset accuracy which is a harsh metric since you require for each sample that each label set be correctly predicted. For binary classification it means percentage of correctly classified samples.
The score should be close to 1. Though, one single number does not tell the whole story...
In [33]:
import sklearn.tree
import pydot_ng as pdot
#--- describe the fitted tree in Graphviz' DOT language, with readable feature and class names
feature_labels = list(all_features)
dot_data = sklearn.tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=feature_labels,
    class_names=['no', 'yes'],
)
graph = pdot.graph_from_dot_data(dot_data)
#--- we can save the graph into a file ... preferably vector graphics
#graph.write_svg('mydt.svg')
graph.write_pdf('/home/pmolnar/public_html/mydt.pdf')
#--- or display right here
##from IPython.display import HTML
svg_markup = graph.create_svg().decode('utf-8')
HTML(svg_markup)
Out[33]:
Now, we use our classifier and predict on the test set (In order to get the ŷ
character type: 'y\hat' followed by the TAB-key.)
In [ ]:
### Predicted class for every sample of the test set (the target was encoded as 0 = 'no', 1 = 'yes')
ŷ = clf.predict(X_test)
In [ ]:
## a function that produces the confusion matrix: 1. parameter y=actual target, 2. parameter ŷ=predicted
def binary_confusion_matrix(y, ŷ):
    """Build a labelled 2x2 confusion matrix for 0/1-encoded binary labels.

    Parameters
    ----------
    y : array-like
        Actual target values; 1 (or True) marks the positive "Yes" class,
        anything else counts as "No".
    ŷ : array-like
        Predicted values, same length and encoding as ``y``.

    Returns
    -------
    pandas.DataFrame
        Rows are the prediction (Yes, No), columns the actual class (Yes, No),
        i.e. the cells are ``[[TP, FP], [FN, TN]]``.
    """
    # Work on boolean masks instead of adding/subtracting the labels. That
    # arithmetic fails for plain lists and boolean arrays, and wraps around
    # for unsigned integers (0 - 1 becomes 255 for uint8, so no false positive
    # is ever counted).
    actual = np.asarray(y) == 1
    predicted = np.asarray(ŷ) == 1
    TP = (actual & predicted).sum()
    TN = (~actual & ~predicted).sum()
    FP = (~actual & predicted).sum()
    FN = (actual & ~predicted).sum()
    return pd.DataFrame(
        [[TP, FP], [FN, TN]],
        index=[['Prediction', 'Prediction'], ['Yes', 'No']],
        columns=[['Actual', 'Actual'], ['Yes', 'No']],
    )
## Confusion matrix of the test-set predictions, as a labelled data frame
cm = binary_confusion_matrix(y_test, ŷ)
cm
In [ ]:
### Scikit-Learn can do that too ... not quite so nicely, though: the result carries no labels.
### NOTE(review): per the scikit-learn documentation the rows are the true labels and the
### columns the predicted labels, so the layout differs from binary_confusion_matrix — confirm
### before comparing cells. This also rebinds `cm`.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, ŷ)
cm
In [ ]:
### Here are some metrics: a text report comparing the actual test labels with the predictions
from sklearn.metrics import classification_report
print(classification_report(y_test, ŷ))
In [ ]:
In [ ]:
### http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py
import itertools
np.set_printoptions(precision=2)
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix; rows are drawn along the 'True label' axis and
        columns along the 'Predicted label' axis.
    classes : sequence
        Tick labels, one per row/column of ``cm``.
    normalize : bool, default False
        If True, divide every row by its sum before printing and plotting.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the cells.

    Returns
    -------
    None
        The matrix is printed and drawn on the current matplotlib figure.
    """
    cm = np.asarray(cm)
    # Normalize BEFORE drawing, so the colours, the colour bar and the numbers
    # written into the cells all describe the same values.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Whole counts are written as integers, everything else with two decimals
    # (instead of the full float representation).
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    # Cells darker than half of the maximum get white text to stay readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
In [ ]:
%matplotlib inline
### Draw the matrix currently bound to `cm` on a fresh figure, with each row normalized to sum to 1
fig = plt.figure()
plot_confusion_matrix(cm, classes=['No', 'Yes'], normalize=True, title='Normalized confusion matrix')
plt.show()
This is an experiment. What can we change to improve the performance of the model?
In [ ]: