In [1]:
# %load nbinit.py
# Widen the notebook container and tint stderr output yellow; the HTML object
# is the cell's last expression, so Jupyter renders the <style> tag inline.
from IPython.display import HTML
HTML("<style>.container { width: 100% !important; padding-left: 1em; padding-right: 2em; } div.output_stderr { background: #FFA; }</style>")
Out[1]:
Let's see how well a decision tree can classify the data. To do so, we need to consider the following preparation and evaluation steps.
Once the dataset is loaded we will convert the categorical data into numeric values.
Finding the right parameters and features for the best performing classifier can be a challenge. The number of possible configurations grows quickly, and knowing how they perform requires training and testing with each of them.
We may also run the training and testing on a configuration multiple times with different random splits of the data set. The performance metrics will be averaged over the iterations.
We use precision, recall, and the F1 score to evaluate each configuration.
In [29]:
### Load Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import sklearn.tree
import pydot_ng as pdot
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
import itertools
In [2]:
### Read data
# NOTE(review): hardcoded absolute path — consider a configurable DATA_DIR
DATAFILE = '/home/data/archive.ics.uci.edu/BankMarketing/bank.csv'
# the UCI Bank Marketing CSV uses ';' as the field separator
df = pd.read_csv(DATAFILE, sep=';')
In [3]:
### use sets and '-' difference operation 'A-B'. Also there is a symmetric difference '^'
all_features = set(df.columns)-set(['y'])
# describe() only reports numeric columns, so its columns give the numeric features
num_features = set(df.describe().columns)
cat_features = all_features-num_features
print("All features: ", ", ".join(all_features), "\nNumerical features: ", ", ".join(num_features), "\nCategorical features: ", ", ".join(cat_features))
In [30]:
### convert the categorical variables to numeric ones
level_substitution = {}  # remembers the level -> code mapping per column

def levels2index(levels):
    """Map each level to its integer position in `levels` (order of appearance)."""
    return {level: pos for pos, level in enumerate(levels)}
# Apply the categorical -> integer mapping on a copy, so the raw frame stays intact
df_num = df.copy()
for c in cat_features:
    level_substitution[c] = levels2index(df[c].unique())
    # plain assignment instead of inplace=True: Series.replace(..., inplace=True)
    # on a column selection does not reliably write back to the frame
    # (SettingWithCopy / copy-on-write semantics in newer pandas)
    df_num[c] = df_num[c].replace(level_substitution[c])
## same for the target: 'no' -> 0, 'yes' -> 1
df_num['y'] = df_num['y'].replace({'no': 0, 'yes': 1})
df_num
Out[30]:
In [33]:
### create feature matrix and target vector
# .as_matrix() was removed in pandas 1.0 — .to_numpy() is the replacement.
# 'day' and 'month' are excluded from the feature set here.
X = df_num[list(all_features-set(['day', 'month']))].to_numpy()
y = df_num.y.to_numpy()
X, y
Out[33]:
In [34]:
### Compare tree depths on one fixed train/test split.
# Hoisted out of the loop: with random_state=42 the split is identical on every
# iteration, so recomputing it per depth was redundant work.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)
for d in [3, 5, 7, 11, 13]:
    clf = DecisionTreeClassifier(max_depth=d)
    clf.fit(X_train, y_train)
    ŷ = clf.predict(X_test)  # predictions of the last depth remain available to later cells
    print('Depth %d' % d)
    print(classification_report(y_test, ŷ))
Two methods from sklearn.metrics can be helpful:
confusion_matrix produces a confusion matrix; precision_recall_fscore_support returns a matrix with values for each of them across all target levels.
In [36]:
# confusion matrix for the last fitted tree: rows = true class, columns = predicted
# (class 0 = 'no', 1 = 'yes' per the target encoding above)
cm = confusion_matrix(y_test, ŷ)
cm
Out[36]:
In [37]:
# four arrays — precision, recall, F1, support — each with one entry per class
prf1s = precision_recall_fscore_support(y_test, ŷ)
prf1s
Out[37]:
In [10]:
# Aggregate the metric arrays over iterations by stacking rows and averaging.
# NOTE(review): prf1s is never recomputed inside this loop, so all 100 stacked
# rows are identical and the mean simply reproduces prf1s — presumably a
# placeholder for re-running train/test per iteration; confirm intent.
perf = None
for i in range(100):
    if perf is not None:  # idiomatic None check instead of type(...) comparison
        perf = np.vstack((perf, np.array(prf1s).reshape(1, 8)))
    else:
        perf = np.array(prf1s).reshape(1, 8)
perf_agg = perf.mean(axis=0)  # column-wise mean over the iterations
# one-row frame with a two-level column index: (metric, class)
pd.DataFrame(perf_agg.reshape(1,8), columns=[[b for a in ['Precision', 'Recall', 'F1_score', 'Support'] for b in [a, a]], ['no', 'yes']*4])
Out[10]:
In [14]:
# Prototype: a one-row results frame with mixed dtypes under a two-level
# column index ((Params | metric) x (field | class)).
metric_cols = [m for metric in ['Precision', 'Recall', 'F1_score', 'Support'] for m in (metric, metric)]
performance_df = pd.DataFrame(columns=[
    ['Params', 'Params', 'Params'] + metric_cols,
    ['MaxDepth', 'Nfeature', 'Features'] + ['no', 'yes'] * 4,
])
# build the row by gluing an object-typed part and a float part side by side,
# then adopt the hierarchical column labels of the template
tempdf = pd.concat(
    [pd.DataFrame({'a': [1], 'b': [2], 'c': ['Hello']}),
     pd.DataFrame(np.zeros((1, 8)))],
    axis=1, ignore_index=True,
)
tempdf.columns = performance_df.columns
tempdf
Out[14]:
In [274]:
# scratch cell: a 1x8 zero row, matching the eight metric columns of the template
pd.DataFrame(np.zeros(8).reshape(1,8))
Out[274]:
In [41]:
# creating a template (i.e. an empty table) with hierarchical columns:
# three 'Params' columns followed by (metric, class) pairs for both classes
doubled_metrics = [name for name in ['Precision', 'Recall', 'F1_score', 'Support'] for _ in range(2)]
performance_template_df = pd.DataFrame(columns=[
    ['Params'] * 3 + doubled_metrics,
    ['MaxDepth', 'Nfeature', 'Features'] + ['no', 'yes'] * 4,
])
performance_template_df
Out[41]:
The following code implements nested loops for MaxDepth, number and permutation of features. In addition, we have an internal loop to aggregate the performance metrics over a number of different random splits.
The outer two loops, however, only iterate over one value each. The commented code shows how they should run...
In [42]:
%%time
performance_df = performance_template_df.copy() #-- always start fresh
for MaxDepth in [5]: ###range(5,9):
for Nftr in [8]: ###[len(all_features) - k for k in range(len(all_features)-2))]:
for ftrs in itertools.combinations(all_features-set(['day', 'month']), Nftr):
X = df_num[list(ftrs)].as_matrix()
clf = DecisionTreeClassifier(max_depth=MaxDepth)
perf_arr = None #-- this array will hold results for different random samples
for i in range(10): ### running train and test on different random samples
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=i)
clf.fit(X_train, y_train)
ŷ = clf.predict(X_test)
#Prec, Recall, F1, Supp
prf1s = precision_recall_fscore_support(y_test, ŷ)
##
if type(perf_arr)!=type(None):
perf_arr = np.vstack((perf, np.array(prf1s).reshape(1,8)))
else:
perf_arr = np.array(prf1s).reshape(1,8)
perf_agg = perf_arr.mean(axis=0) #-- mean over rows, for each column
perf_df = pd.concat([ #-- creating a 1 row dataframe is a bit tricky because of the different data types
pd.DataFrame({'a': [MaxDepth], 'b': [Nftr], 'c': ['|'.join(list(ftrs))]}),
pd.DataFrame(perf_agg.reshape(1, 8))
], axis=1, ignore_index=True)
perf_df.columns=performance_df.columns
performance_df = performance_df.append(perf_df, ignore_index=True)
In [43]:
# one row per evaluated feature combination, metrics averaged over the 10 splits
performance_df
Out[43]:
That took a while (about 2 minutes). Once computations take that long we should look at a different way to implement them ... outside the notebook.
Let's see what the best performing configuration with respect to the F1-score of 'yes' is:
In [61]:
# idxmax has stable, unambiguous semantics (returns the index label), whereas
# Series.argmax changed meaning across pandas versions; with the RangeIndex
# produced by the ignore_index appends above, the result is identical.
best = performance_df.F1_score.yes.idxmax()
print(performance_df.loc[best])
print("\nFeatures: ", ', '.join(['"%s"' % f for f in performance_df.loc[best].Params.Features.split('|')]))
In [ ]: