In previous posts we've looked at single classifiers in action. In this post, let's assess an ensemble that combines several classifiers through voting.
Logistic regression comes first.
In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, datasets
from sklearn.cross_validation import train_test_split  # renamed to sklearn.model_selection in 0.18+
digits = datasets.load_digits()
# split dataset into train and test
X_train, X_test, y_train, y_test = train_test_split( digits.data,
digits.target,
test_size=0.33 )
Cs = np.logspace(-4., 4., 20)
logreg_cv = linear_model.LogisticRegressionCV( Cs=Cs )
logreg_cv.fit( X_train, y_train )
# C_ holds one chosen value per class, so we report the average
print( 'Logistic Regression: C:{0}, train score:{1}'.format( np.average( logreg_cv.C_ ),
                                                             logreg_cv.score( X_train, y_train ) ) )
Now we will set up a KNN classifier on the training set.
In [2]:
from sklearn import datasets, neighbors
# use default number of neighbors (5)
nbor = neighbors.KNeighborsClassifier()
# fit the predictor
nbor.fit( X_train, y_train )
print( 'KNN train score:{0}'.format( nbor.score( X_train, y_train ) ) )
The last one will be an SVM classifier with a linear kernel. We are going to use a deliberately poor value for the regularization parameter C.
In [3]:
from sklearn import svm
# deliberately weaken the classifier with a very small C
clf_svm = svm.LinearSVC(penalty='l1', loss='squared_hinge',
dual=False, tol=1e-3, C=1e-3 )
clf_svm.fit( X_train, y_train )
print( 'Lin SVC train score:{0}'.format( clf_svm.score( X_train, y_train ) ) )
Let's define a voting classifier. The code is copied from the scikit-learn 0.17.0 source, and credit goes to Sebastian Raschka (@rasbt).
Comments and docstrings are removed for readability :)
In [4]:
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.base import TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.externals import six
from sklearn.base import clone
from sklearn.pipeline import _name_estimators
class VotingClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):

    def __init__(self, clfs, voting='hard', weights=None):
        self.clfs = clfs
        self.named_clfs = {key: value for key, value in _name_estimators(clfs)}
        self.voting = voting
        self.weights = weights

    def fit(self, X, y):
        if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:
            raise NotImplementedError('Multilabel and multi-output'
                                      ' classification is not supported.')
        if self.voting not in ('soft', 'hard'):
            raise ValueError("Voting must be 'soft' or 'hard'; got (voting=%r)"
                             % self.voting)
        if self.weights and len(self.weights) != len(self.clfs):
            raise ValueError('Number of classifiers and weights must be equal'
                             '; got %d weights, %d clfs'
                             % (len(self.weights), len(self.clfs)))
        self.le_ = LabelEncoder()
        self.le_.fit(y)
        self.classes_ = self.le_.classes_
        self.clfs_ = []
        for clf in self.clfs:
            fitted_clf = clone(clf).fit(X, self.le_.transform(y))
            self.clfs_.append(fitted_clf)
        return self

    def predict(self, X):
        if self.voting == 'soft':
            maj = np.argmax(self.predict_proba(X), axis=1)
        else:  # 'hard' voting
            predictions = self._predict(X)
            maj = np.apply_along_axis(
                lambda x: np.argmax(np.bincount(x, weights=self.weights)),
                axis=1,
                arr=predictions)
        maj = self.le_.inverse_transform(maj)
        return maj

    def predict_proba(self, X):
        avg = np.average(self._predict_probas(X), axis=0, weights=self.weights)
        return avg

    def transform(self, X):
        if self.voting == 'soft':
            return self._predict_probas(X)
        else:
            return self._predict(X)

    def get_params(self, deep=True):
        if not deep:
            return super(VotingClassifier, self).get_params(deep=False)
        else:
            out = self.named_clfs.copy()
            for name, step in six.iteritems(self.named_clfs):
                for key, value in six.iteritems(step.get_params(deep=True)):
                    out['%s__%s' % (name, key)] = value
            return out

    def _predict(self, X):
        # predict with the fitted clones, not the unfitted originals
        return np.asarray([clf.predict(X) for clf in self.clfs_]).T

    def _predict_probas(self, X):
        return np.asarray([clf.predict_proba(X) for clf in self.clfs_])
Let's put all of these together using the VotingClassifier. Note that soft voting would require every estimator to implement predict_proba, which LinearSVC does not, so we use hard voting here.
In [5]:
estimators = ( logreg_cv, nbor, clf_svm )
vclf = VotingClassifier( clfs=estimators,
voting='hard',
weights=[1, 1, 1])
vclf.fit( X_train, y_train )
for e in _name_estimators( estimators ):
print( '{0} score on test data: {1}'.format( e[0], e[1].score( X_test, y_test ) ) )
# final test
print( '\n' )
print( 'hard voting: Test score:{0}'.format( vclf.score( X_test, y_test ) ) )
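As a quick follow-up experiment (my own addition, not part of the original run), the weights argument lets us down-weight the deliberately crippled LinearSVC so its vote counts for less than the other two:
# give the deliberately weak LinearSVC half the voting power of the others
vclf_w = VotingClassifier( clfs=estimators,
                           voting='hard',
                           weights=[2, 2, 1] )
vclf_w.fit( X_train, y_train )
print( 'weighted hard voting: Test score:{0}'.format( vclf_w.score( X_test, y_test ) ) )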