In [103]:
!date
import numpy as np, matplotlib.pyplot as plt, pandas as pd, seaborn as sns
%matplotlib inline
sns.set_context('talk')
sns.set_style('darkgrid')
As an example of the ensemble methods, I wil do an simpliefied version of Max's project, since I love it.
In [104]:
!ls -hal /home/j/Project/IRH/DEX/USA/EXPLORATORY/ML_Injury/CMS_SNF_complete.csv
In [105]:
df = pd.read_csv('/home/j/Project/IRH/DEX/USA/EXPLORATORY/ML_Injury/CMS_SNF_complete.csv')
In [106]:
df.shape
Out[106]:
In [107]:
df.head()
Out[107]:
In [108]:
df.acause.value_counts()
Out[108]:
In [109]:
# for a stripped down version of Max's project, let's try to predict if an injury was caused by a fall
y = np.array(df.acause == 'inj_falls')
In [110]:
# as predictors, we will use the year, age, sex, and indicators for all N-codes
i_cols = df.filter(like='i_N').columns
X = np.array(df.filter(['year', 'age', 'sex'] + list(i_cols)))
In [111]:
# this is a nice, big dataset
X.shape, y.shape
Out[111]:
In [149]:
# set random seed for reproducibility
np.random.seed(123456)
# resample X and y to have equal number of examples from both classes
i_true, = np.where(y == True)
i_false, = np.where(y == False)
# n=1000 positive and negative examples should run quickly for class
n=10*1000
i_resample = list(np.random.choice(i_true, size=n)) + list(np.random.choice(i_false, size=n))
X = X[i_resample]
y = y[i_resample]
In [150]:
# class frequencies
np.unique(y)
Out[150]:
In [151]:
np.mean(y)
Out[151]:
In [152]:
# so a baseline classifier of always saying False has accuracy of
np.mean(y == False)
Out[152]:
In [153]:
# How does logistic regression do?
import sklearn.linear_model, sklearn.cross_validation
c1 = sklearn.linear_model.LogisticRegression()
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c1, X, y, cv=cv, ))
Out[153]:
Not too bad, but can we do better?
In [154]:
# I would try Naive Bayes for this, and might as well do decision trees, too
# How does logistic regression do?
import sklearn.naive_bayes, sklearn.tree
c2 = sklearn.naive_bayes.BernoulliNB()
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c2, X, y, cv=cv))
Out[154]:
In [155]:
c3 = sklearn.tree.DecisionTreeClassifier()
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c3, X, y, cv=cv))
Out[155]:
Will we be able to do any better with our fancy ensemble models? I sure hope so!
This algorithm encompasses several works from the literature. When random
subsets of the dataset are drawn as random subsets of the samples, then
this algorithm is known as Pasting [1]_. If samples are drawn with
replacement, then the method is known as Bagging [2]_. When random subsets
of the dataset are drawn as random subsets of the features, then the method
is known as Random Subspaces [3]_. Finally, when base estimators are built
on subsets of both samples and features, then the method is known as
Random Patches [4]_.
In [156]:
import sklearn.ensemble
c4 = sklearn.ensemble.BaggingClassifier()
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c4, X, y, cv=cv))
Out[156]:
Test your understanding: what happens if you Bag with Logistic Regression?
In [130]:
c = sklearn.ensemble.BaggingClassifier(base_estimator=sklearn.linear_model.LogisticRegression())
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c, X, y, cv=cv))
Out[130]:
What about with "decision stumps" (aka one-R)?
In [131]:
c = sklearn.ensemble.BaggingClassifier(base_estimator=sklearn.tree.DecisionTreeClassifier(max_depth=1))
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c, X, y, cv=cv))
Out[131]:
In [120]:
c5 = sklearn.ensemble.BaggingClassifier(max_samples=1., bootstrap=False, max_features=10)
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c5, X, y, cv=cv))
Out[120]:
In [121]:
c5 = sklearn.ensemble.BaggingClassifier(max_samples=.1, bootstrap=True, max_features=10)
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c5, X, y, cv=cv))
Out[121]:
In [157]:
import sklearn.ensemble
c10 = sklearn.ensemble.RandomForestClassifier()
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c10, X, y, cv=cv))
Out[157]:
In [158]:
c11 = sklearn.ensemble.RandomForestClassifier(n_estimators=100)
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c11, X, y, cv=cv))
Out[158]:
In [159]:
c20 = sklearn.ensemble.AdaBoostClassifier()
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c20, X, y, cv=cv))
Out[159]:
This approach was developed with things like decision stumps in mind:
In [160]:
c = sklearn.ensemble.AdaBoostClassifier(base_estimator=sklearn.tree.DecisionTreeClassifier(max_depth=1))
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c, X, y, cv=cv))
Out[160]:
In [161]:
c = sklearn.ensemble.AdaBoostClassifier(base_estimator=sklearn.tree.DecisionTreeClassifier(max_depth=2))
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c, X, y, cv=cv))
Out[161]:
In [162]:
c = sklearn.ensemble.AdaBoostClassifier(base_estimator=sklearn.tree.DecisionTreeClassifier(max_depth=3))
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c, X, y, cv=cv))
Out[162]:
Gradient Boosting is beloved by some, although I have not had life-changing success with it, myself:
In [ ]:
c21 = sklearn.ensemble.GradientBoostingClassifier()
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c21, X, y, cv=cv))
Out[ ]:
In [ ]:
c = sklearn.ensemble.GradientBoostingClassifier(max_depth=4)
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c, X, y, cv=cv))
In [ ]:
c = sklearn.ensemble.GradientBoostingClassifier(max_depth=5)
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c, X, y, cv=cv))
In [ ]:
# (c) 2014 Reid Johnson
#
# Modified from:
# Kemal Eren (https://github.com/kemaleren/scikit-learn/blob/stacking/sklearn/ensemble/stacking.py)
#
# Generates a stacking/blending of base models. Cross-validation is used to
# generate predictions from base (level-0) models that are used as input to a
# combiner (level-1) model.
import numpy as np
from itertools import izip
from sklearn.base import ClassifierMixin, RegressorMixin
from sklearn.ensemble.base import BaseEnsemble
from sklearn.utils.validation import assert_all_finite
class Stacking(BaseEnsemble):
"""Implements stacking/blending.
Parameters
----------
meta_estimator : string or callable
May be one of "best", "vote", "average", or any classifier or
regressor constructor
estimators : iterator
An iterable of estimators; each must support predict_proba()
cv : iterator
A cross validation object. Base (level-0) estimators are trained on
the training folds, then the meta (level-1) estimator is trained on
the testing folds.
stackingc : bool
Whether to use StackingC or not. For more information, refer to the
following paper:
Reference:
A. K. Seewald, "How to Make Stacking Better and Faster While Also
Taking Care of an Unknown Weakness," 2002.
kwargs :
Arguments passed to instantiate meta_estimator.
References
----------
.. [1] D. H. Wolpert, "Stacked Generalization", 1992.
"""
# TODO: Support different features for each estimator.
# TODO: Support "best", "vote", and "average" for already trained model.
# TODO: Allow saving of estimators, so they need not be retrained when
# trying new stacking methods.
def __init__(self, meta_estimator, estimators,
cv, stackingc=True, proba=True,
**kwargs):
self.estimators_ = estimators
self.n_estimators_ = len(estimators)
self.cv_ = cv
self.stackingc_ = stackingc
self.proba_ = proba
if stackingc:
if isinstance(meta_estimator, str) or not issubclass(meta_estimator, RegressorMixin):
raise Exception('StackingC only works with a regressor.')
if isinstance(meta_estimator, str):
if meta_estimator not in ('best',
'average',
'vote'):
raise Exception('Invalid meta estimator: {0}'.format(meta_estimator))
raise Exception('"{0}" meta estimator not implemented'.format(meta_estimator))
elif issubclass(meta_estimator, ClassifierMixin):
self.meta_estimator_ = meta_estimator(**kwargs)
elif issubclass(meta_estimator, RegressorMixin):
self.meta_estimator_ = MRLR(meta_estimator, stackingc, **kwargs)
else:
raise Exception('Invalid meta estimator: {0}'.format(meta_estimator))
def _base_estimator_predict(self, e, X):
"""Predict label values with the specified estimator on predictor(s) X.
Parameters
----------
e : int
The estimator object.
X : np.ndarray, shape=(n, m)
The feature data for which to compute the predicted outputs.
Returns
-------
pred : np.ndarray, shape=(len(X), 1)
The mean of the label probabilities predicted by the specified
estimator for each fold for each instance X.
"""
# Generate array for the base-level testing set, which is n x n_folds.
pred = e.predict(X)
assert_all_finite(pred)
return pred
def _base_estimator_predict_proba(self, e, X):
"""Predict label probabilities with the specified estimator on
predictor(s) X.
Parameters
----------
e : int
The estimator object.
X : np.ndarray, shape=(n, m)
The feature data for which to compute the predicted outputs.
Returns
-------
pred : np.ndarray, shape=(len(X), 1)
The mean of the label probabilities predicted by the specified
estimator for each fold for each instance X.
"""
# Generate array for the base-level testing set, which is n x n_folds.
pred = e.predict_proba(X)
assert_all_finite(pred)
return pred
def _make_meta(self, X):
"""Make the feature set for the meta (level-1) estimator.
Parameters
----------
X : np.ndarray, shape=(n, m)
The feature data.
Returns
-------
An n x len(self.estimators_) array of meta-level features.
"""
rows = []
for e in self.estimators_:
if self.proba_:
# Predict label probabilities
pred = self._base_estimator_predict_proba(e, X)
else:
# Predict label values
pred = self._base_estimator_predict(e, X)
rows.append(pred)
return np.hstack(rows)
def fit(self, X, y):
"""Fit the estimator given predictor(s) X and target y.
Parameters
----------
X : np.ndarray, shape=(n, m)
The feature data on which to fit.
y : array of shape = [n_samples]
The actual outputs (class data).
"""
# Build meta data.
X_meta = [] # meta-level features
y_meta = [] # meta-level labels
print 'Training and validating the base (level-0) estimator(s)...'
print
for i, (a, b) in enumerate(self.cv_):
print 'Fold [%s]' % (i)
X_a, X_b = X[a], X[b] # training and validation features
y_a, y_b = y[a], y[b] # training and validation labels
# Fit each base estimator using the training set for the fold.
for j, e in enumerate(self.estimators_):
print ' Training base (level-0) estimator %d...' % (j),
e.fit(X_a, y_a)
print 'done.'
proba = self._make_meta(X_b)
X_meta.append(proba)
y_meta.append(y_b)
print
X_meta = np.vstack(X_meta)
if y_meta[0].ndim == 1:
y_meta = np.hstack(y_meta)
else:
y_meta = np.vstack(y_meta)
# Train meta estimator.
print 'Training meta (level-1) estimator...',
self.meta_estimator_.fit(X_meta, y_meta)
print 'done.'
# Re-train base estimators on full data.
for j, e in enumerate(self.estimators_):
print 'Re-training base (level-0) estimator %d on full data...' % (j),
e.fit(X, y)
print 'done.'
def predict(self, X):
"""Predict label values with the fitted estimator on predictor(s) X.
Parameters
----------
X : np.ndarray, shape=(n, m)
The feature data for which to compute the predicted output.
Returns
-------
array of shape = [n_samples]
The predicted label values of the input samples.
"""
X_meta = self._make_meta(X)
return self.meta_estimator_.predict(X_meta)
def predict_proba(self, X):
"""Predict label probabilities with the fitted estimator on
predictor(s) X.
Parameters
----------
X : np.ndarray, shape=(n, m)
The feature data for which to compute the predicted output.
Returns
-------
array of shape = [n_samples]
The predicted label probabilities of the input samples.
"""
X_meta = self._make_meta(X)
return self.meta_estimator_.predict_proba(X_meta)
In [ ]:
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(X, y, test_size=0.4)
In [ ]:
%%time
clfs = [c1, c2, c3, c4, c5, c10, c11, c20, c21]
# Generate k stratified folds of the training data.
cv = sklearn.cross_validation.StratifiedKFold(y_train, n_folds=10, shuffle=True, random_state=123456)
skf = list(cv) # do you know what this does? turns an iterator into a list (of lists)... tricky!
stk = Stacking(sklearn.linear_model.LogisticRegression, clfs, skf, stackingc=False, proba=True)
stk.fit(X_train, y_train)
In [ ]:
sklearn.metrics.accuracy_score(y_test, stk.predict(X_test))
How much data did we use? try it with more, 10,000 for each class... or 100,000.
In [129]: