In [103]:
!date
import numpy as np, matplotlib.pyplot as plt, pandas as pd, seaborn as sns
%matplotlib inline
sns.set_context('talk')
sns.set_style('darkgrid')


Sun Mar  8 16:31:09 PDT 2015

As an example of the ensemble methods, I wil do an simpliefied version of Max's project, since I love it.


In [104]:
!ls -hal /home/j/Project/IRH/DEX/USA/EXPLORATORY/ML_Injury/CMS_SNF_complete.csv


-rw-rw---- 1 mlbirger Domain Users 21M Mar  1 15:06 /home/j/Project/IRH/DEX/USA/EXPLORATORY/ML_Injury/CMS_SNF_complete.csv

In [105]:
df = pd.read_csv('/home/j/Project/IRH/DEX/USA/EXPLORATORY/ML_Injury/CMS_SNF_complete.csv')

In [106]:
df.shape


Out[106]:
(145644, 59)

In [107]:
df.head()


Out[107]:
year age sex acause ecode i_N11 i_N12 i_N13 i_N14 i_N15 ... i_N5 i_N35 i_N6 i_N2 metric_vol_counts1 metric_vol_beddays1 metric_vol_mdcr_days1 metric_exp_mdcr_payments1 metric_exp_nonmdcr_payments1 metric_exp_charges1
0 1999 40 2 inj_animal E9064 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.00000 0 0.00000
1 1999 65 1 inj_animal E9068 0 0 0 0 0 ... 0 0 0 0 2 2 2 495.08209 0 3693.00610
2 1999 70 1 inj_animal E9068 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.00000 0 0.00000
3 1999 70 2 inj_animal E9069 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.00000 0 0.00000
4 1999 75 1 inj_animal E9065 0 0 0 0 1 ... 0 0 0 0 2 2 2 725.15808 0 467.14902

5 rows × 59 columns


In [108]:
df.acause.value_counts()


Out[108]:
inj_falls          88354
inj_trans_other    25966
inj_medical        22296
inj_trans_road      3770
inj_othunintent     1430
inj_mech             835
inj_poisoning        689
inj_fires            580
inj_foreign          462
inj_animal           336
inj_suicide          285
mental_drug          257
inj_homicide         186
inj_disaster         165
inj_war               27
inj_drowning           4
mental_alcohol         2
dtype: int64

In [109]:
# for a stripped down version of Max's project, let's try to predict if an injury was caused by a fall
y = np.array(df.acause == 'inj_falls')

In [110]:
# as predictors, we will use the year, age, sex, and indicators for all N-codes
i_cols = df.filter(like='i_N').columns

X = np.array(df.filter(['year', 'age', 'sex'] + list(i_cols)))

In [111]:
# this is a nice, big dataset
X.shape, y.shape


Out[111]:
((145644, 51), (145644,))

In [149]:
# set random seed for reproducibility
np.random.seed(123456)

# resample X and y to have equal number of examples from both classes
i_true, = np.where(y == True)
i_false, = np.where(y == False)

# n=1000 positive and negative examples should run quickly for class
n=10*1000
i_resample = list(np.random.choice(i_true, size=n)) + list(np.random.choice(i_false, size=n))
X = X[i_resample]
y = y[i_resample]

In [150]:
# class frequencies
np.unique(y)


Out[150]:
array([False,  True], dtype=bool)

In [151]:
np.mean(y)


Out[151]:
0.5

In [152]:
# so a baseline classifier of always saying False has accuracy of
np.mean(y == False)


Out[152]:
0.5

In [153]:
# How does logistic regression do?
import sklearn.linear_model, sklearn.cross_validation
c1 = sklearn.linear_model.LogisticRegression()

cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c1, X, y, cv=cv, ))


CPU times: user 1.1 s, sys: 0 ns, total: 1.1 s
Wall time: 1.1 s
Out[153]:
0.69510000000000005

Not too bad, but can we do better?


In [154]:
# I would try Naive Bayes for this, and might as well do decision trees, too
# How does logistic regression do?
import sklearn.naive_bayes, sklearn.tree

c2 = sklearn.naive_bayes.BernoulliNB()
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c2, X, y, cv=cv))


CPU times: user 390 ms, sys: 0 ns, total: 390 ms
Wall time: 390 ms
Out[154]:
0.69779999999999998

In [155]:
c3 = sklearn.tree.DecisionTreeClassifier()
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c3, X, y, cv=cv))


CPU times: user 930 ms, sys: 0 ns, total: 930 ms
Wall time: 931 ms
Out[155]:
0.8516999999999999

Will we be able to do any better with our fancy ensemble models? I sure hope so!

Bagging

This algorithm encompasses several works from the literature. When random
subsets of the dataset are drawn as random subsets of the samples, then
this algorithm is known as Pasting [1]_. If samples are drawn with
replacement, then the method is known as Bagging [2]_. When random subsets
of the dataset are drawn as random subsets of the features, then the method
is known as Random Subspaces [3]_. Finally, when base estimators are built
on subsets of both samples and features, then the method is known as
Random Patches [4]_.

In [156]:
import sklearn.ensemble
c4 = sklearn.ensemble.BaggingClassifier()
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c4, X, y, cv=cv))


CPU times: user 9.4 s, sys: 0 ns, total: 9.4 s
Wall time: 9.41 s
Out[156]:
0.85214999999999996

Test your understanding: what happens if you Bag with Logistic Regression?


In [130]:
c = sklearn.ensemble.BaggingClassifier(base_estimator=sklearn.linear_model.LogisticRegression())
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c, X, y, cv=cv))


CPU times: user 1.11 s, sys: 0 ns, total: 1.11 s
Wall time: 1.11 s
Out[130]:
0.68400000000000016

What about with "decision stumps" (aka one-R)?


In [131]:
c = sklearn.ensemble.BaggingClassifier(base_estimator=sklearn.tree.DecisionTreeClassifier(max_depth=1))
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c, X, y, cv=cv))


CPU times: user 254 ms, sys: 15 µs, total: 254 ms
Wall time: 253 ms
Out[131]:
0.64549999999999996

Random Subspaces


In [120]:
c5 = sklearn.ensemble.BaggingClassifier(max_samples=1., bootstrap=False, max_features=10)
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c5, X, y, cv=cv))


CPU times: user 168 ms, sys: 0 ns, total: 168 ms
Wall time: 168 ms
Out[120]:
0.68800000000000006

Random Patches


In [121]:
c5 = sklearn.ensemble.BaggingClassifier(max_samples=.1, bootstrap=True, max_features=10)
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c5, X, y, cv=cv))


CPU times: user 115 ms, sys: 3.99 ms, total: 119 ms
Wall time: 118 ms
Out[121]:
0.63400000000000001

Random Forests


In [157]:
import sklearn.ensemble
c10 = sklearn.ensemble.RandomForestClassifier()
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c10, X, y, cv=cv))


CPU times: user 1.53 s, sys: 0 ns, total: 1.53 s
Wall time: 1.53 s
Out[157]:
0.85099999999999998

In [158]:
c11 = sklearn.ensemble.RandomForestClassifier(n_estimators=100)
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c11, X, y, cv=cv))


CPU times: user 12.4 s, sys: 0 ns, total: 12.4 s
Wall time: 12.4 s
Out[158]:
0.85220000000000007

Boosting


In [159]:
c20 = sklearn.ensemble.AdaBoostClassifier()
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c20, X, y, cv=cv))


CPU times: user 7.77 s, sys: 0 ns, total: 7.77 s
Wall time: 7.77 s
Out[159]:
0.71784999999999988

This approach was developed with things like decision stumps in mind:


In [160]:
c = sklearn.ensemble.AdaBoostClassifier(base_estimator=sklearn.tree.DecisionTreeClassifier(max_depth=1))
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c, X, y, cv=cv))


CPU times: user 7.77 s, sys: 0 ns, total: 7.77 s
Wall time: 7.77 s
Out[160]:
0.71784999999999988

In [161]:
c = sklearn.ensemble.AdaBoostClassifier(base_estimator=sklearn.tree.DecisionTreeClassifier(max_depth=2))
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c, X, y, cv=cv))


CPU times: user 12.4 s, sys: 0 ns, total: 12.4 s
Wall time: 12.4 s
Out[161]:
0.75209999999999988

In [162]:
c = sklearn.ensemble.AdaBoostClassifier(base_estimator=sklearn.tree.DecisionTreeClassifier(max_depth=3))
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c, X, y, cv=cv))


CPU times: user 18 s, sys: 0 ns, total: 18 s
Wall time: 18 s
Out[162]:
0.78915000000000002

Gradient Boosting is beloved by some, although I have not had life-changing success with it, myself:


In [ ]:
c21 = sklearn.ensemble.GradientBoostingClassifier()
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c21, X, y, cv=cv))


CPU times: user 22 s, sys: 0 ns, total: 22 s
Wall time: 22 s
Out[ ]:
0.74009999999999998

In [ ]:
c = sklearn.ensemble.GradientBoostingClassifier(max_depth=4)
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c, X, y, cv=cv))

In [ ]:
c = sklearn.ensemble.GradientBoostingClassifier(max_depth=5)
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c, X, y, cv=cv))

Stacking


In [ ]:
# (c) 2014 Reid Johnson
#
# Modified from:
# Kemal Eren (https://github.com/kemaleren/scikit-learn/blob/stacking/sklearn/ensemble/stacking.py)
#
# Generates a stacking/blending of base models. Cross-validation is used to 
# generate predictions from base (level-0) models that are used as input to a 
# combiner (level-1) model.

import numpy as np
from itertools import izip
from sklearn.base import ClassifierMixin, RegressorMixin
from sklearn.ensemble.base import BaseEnsemble
from sklearn.utils.validation import assert_all_finite


class Stacking(BaseEnsemble):
    """Implements stacking/blending.

    Parameters
    ----------
    meta_estimator : string or callable
        May be one of "best", "vote", "average", or any classifier or 
        regressor constructor

    estimators : iterator
        An iterable of estimators; each must support predict_proba()

    cv : iterator
        A cross validation object. Base (level-0) estimators are trained on 
        the training folds, then the meta (level-1) estimator is trained on 
        the testing folds.

    stackingc : bool
        Whether to use StackingC or not. For more information, refer to the 
        following paper:

        Reference:
          A. K. Seewald, "How to Make Stacking Better and Faster While Also 
          Taking Care of an Unknown Weakness," 2002.

    kwargs :
        Arguments passed to instantiate meta_estimator.

    References
    ----------
    .. [1] D. H. Wolpert, "Stacked Generalization", 1992.

    """

    # TODO: Support different features for each estimator.
    # TODO: Support "best", "vote", and "average" for already trained model.
    # TODO: Allow saving of estimators, so they need not be retrained when 
    #       trying new stacking methods.

    def __init__(self, meta_estimator, estimators,
                 cv, stackingc=True, proba=True,
                 **kwargs):
        self.estimators_ = estimators
        self.n_estimators_ = len(estimators)
        self.cv_ = cv
        self.stackingc_ = stackingc
        self.proba_ = proba

        if stackingc:
            if isinstance(meta_estimator, str) or not issubclass(meta_estimator, RegressorMixin):
                raise Exception('StackingC only works with a regressor.')

        if isinstance(meta_estimator, str):
            if meta_estimator not in ('best',
                                      'average',
                                      'vote'):
                raise Exception('Invalid meta estimator: {0}'.format(meta_estimator))
            raise Exception('"{0}" meta estimator not implemented'.format(meta_estimator))
        elif issubclass(meta_estimator, ClassifierMixin):
            self.meta_estimator_ = meta_estimator(**kwargs)
        elif issubclass(meta_estimator, RegressorMixin):
            self.meta_estimator_ = MRLR(meta_estimator, stackingc, **kwargs)
        else:
            raise Exception('Invalid meta estimator: {0}'.format(meta_estimator))

    def _base_estimator_predict(self, e, X):
        """Predict label values with the specified estimator on predictor(s) X.

        Parameters
        ----------
        e : int
            The estimator object.

        X : np.ndarray, shape=(n, m)
            The feature data for which to compute the predicted outputs.

        Returns
        -------
        pred : np.ndarray, shape=(len(X), 1)
            The mean of the label probabilities predicted by the specified 
            estimator for each fold for each instance X.
        """
        # Generate array for the base-level testing set, which is n x n_folds.
        pred = e.predict(X)
        assert_all_finite(pred)
        return pred

    def _base_estimator_predict_proba(self, e, X):
        """Predict label probabilities with the specified estimator on 
        predictor(s) X.

        Parameters
        ----------
        e : int
            The estimator object.

        X : np.ndarray, shape=(n, m)
            The feature data for which to compute the predicted outputs.

        Returns
        -------
        pred : np.ndarray, shape=(len(X), 1)
            The mean of the label probabilities predicted by the specified 
            estimator for each fold for each instance X.
        """
        # Generate array for the base-level testing set, which is n x n_folds.
        pred = e.predict_proba(X)
        assert_all_finite(pred)
        return pred

    def _make_meta(self, X):
        """Make the feature set for the meta (level-1) estimator.

        Parameters
        ----------
        X : np.ndarray, shape=(n, m)
            The feature data.

        Returns
        -------
        An n x len(self.estimators_) array of meta-level features.
        """
        rows = []
        for e in self.estimators_:
            if self.proba_:
                # Predict label probabilities
                pred = self._base_estimator_predict_proba(e, X)
            else:
                # Predict label values
                pred = self._base_estimator_predict(e, X)
            rows.append(pred)
        return np.hstack(rows)

    def fit(self, X, y):
        """Fit the estimator given predictor(s) X and target y.

        Parameters
        ----------
        X : np.ndarray, shape=(n, m)
            The feature data on which to fit.

        y : array of shape = [n_samples]
            The actual outputs (class data).
        """
        # Build meta data.
        X_meta = [] # meta-level features
        y_meta = [] # meta-level labels

        print 'Training and validating the base (level-0) estimator(s)...'
        print
        for i, (a, b) in enumerate(self.cv_):
            print 'Fold [%s]' % (i)

            X_a, X_b = X[a], X[b] # training and validation features
            y_a, y_b = y[a], y[b] # training and validation labels

            # Fit each base estimator using the training set for the fold.
            for j, e in enumerate(self.estimators_):
                print '  Training base (level-0) estimator %d...' % (j),
                e.fit(X_a, y_a)
                print 'done.'

            proba = self._make_meta(X_b)
            X_meta.append(proba)
            y_meta.append(y_b)
        print

        X_meta = np.vstack(X_meta)
        if y_meta[0].ndim == 1:
            y_meta = np.hstack(y_meta)
        else:
            y_meta = np.vstack(y_meta)

        # Train meta estimator.
        print 'Training meta (level-1) estimator...',
        self.meta_estimator_.fit(X_meta, y_meta)
        print 'done.'

        # Re-train base estimators on full data.
        for j, e in enumerate(self.estimators_):
            print 'Re-training base (level-0) estimator %d on full data...' % (j),
            e.fit(X, y)
            print 'done.'

    def predict(self, X):
        """Predict label values with the fitted estimator on predictor(s) X.

        Parameters
        ----------
        X : np.ndarray, shape=(n, m)
            The feature data for which to compute the predicted output.

        Returns
        -------
        array of shape = [n_samples]
            The predicted label values of the input samples.
        """
        X_meta = self._make_meta(X)
        return self.meta_estimator_.predict(X_meta)

    def predict_proba(self, X):
        """Predict label probabilities with the fitted estimator on 
        predictor(s) X.

        Parameters
        ----------
        X : np.ndarray, shape=(n, m)
            The feature data for which to compute the predicted output.

        Returns
        -------
        array of shape = [n_samples]
            The predicted label probabilities of the input samples.
        """
        X_meta = self._make_meta(X)
        return self.meta_estimator_.predict_proba(X_meta)

In [ ]:
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(X, y, test_size=0.4)

In [ ]:
%%time

clfs = [c1, c2, c3, c4, c5, c10, c11, c20, c21]

# Generate k stratified folds of the training data.
cv = sklearn.cross_validation.StratifiedKFold(y_train, n_folds=10, shuffle=True, random_state=123456)
skf = list(cv)  # do you know what this does?  turns an iterator into a list (of lists)... tricky!

stk = Stacking(sklearn.linear_model.LogisticRegression, clfs, skf, stackingc=False, proba=True)
stk.fit(X_train, y_train)

In [ ]:
sklearn.metrics.accuracy_score(y_test, stk.predict(X_test))

How much data did we use? try it with more, 10,000 for each class... or 100,000.


In [129]: