In [103]:
    
!date
import numpy as np, matplotlib.pyplot as plt, pandas as pd, seaborn as sns
%matplotlib inline
sns.set_context('talk')
sns.set_style('darkgrid')
    
    
As an example of the ensemble methods, I wil do an simpliefied version of Max's project, since I love it.
In [104]:
    
!ls -hal /home/j/Project/IRH/DEX/USA/EXPLORATORY/ML_Injury/CMS_SNF_complete.csv
    
    
In [105]:
    
df = pd.read_csv('/home/j/Project/IRH/DEX/USA/EXPLORATORY/ML_Injury/CMS_SNF_complete.csv')
    
In [106]:
    
df.shape
    
    Out[106]:
In [107]:
    
df.head()
    
    Out[107]:
In [108]:
    
df.acause.value_counts()
    
    Out[108]:
In [109]:
    
# for a stripped down version of Max's project, let's try to predict if an injury was caused by a fall
y = np.array(df.acause == 'inj_falls')
    
In [110]:
    
# as predictors, we will use the year, age, sex, and indicators for all N-codes
i_cols = df.filter(like='i_N').columns
X = np.array(df.filter(['year', 'age', 'sex'] + list(i_cols)))
    
In [111]:
    
# this is a nice, big dataset
X.shape, y.shape
    
    Out[111]:
In [149]:
    
# set random seed for reproducibility
np.random.seed(123456)
# resample X and y to have equal number of examples from both classes
i_true, = np.where(y == True)
i_false, = np.where(y == False)
# n=1000 positive and negative examples should run quickly for class
n=10*1000
i_resample = list(np.random.choice(i_true, size=n)) + list(np.random.choice(i_false, size=n))
X = X[i_resample]
y = y[i_resample]
    
In [150]:
    
# class frequencies
np.unique(y)
    
    Out[150]:
In [151]:
    
np.mean(y)
    
    Out[151]:
In [152]:
    
# so a baseline classifier of always saying False has accuracy of
np.mean(y == False)
    
    Out[152]:
In [153]:
    
# How does logistic regression do?
import sklearn.linear_model, sklearn.cross_validation
c1 = sklearn.linear_model.LogisticRegression()
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c1, X, y, cv=cv, ))
    
    
    Out[153]:
Not too bad, but can we do better?
In [154]:
    
# I would try Naive Bayes for this, and might as well do decision trees, too
# How does logistic regression do?
import sklearn.naive_bayes, sklearn.tree
c2 = sklearn.naive_bayes.BernoulliNB()
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c2, X, y, cv=cv))
    
    
    Out[154]:
In [155]:
    
c3 = sklearn.tree.DecisionTreeClassifier()
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c3, X, y, cv=cv))
    
    
    Out[155]:
Will we be able to do any better with our fancy ensemble models? I sure hope so!
This algorithm encompasses several works from the literature. When random
subsets of the dataset are drawn as random subsets of the samples, then
this algorithm is known as Pasting [1]_. If samples are drawn with
replacement, then the method is known as Bagging [2]_. When random subsets
of the dataset are drawn as random subsets of the features, then the method
is known as Random Subspaces [3]_. Finally, when base estimators are built
on subsets of both samples and features, then the method is known as
Random Patches [4]_.
In [156]:
    
import sklearn.ensemble
c4 = sklearn.ensemble.BaggingClassifier()
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c4, X, y, cv=cv))
    
    
    Out[156]:
Test your understanding: what happens if you Bag with Logistic Regression?
In [130]:
    
c = sklearn.ensemble.BaggingClassifier(base_estimator=sklearn.linear_model.LogisticRegression())
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c, X, y, cv=cv))
    
    
    Out[130]:
What about with "decision stumps" (aka one-R)?
In [131]:
    
c = sklearn.ensemble.BaggingClassifier(base_estimator=sklearn.tree.DecisionTreeClassifier(max_depth=1))
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c, X, y, cv=cv))
    
    
    Out[131]:
In [120]:
    
c5 = sklearn.ensemble.BaggingClassifier(max_samples=1., bootstrap=False, max_features=10)
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c5, X, y, cv=cv))
    
    
    Out[120]:
In [121]:
    
c5 = sklearn.ensemble.BaggingClassifier(max_samples=.1, bootstrap=True, max_features=10)
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c5, X, y, cv=cv))
    
    
    Out[121]:
In [157]:
    
import sklearn.ensemble
c10 = sklearn.ensemble.RandomForestClassifier()
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c10, X, y, cv=cv))
    
    
    Out[157]:
In [158]:
    
c11 = sklearn.ensemble.RandomForestClassifier(n_estimators=100)
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c11, X, y, cv=cv))
    
    
    Out[158]:
In [159]:
    
c20 = sklearn.ensemble.AdaBoostClassifier()
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c20, X, y, cv=cv))
    
    
    Out[159]:
This approach was developed with things like decision stumps in mind:
In [160]:
    
c = sklearn.ensemble.AdaBoostClassifier(base_estimator=sklearn.tree.DecisionTreeClassifier(max_depth=1))
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c, X, y, cv=cv))
    
    
    Out[160]:
In [161]:
    
c = sklearn.ensemble.AdaBoostClassifier(base_estimator=sklearn.tree.DecisionTreeClassifier(max_depth=2))
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c, X, y, cv=cv))
    
    
    Out[161]:
In [162]:
    
c = sklearn.ensemble.AdaBoostClassifier(base_estimator=sklearn.tree.DecisionTreeClassifier(max_depth=3))
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c, X, y, cv=cv))
    
    
    Out[162]:
Gradient Boosting is beloved by some, although I have not had life-changing success with it, myself:
In [ ]:
    
c21 = sklearn.ensemble.GradientBoostingClassifier()
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c21, X, y, cv=cv))
    
    
    Out[ ]:
In [ ]:
    
c = sklearn.ensemble.GradientBoostingClassifier(max_depth=4)
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c, X, y, cv=cv))
    
In [ ]:
    
c = sklearn.ensemble.GradientBoostingClassifier(max_depth=5)
cv = sklearn.cross_validation.StratifiedKFold(y, n_folds=10, shuffle=True, random_state=123456)
%time np.mean(sklearn.cross_validation.cross_val_score(c, X, y, cv=cv))
    
In [ ]:
    
# (c) 2014 Reid Johnson
#
# Modified from:
# Kemal Eren (https://github.com/kemaleren/scikit-learn/blob/stacking/sklearn/ensemble/stacking.py)
#
# Generates a stacking/blending of base models. Cross-validation is used to 
# generate predictions from base (level-0) models that are used as input to a 
# combiner (level-1) model.
import numpy as np
from itertools import izip
from sklearn.base import ClassifierMixin, RegressorMixin
from sklearn.ensemble.base import BaseEnsemble
from sklearn.utils.validation import assert_all_finite
class Stacking(BaseEnsemble):
    """Implements stacking/blending.
    Parameters
    ----------
    meta_estimator : string or callable
        May be one of "best", "vote", "average", or any classifier or 
        regressor constructor
    estimators : iterator
        An iterable of estimators; each must support predict_proba()
    cv : iterator
        A cross validation object. Base (level-0) estimators are trained on 
        the training folds, then the meta (level-1) estimator is trained on 
        the testing folds.
    stackingc : bool
        Whether to use StackingC or not. For more information, refer to the 
        following paper:
        Reference:
          A. K. Seewald, "How to Make Stacking Better and Faster While Also 
          Taking Care of an Unknown Weakness," 2002.
    kwargs :
        Arguments passed to instantiate meta_estimator.
    References
    ----------
    .. [1] D. H. Wolpert, "Stacked Generalization", 1992.
    """
    # TODO: Support different features for each estimator.
    # TODO: Support "best", "vote", and "average" for already trained model.
    # TODO: Allow saving of estimators, so they need not be retrained when 
    #       trying new stacking methods.
    def __init__(self, meta_estimator, estimators,
                 cv, stackingc=True, proba=True,
                 **kwargs):
        self.estimators_ = estimators
        self.n_estimators_ = len(estimators)
        self.cv_ = cv
        self.stackingc_ = stackingc
        self.proba_ = proba
        if stackingc:
            if isinstance(meta_estimator, str) or not issubclass(meta_estimator, RegressorMixin):
                raise Exception('StackingC only works with a regressor.')
        if isinstance(meta_estimator, str):
            if meta_estimator not in ('best',
                                      'average',
                                      'vote'):
                raise Exception('Invalid meta estimator: {0}'.format(meta_estimator))
            raise Exception('"{0}" meta estimator not implemented'.format(meta_estimator))
        elif issubclass(meta_estimator, ClassifierMixin):
            self.meta_estimator_ = meta_estimator(**kwargs)
        elif issubclass(meta_estimator, RegressorMixin):
            self.meta_estimator_ = MRLR(meta_estimator, stackingc, **kwargs)
        else:
            raise Exception('Invalid meta estimator: {0}'.format(meta_estimator))
    def _base_estimator_predict(self, e, X):
        """Predict label values with the specified estimator on predictor(s) X.
        Parameters
        ----------
        e : int
            The estimator object.
        X : np.ndarray, shape=(n, m)
            The feature data for which to compute the predicted outputs.
        Returns
        -------
        pred : np.ndarray, shape=(len(X), 1)
            The mean of the label probabilities predicted by the specified 
            estimator for each fold for each instance X.
        """
        # Generate array for the base-level testing set, which is n x n_folds.
        pred = e.predict(X)
        assert_all_finite(pred)
        return pred
    def _base_estimator_predict_proba(self, e, X):
        """Predict label probabilities with the specified estimator on 
        predictor(s) X.
        Parameters
        ----------
        e : int
            The estimator object.
        X : np.ndarray, shape=(n, m)
            The feature data for which to compute the predicted outputs.
        Returns
        -------
        pred : np.ndarray, shape=(len(X), 1)
            The mean of the label probabilities predicted by the specified 
            estimator for each fold for each instance X.
        """
        # Generate array for the base-level testing set, which is n x n_folds.
        pred = e.predict_proba(X)
        assert_all_finite(pred)
        return pred
    def _make_meta(self, X):
        """Make the feature set for the meta (level-1) estimator.
        Parameters
        ----------
        X : np.ndarray, shape=(n, m)
            The feature data.
        Returns
        -------
        An n x len(self.estimators_) array of meta-level features.
        """
        rows = []
        for e in self.estimators_:
            if self.proba_:
                # Predict label probabilities
                pred = self._base_estimator_predict_proba(e, X)
            else:
                # Predict label values
                pred = self._base_estimator_predict(e, X)
            rows.append(pred)
        return np.hstack(rows)
    def fit(self, X, y):
        """Fit the estimator given predictor(s) X and target y.
        Parameters
        ----------
        X : np.ndarray, shape=(n, m)
            The feature data on which to fit.
        y : array of shape = [n_samples]
            The actual outputs (class data).
        """
        # Build meta data.
        X_meta = [] # meta-level features
        y_meta = [] # meta-level labels
        print 'Training and validating the base (level-0) estimator(s)...'
        print
        for i, (a, b) in enumerate(self.cv_):
            print 'Fold [%s]' % (i)
            X_a, X_b = X[a], X[b] # training and validation features
            y_a, y_b = y[a], y[b] # training and validation labels
            # Fit each base estimator using the training set for the fold.
            for j, e in enumerate(self.estimators_):
                print '  Training base (level-0) estimator %d...' % (j),
                e.fit(X_a, y_a)
                print 'done.'
            proba = self._make_meta(X_b)
            X_meta.append(proba)
            y_meta.append(y_b)
        print
        X_meta = np.vstack(X_meta)
        if y_meta[0].ndim == 1:
            y_meta = np.hstack(y_meta)
        else:
            y_meta = np.vstack(y_meta)
        # Train meta estimator.
        print 'Training meta (level-1) estimator...',
        self.meta_estimator_.fit(X_meta, y_meta)
        print 'done.'
        # Re-train base estimators on full data.
        for j, e in enumerate(self.estimators_):
            print 'Re-training base (level-0) estimator %d on full data...' % (j),
            e.fit(X, y)
            print 'done.'
    def predict(self, X):
        """Predict label values with the fitted estimator on predictor(s) X.
        Parameters
        ----------
        X : np.ndarray, shape=(n, m)
            The feature data for which to compute the predicted output.
        Returns
        -------
        array of shape = [n_samples]
            The predicted label values of the input samples.
        """
        X_meta = self._make_meta(X)
        return self.meta_estimator_.predict(X_meta)
    def predict_proba(self, X):
        """Predict label probabilities with the fitted estimator on 
        predictor(s) X.
        Parameters
        ----------
        X : np.ndarray, shape=(n, m)
            The feature data for which to compute the predicted output.
        Returns
        -------
        array of shape = [n_samples]
            The predicted label probabilities of the input samples.
        """
        X_meta = self._make_meta(X)
        return self.meta_estimator_.predict_proba(X_meta)
    
In [ ]:
    
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(X, y, test_size=0.4)
    
In [ ]:
    
%%time
clfs = [c1, c2, c3, c4, c5, c10, c11, c20, c21]
# Generate k stratified folds of the training data.
cv = sklearn.cross_validation.StratifiedKFold(y_train, n_folds=10, shuffle=True, random_state=123456)
skf = list(cv)  # do you know what this does?  turns an iterator into a list (of lists)... tricky!
stk = Stacking(sklearn.linear_model.LogisticRegression, clfs, skf, stackingc=False, proba=True)
stk.fit(X_train, y_train)
    
In [ ]:
    
sklearn.metrics.accuracy_score(y_test, stk.predict(X_test))
    
How much data did we use? try it with more, 10,000 for each class... or 100,000.
In [129]: