5-Fold Stacking

Five base classifiers (Decision Tree, Random Forest, Extra Trees, AdaBoost, Gradient Boosting) produce 5-fold out-of-fold predictions on the Titanic dataset, and those predictions become the input features for a second-level XGBoost model.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Plotting helper from the scikit-learn documentation
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure(figsize=(10,6))  # adjust figure size
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt


Automatically created module for IPython interactive environment
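
plot_learning_curve is only referenced later in a commented-out line, so here is a minimal usage sketch (not part of the original run). It assumes the standardized training data X_train_std and labels y_train defined further down in this notebook; the estimator choice is illustrative.

# Hedged usage sketch -- run only after X_train_std / y_train exist.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import ShuffleSplit

cv_demo = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
plot_learning_curve(DecisionTreeClassifier(max_depth=5, random_state=0),
                    "Decision Tree learning curve", X_train_std, y_train,
                    cv=cv_demo, n_jobs=1)
plt.show()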

In [4]:
# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None, seed_flag=False):
        # Pass random_state only to estimators that accept it (seed_flag=True)
        if seed_flag:
            params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)

    def fit(self, x, y):
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        # Fit once, then print and return the learned importances
        importances = self.clf.fit(x, y).feature_importances_
        print(importances)
        return importances

In [5]:
#Out-of-Fold Predictions

def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf): # kf:KFold(ntrain, n_folds= NFOLDS,...)
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te) # partial index from x_train
        oof_test_skf[i, :] = clf.predict(x_test) # Row(n-Fold), Column(predict value)

    oof_test[:] = np.mean(oof_test_skf, axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1) #make sure return n-rows, 1-column shape.
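
For the training half of this routine, scikit-learn's cross_val_predict is a compact (hedged) reference: it reproduces the role of oof_train but not the fold-averaged oof_test, so the sketch below is left commented out and would only run after x_train, y_train and NFOLDS are defined in the later cells.

from sklearn.model_selection import cross_val_predict

# Reference sketch only -- not the notebook's code path:
# oof_train_alt = cross_val_predict(dt.clf, x_train, y_train, cv=NFOLDS).reshape(-1, 1)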

Download the Titanic passenger dataset


In [6]:
import urllib.request
import os

In [7]:
url="http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls"
filepath="data/titanic3.xls"
if not os.path.isfile(filepath):
    result=urllib.request.urlretrieve(url,filepath)
    print('downloaded:',result)

Read and preprocess the data with a pandas DataFrame


In [8]:
import numpy
import pandas as pd

In [9]:
all_df = pd.read_excel(filepath)

In [10]:
all_df[:2]


Out[10]:
pclass survived name sex age sibsp parch ticket fare cabin embarked boat body home.dest
0 1 1 Allen, Miss. Elisabeth Walton female 29.0000 0 0 24160 211.3375 B5 S 2 NaN St Louis, MO
1 1 1 Allison, Master. Hudson Trevor male 0.9167 1 2 113781 151.5500 C22 C26 S 11 NaN Montreal, PQ / Chesterville, ON

In [11]:
cols=['survived','name','pclass' ,'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
all_df=all_df[cols]

In [12]:
all_df[:2]


Out[12]:
survived name pclass sex age sibsp parch fare embarked
0 1 Allen, Miss. Elisabeth Walton 1 female 29.0000 0 0 211.3375 S
1 1 Allison, Master. Hudson Trevor 1 male 0.9167 1 2 151.5500 S

In [13]:
all_df.isnull().sum()


Out[13]:
survived      0
name          0
pclass        0
sex           0
age         263
sibsp         0
parch         0
fare          1
embarked      2
dtype: int64

In [14]:
df=all_df.drop(['name'], axis=1)
age_mean = df['age'].mean()
df['age'] = df['age'].fillna(age_mean)
fare_mean = df['fare'].mean()
df['fare'] = df['fare'].fillna(fare_mean)
df['sex']= df['sex'].map({'female':0, 'male': 1}).astype(int)
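
The same mean imputation can be written with scikit-learn's SimpleImputer; a sketch, assuming scikit-learn >= 0.20 (newer than the version this notebook ran on):

from sklearn.impute import SimpleImputer

imp = SimpleImputer(strategy='mean')  # mean imputation, same effect as fillna(mean) above
df[['age', 'fare']] = imp.fit_transform(df[['age', 'fare']])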

In [15]:
df[:2]


Out[15]:
survived pclass sex age sibsp parch fare embarked
0 1 1 0 29.0000 0 0 211.3375 S
1 1 1 1 0.9167 1 2 151.5500 S

In [16]:
x_OneHot_df = pd.get_dummies(data=df,columns=["embarked" ])
x_OneHot_df[:2]


Out[16]:
survived pclass sex age sibsp parch fare embarked_C embarked_Q embarked_S
0 1 1 0 29.0000 0 0 211.3375 0 0 1
1 1 1 1 0.9167 1 2 151.5500 0 0 1

Split the data into training and test sets


In [17]:
from sklearn.model_selection import train_test_split

X = x_OneHot_df[['pclass','sex','age','sibsp','parch','fare','embarked_C','embarked_Q','embarked_S']]
y = x_OneHot_df['survived']
test = X[1200:]

X_train, X_test, y_train, y_test = train_test_split(X[:1200], y[:1200], test_size = 0.3, random_state=100)

In [18]:
len(X), len(y), len(all_df), len(test)


Out[18]:
(1309, 1309, 1309, 109)

In [19]:
len(X_train), len(X_test), len(y_train), len(y_test)


Out[19]:
(840, 360, 840, 360)

In [20]:
# StandardScaler - subtract the mean and divide by std
# MaxAbsScaler - transform down to [-1, 1] bounds
# QuantileTransformer - transform down to [0, 1] bounds

from sklearn.preprocessing import StandardScaler #, MaxAbsScaler, QuantileTransformer
sc = StandardScaler() #MaxAbsScaler()
sc.fit(X)

X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
test_std = sc.transform(test)
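
Note that sc.fit(X) computes the scaling statistics on every row, including the 109 rows held out as test. A stricter variant (a sketch, not the original code) fits the scaler on the training split only:

sc = StandardScaler()
sc.fit(X_train)                      # statistics from the training split only
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
test_std = sc.transform(test)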

Model build


In [21]:
from sklearn.cross_validation import KFold

NFOLDS = 5 # set folds for out-of-fold prediction
SEED = 0 # for reproducibility

ntrain = X_train_std.shape[0] # X.shape[0]
ntest = test_std.shape[0] # test.shape[0]

kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)


C:\Anaconda3\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [22]:
# Parameters for the base (level-1) classifiers

# Decision Tree
dt_params = {
    'criterion':'gini',
    'max_depth':5
}

# KNN
knn_params = {
    'n_neighbors':5
}

# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    'criterion': 'gini',
    'max_depth': 4,
    #'min_samples_leaf': 2,
    'warm_start': True,
    'oob_score': True,
    'verbose': 0
}

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators': 800,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 800,
    'learning_rate' : 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier parameters (linear kernel)
svc_params = {
    'kernel' : 'linear',
    'C' : 1.0,
    'probability': True
}

# Support Vector Classifier parameters (RBF kernel)
svcr_params = {
    'kernel' : 'rbf',
    'C' : 1.0,
    'probability': True
}

# Bagging Classifier
bag_params = {
    'n_estimators' : 500,
    'oob_score': True
}

#XGBoost Classifier
xgbc_params = {
    'n_estimators': 500,
    'max_depth': 4,
    'learning_rate': 0.05,
    'nthread': -1
}

#Linear Discriminant Analysis
lda_params = {}

#Quadratic Discriminant Analysis
qda1_params = {
    'reg_param': 0.8,
    'tol': 0.00001
}

#Quadratic Discriminant Analysis
qda2_params = {
    'reg_param': 0.6,
    'tol': 0.0001
}

In [23]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

dt = SklearnHelper(clf=DecisionTreeClassifier, seed=SEED, params=dt_params, seed_flag=True)
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params, seed_flag=True)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params, seed_flag=True)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params, seed_flag=True)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params, seed_flag=True)

#knn = SklearnHelper(clf=KNeighborsClassifier, seed=SEED, params=knn_params)
#svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params, seed_flag=True)
#svcr = SklearnHelper(clf=SVC, seed=SEED, params=svcr_params, seed_flag=True)
#bag = SklearnHelper(clf=BaggingClassifier, seed=SEED, params=bag_params, seed_flag=True)
#xgbc = SklearnHelper(clf=XGBClassifier, seed=SEED, params=xgbc_params)
#lda = SklearnHelper(clf=LinearDiscriminantAnalysis, seed=SEED, params=lda_params)
#qda1 = SklearnHelper(clf=QuadraticDiscriminantAnalysis, seed=SEED, params=qda1_params)
#qda2 = SklearnHelper(clf=QuadraticDiscriminantAnalysis, seed=SEED, params=qda2_params)

In [24]:
# Create NumPy arrays of the train, test and target (survived) data to feed into our models
y_train = y_train.ravel()

#y.ravel()
#x_train = X.values # Creates an array of the train data
#x_test = test.values # Creates an array of the test data

#STD dataset:
x_train = X_train_std
x_test = test_std

In [25]:
# Create our OOF train and test predictions. These base results will be used as new features
dt_oof_train, dt_oof_test = get_oof(dt, x_train, y_train, x_test) # Decision Tree
rf_oof_train, rf_oof_test = get_oof(rf, x_train, y_train, x_test) # Random Forest
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test) # Extra Trees
ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test) # AdaBoost 
gb_oof_train, gb_oof_test = get_oof(gb, x_train, y_train, x_test) # Gradient Boost

#knn_oof_train, knn_oof_test = get_oof(knn, x_train, y_train, x_test) # KNeighbors
#svc_oof_train, svc_oof_test = get_oof(svc, x_train, y_train, x_test) # SVM-l
#svcr_oof_train, svcr_oof_test = get_oof(svcr, x_train, y_train, x_test) # SVM-r
#bag_oof_train, bag_oof_test = get_oof(bag, x_train, y_train, x_test) # Bagging
#xgbc_oof_train, xgbc_oof_test = get_oof(xgbc, x_train, y_train, x_test) # XGBoost
#lda_oof_train, lda_oof_test = get_oof(lda, x_train, y_train, x_test) # Linear Discriminant Analysis
#qda1_oof_train, qda1_oof_test = get_oof(qda1, x_train, y_train, x_test) # Quadratic Discriminant Analysis
#qda2_oof_train, qda2_oof_test = get_oof(qda2, x_train, y_train, x_test) # Quadratic Discriminant Analysis
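
A quick sanity check (a sketch, not executed in the original notebook): each base model should contribute one out-of-fold column per training row and one fold-averaged column per held-out row.

print(dt_oof_train.shape, dt_oof_test.shape)  # expected: (840, 1) (109, 1)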

In [26]:
dt_features = dt.feature_importances(x_train,y_train)
rf_features = rf.feature_importances(x_train,y_train)
et_features = et.feature_importances(x_train, y_train)
ada_features = ada.feature_importances(x_train, y_train)
gb_features = gb.feature_importances(x_train,y_train)


[ 0.17911163  0.58069088  0.06811822  0.05558256  0.00189047  0.10959236
  0.00250524  0.          0.00250863]
[ 0.09551822  0.52275789  0.08480984  0.06164634  0.04406861  0.16157367
  0.01706225  0.0031498   0.00941338]
[ 0.15959402  0.65097313  0.03298211  0.03246773  0.0296189   0.04114259
  0.02961184  0.00493583  0.01867385]
[ 0.0175   0.01375  0.22375  0.01375  0.0275   0.69     0.01125  0.0025   0.     ]
[ 0.02887271  0.04132959  0.38014892  0.0306628   0.01713283  0.46225506
  0.00799253  0.01365403  0.01795153]

In [27]:
cols = X.columns.values
# Create a dataframe with features
feature_dataframe = pd.DataFrame( {'features': cols,
     'Decision Tree': dt_features,
     'Random Forest': rf_features,
     'Extra Trees': et_features,
     'AdaBoost': ada_features,
     'Gradient Boost': gb_features
    })

In [28]:
# Create the new column containing the average of values
feature_dataframe['mean'] = np.mean(feature_dataframe, axis= 1) # axis = 1 computes the mean row-wise
feature_dataframe


Out[28]:
AdaBoost Decision Tree Extra Trees Gradient Boost Random Forest features mean
0 0.01750 0.179112 0.159594 0.028873 0.095518 pclass 0.096119
1 0.01375 0.580691 0.650973 0.041330 0.522758 sex 0.361900
2 0.22375 0.068118 0.032982 0.380149 0.084810 age 0.157962
3 0.01375 0.055583 0.032468 0.030663 0.061646 sibsp 0.038822
4 0.02750 0.001890 0.029619 0.017133 0.044069 parch 0.024042
5 0.69000 0.109592 0.041143 0.462255 0.161574 fare 0.292913
6 0.01125 0.002505 0.029612 0.007993 0.017062 embarked_C 0.013684
7 0.00250 0.000000 0.004936 0.013654 0.003150 embarked_Q 0.004848
8 0.00000 0.002509 0.018674 0.017952 0.009413 embarked_S 0.009709
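
As an optional visualization (not part of the original run), the averaged importances can be plotted straight from the DataFrame:

feature_dataframe.plot.bar(x='features', y='mean', figsize=(10, 6), legend=False)
plt.ylabel('Mean feature importance')
plt.show()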

First-Level Summary


In [29]:
#First-level output as new features
base_predictions_train = pd.DataFrame({
    'DecisionTree': dt_oof_train.ravel(),
    'RandomForest': rf_oof_train.ravel(),
    'ExtraTrees': et_oof_train.ravel(),
    'AdaBoost': ada_oof_train.ravel(),
    'GradientBoost': gb_oof_train.ravel(),
    'type': y_train
    })
base_predictions_train.head()


Out[29]:
AdaBoost DecisionTree ExtraTrees GradientBoost RandomForest type
0 1.0 1.0 1.0 1.0 1.0 0
1 0.0 0.0 0.0 0.0 0.0 0
2 1.0 1.0 1.0 0.0 1.0 1
3 0.0 1.0 1.0 1.0 1.0 1
4 1.0 1.0 1.0 1.0 1.0 1
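
A common check at this stage, added here only as a sketch: how correlated the base models' out-of-fold predictions are. Less-correlated columns usually give the level-2 model more to work with.

corr = base_predictions_train.drop('type', axis=1).corr()
plt.figure(figsize=(8, 6))
plt.imshow(corr, cmap='viridis')
plt.colorbar()
plt.xticks(range(len(corr.columns)), corr.columns, rotation=45)
plt.yticks(range(len(corr.columns)), corr.columns)
plt.show()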

In [30]:
x_train = np.concatenate(( dt_oof_train, 
                           rf_oof_train,
                           et_oof_train, 
                           ada_oof_train, 
                           gb_oof_train
                         ), axis=1)
x_test = np.concatenate(( dt_oof_test,
                          rf_oof_test,
                          et_oof_test,
                          ada_oof_test, 
                          gb_oof_test
                        ), axis=1)

Second-Level Summary


In [31]:
#Second level learning model
import xgboost as xgb

l2_gbm = xgb.XGBClassifier(
         learning_rate = 0.05,
         n_estimators= 2000,
         max_depth= 4,
         gamma=0.9,
         subsample=0.8,
         colsample_bytree=0.8,
         objective= 'binary:logistic',
         nthread= -1
).fit(x_train, y_train)

In [32]:
#level-2 CV: x_train, y_train

from sklearn import metrics
print(metrics.classification_report(y_train, l2_gbm.predict(x_train)))

from sklearn.model_selection import KFold
cv = KFold(n_splits=5, random_state=None, shuffle=True)
estimator = l2_gbm
#plot_learning_curve(estimator, "level2 - XGBoost", x_train, y_train, cv=cv, train_sizes=np.linspace(0.2, 1.0, 8))


             precision    recall  f1-score   support

          0       0.83      0.89      0.86       502
          1       0.81      0.73      0.77       338

avg / total       0.82      0.82      0.82       840
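
The report above scores l2_gbm on the same x_train it was fitted on, so it is an optimistic estimate. A hedged cross-validated check on the level-2 features, reusing the cv splitter defined above:

from sklearn.model_selection import cross_val_score

scores = cross_val_score(l2_gbm, x_train, y_train, cv=cv)
print(scores.mean(), scores.std())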


In [33]:
#level2 - XGB
l2_gbm_pred = l2_gbm.predict(x_test)
metrics.precision_recall_fscore_support(y_train, l2_gbm.predict(x_train), average='weighted')
print(l2_gbm_pred)


[0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0
 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0
 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
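
Because the held-out block was carved from the labelled dataset (rows 1200 onward), its true labels are still available; a final hedged check, not in the original notebook, compares them with the level-2 predictions:

y_holdout = y[1200:].values
print(metrics.accuracy_score(y_holdout, l2_gbm_pred))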