5-fold stacking

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import warnings

In [3]:
# Scikit-Learn 官網作圖函式

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure(figsize=(10,6))  #調整作圖大小
    if ylim is not None:
    plt.xlabel("Training examples")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    return plt

In [4]:
# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None, seed_flag=False):
        params['random_state'] = seed
        if(seed_flag == False):
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    def fit(self,x,y):
        return self.clf.fit(x,y)

    def feature_importances(self,x,y):
        return self.clf.fit(x,y).feature_importances_

In [5]:
#Out-of-Fold Predictions

def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf): # kf:KFold(ntrain, n_folds= NFOLDS,...)
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te) # partial index from x_train
        oof_test_skf[i, :] = clf.predict(x_test) # Row(n-Fold), Column(predict value)

    oof_test[:] = np.mean(oof_test_skf, axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1) #make sure return n-rows, 1-column shape.


In [6]:
import urllib.request
import os

In [7]:
if not os.path.isfile(filepath):

使用Pandas dataframe讀取資料並進行處理

In [8]:
import numpy
import pandas as pd

In [9]:
all_df = pd.read_excel(filepath)

In [10]:

pclass survived name sex age sibsp parch ticket fare cabin embarked boat body home.dest
0 1 1 Allen, Miss. Elisabeth Walton female 29.0000 0 0 24160 211.3375 B5 S 2 NaN St Louis, MO
1 1 1 Allison, Master. Hudson Trevor male 0.9167 1 2 113781 151.5500 C22 C26 S 11 NaN Montreal, PQ / Chesterville, ON

In [11]:
cols=['survived','name','pclass' ,'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']

In [12]:

survived name pclass sex age sibsp parch fare embarked
0 1 Allen, Miss. Elisabeth Walton 1 female 29.0000 0 0 211.3375 S
1 1 Allison, Master. Hudson Trevor 1 male 0.9167 1 2 151.5500 S

In [13]:

survived      0
name          0
pclass        0
sex           0
age         263
sibsp         0
parch         0
fare          1
embarked      2
dtype: int64

In [14]:
df=all_df.drop(['name'], axis=1)
age_mean = df['age'].mean()
df['age'] = df['age'].fillna(age_mean)
fare_mean = df['fare'].mean()
df['fare'] = df['fare'].fillna(fare_mean)
df['sex']= df['sex'].map({'female':0, 'male': 1}).astype(int)

In [15]:

survived pclass sex age sibsp parch fare embarked
0 1 1 0 29.0000 0 0 211.3375 S
1 1 1 1 0.9167 1 2 151.5500 S

In [16]:
x_OneHot_df = pd.get_dummies(data=df,columns=["embarked" ])

survived pclass sex age sibsp parch fare embarked_C embarked_Q embarked_S
0 1 1 0 29.0000 0 0 211.3375 0 0 1
1 1 1 1 0.9167 1 2 151.5500 0 0 1


In [17]:
from sklearn.model_selection import train_test_split

X = x_OneHot_df[['pclass','sex','age','sibsp','parch','fare','embarked_C','embarked_Q','embarked_S']]
y = x_OneHot_df['survived']
test = X[1200:]

X_train, X_test, y_train, y_test = train_test_split(X[:1200], y[:1200], test_size = 0.3, random_state=100)

In [18]:
len(X), len(y), len(all_df), len(test)

(1309, 1309, 1309, 109)

In [19]:
len(X_train), len(X_test), len(y_train), len(y_test)

(840, 360, 840, 360)

In [20]:
# StandardScaler - subtract the mean and divide by std
# MaxAbsScaler - transform down to [-1, 1] bounds
# QuantileTransformer - transform down to [0 1] bounds

from sklearn.preprocessing import StandardScaler #, MaxAbsScaler, QuantileTransformer
sc = StandardScaler() #MaxAbsScaler()

X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
test_std = sc.transform(test)

Model build

In [21]:
from sklearn.cross_validation import KFold

NFOLDS = 5 # set folds for out-of-fold prediction
SEED = 0 # for reproducibility

ntrain = X_train_std.shape[0] # X.shape[0]
ntest = test_std.shape[0] # test.shape[0]

kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)

In [22]:
# Put in our parameters for said classifiers

# Decision Tree
dt_params = {

knn_params = {

# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    'criterion': 'gini',
    'max_depth': 4,
    #'min_samples_leaf': 2,
    'warm_start': True,
    'oob_score': True,
    'verbose': 0

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators': 800,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'verbose': 0

# AdaBoost parameters
ada_params = {
    'n_estimators': 800,
    'learning_rate' : 0.75

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0

# Support Vector Classifier parameters 
svc_params = {
    'kernel' : 'linear',
    'C' : 1.0,
    'probability': True

# Support Vector Classifier parameters 
svcr_params = {
    'kernel' : 'rbf',
    'C' : 1.0,
    'probability': True

# Bagging Classifier
bag_params = {
    'n_estimators' : 500,
    'oob_score': True

#XGBoost Classifier
xgbc_params = {
    'n_estimators': 500,
    'max_depth': 4,
    'learning_rate': 0.05,
    'nthread': -1

#Linear Discriminant Analysis
lda_params = {}

#Quadratic Discriminant Analysis
qda1_params = {
    'reg_param': 0.8,
    'tol': 0.00001

#Quadratic Discriminant Analysis
qda2_params = {
    'reg_param': 0.6,
    'tol': 0.0001

In [23]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

dt = SklearnHelper(clf=DecisionTreeClassifier, seed=SEED, params=dt_params, seed_flag=True)
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params, seed_flag=True)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params, seed_flag=True)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params, seed_flag=True)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params, seed_flag=True)

#knn = SklearnHelper(clf=KNeighborsClassifier, seed=SEED, params=knn_params)
#svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params, seed_flag=True)
#svcr = SklearnHelper(clf=SVC, seed=SEED, params=svcr_params, seed_flag=True)
#bag = SklearnHelper(clf=BaggingClassifier, seed=SEED, params=bag_params, seed_flag=True)
#xgbc = SklearnHelper(clf=XGBClassifier, seed=SEED, params=xgbc_params)
#lda = SklearnHelper(clf=LinearDiscriminantAnalysis, seed=SEED, params=lda_params)
#qda1 = SklearnHelper(clf=QuadraticDiscriminantAnalysis, seed=SEED, params=qda1_params)
#qda2 = SklearnHelper(clf=QuadraticDiscriminantAnalysis, seed=SEED, params=qda2_params)

In [24]:
# Create Numpy arrays of train, test and target ( Survived) dataframes to feed into our models
y_train = y_train.ravel() 

#x_train = X.values # Creates an array of the train data
#x_test = test.values # Creats an array of the test data

#STD dataset:
x_train = X_train_std
x_test = test_std

In [25]:
# Create our OOF train and test predictions. These base results will be used as new features
dt_oof_train, dt_oof_test = get_oof(dt, x_train, y_train, x_test) # Decision Tree
rf_oof_train, rf_oof_test = get_oof(rf, x_train, y_train, x_test) # Random Forest
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test) # Extra Trees
ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test) # AdaBoost 
gb_oof_train, gb_oof_test = get_oof(gb, x_train, y_train, x_test) # Gradient Boost

#knn_oof_train, knn_oof_test = get_oof(knn, x_train, y_train, x_test) # KNeighbors
#svc_oof_train, svc_oof_test = get_oof(svc, x_train, y_train, x_test) # SVM-l
#svcr_oof_train, svcr_oof_test = get_oof(svcr, x_train, y_train, x_test) # SVM-r
#bag_oof_train, bag_oof_test = get_oof(bag, x_train, y_train, x_test) # Bagging
#xgbc_oof_train, xgbc_oof_test = get_oof(xgbc, x_train, y_train, x_test) # XGBoost
#lda_oof_train, lda_oof_test = get_oof(lda, x_train, y_train, x_test) # Linear Discriminant Analysis
#qda1_oof_train, qda1_oof_test = get_oof(qda1, x_train, y_train, x_test) # Quadratic Discriminant Analysis
#qda2_oof_train, qda2_oof_test = get_oof(qda2, x_train, y_train, x_test) # Quadratic Discriminant Analysis

In [26]:
dt_features = dt.feature_importances(x_train,y_train)
rf_features = rf.feature_importances(x_train,y_train)
et_features = et.feature_importances(x_train, y_train)
ada_features = ada.feature_importances(x_train, y_train)
gb_features = gb.feature_importances(x_train,y_train)

[ 0.17911163  0.58069088  0.06811822  0.05558256  0.00189047  0.10959236
  0.00250524  0.          0.00250863]
[ 0.09551822  0.52275789  0.08480984  0.06164634  0.04406861  0.16157367
  0.01706225  0.0031498   0.00941338]
[ 0.15959402  0.65097313  0.03298211  0.03246773  0.0296189   0.04114259
  0.02961184  0.00493583  0.01867385]
[ 0.0175   0.01375  0.22375  0.01375  0.0275   0.69     0.01125  0.0025   0.     ]
[ 0.02887271  0.04132959  0.38014892  0.0306628   0.01713283  0.46225506
  0.00799253  0.01365403  0.01795153]

In [27]:
cols = X.columns.values
# Create a dataframe with features
feature_dataframe = pd.DataFrame( {'features': cols,
     'Decision Tree': dt_features,
     'Random Forest': rf_features,
     'Extra Trees': et_features,
     'AdaBoost': ada_features,
     'Gradient Boost': gb_features

In [28]:
# Create the new column containing the average of values
feature_dataframe['mean'] = np.mean(feature_dataframe, axis= 1) # axis = 1 computes the mean row-wise

AdaBoost Decision Tree Extra Trees Gradient Boost Random Forest features mean
0 0.01750 0.179112 0.159594 0.028873 0.095518 pclass 0.096119
1 0.01375 0.580691 0.650973 0.041330 0.522758 sex 0.361900
2 0.22375 0.068118 0.032982 0.380149 0.084810 age 0.157962
3 0.01375 0.055583 0.032468 0.030663 0.061646 sibsp 0.038822
4 0.02750 0.001890 0.029619 0.017133 0.044069 parch 0.024042
5 0.69000 0.109592 0.041143 0.462255 0.161574 fare 0.292913
6 0.01125 0.002505 0.029612 0.007993 0.017062 embarked_C 0.013684
7 0.00250 0.000000 0.004936 0.013654 0.003150 embarked_Q 0.004848
8 0.00000 0.002509 0.018674 0.017952 0.009413 embarked_S 0.009709

First-Level Summary

In [29]:
#First-level output as new features
base_predictions_train = pd.DataFrame({
    'DecisionTree': dt_oof_train.ravel(),
    'RandomForest': rf_oof_train.ravel(),
    'ExtraTrees': et_oof_train.ravel(),
    'AdaBoost': ada_oof_train.ravel(),
    'GradientBoost': gb_oof_train.ravel(),
    'type': y_train

AdaBoost DecisionTree ExtraTrees GradientBoost RandomForest type
0 1.0 1.0 1.0 1.0 1.0 0
1 0.0 0.0 0.0 0.0 0.0 0
2 1.0 1.0 1.0 0.0 1.0 1
3 0.0 1.0 1.0 1.0 1.0 1
4 1.0 1.0 1.0 1.0 1.0 1

In [30]:
x_train = np.concatenate(( dt_oof_train, 
                         ), axis=1)
x_test = np.concatenate(( dt_oof_test,
                        ), axis=1)

Second Level Summary

In [31]:
#Second level learning model
import xgboost as xgb

l2_gbm = xgb.XGBClassifier(
         learning_rate = 0.05,
         n_estimators= 2000,
         max_depth= 4,
         objective= 'binary:logistic',
         nthread= -1
).fit(x_train, y_train)

In [32]:
#level-2 CV: x_train, y_train

from sklearn import metrics
print(metrics.classification_report(y_train, l2_gbm.predict(x_train)))

from sklearn.model_selection import KFold
cv = KFold(n_splits=5, random_state=None, shuffle=True)
estimator = l2_gbm
#plot_learning_curve(estimator, "level2 - XGBoost", x_train, y_train, cv=cv, train_sizes=np.linspace(0.2, 1.0, 8))

             precision    recall  f1-score   support

          0       0.83      0.89      0.86       502
          1       0.81      0.73      0.77       338

avg / total       0.82      0.82      0.82       840

In [33]:
#level2 - XGB
l2_gbm_pred = l2_gbm.predict(x_test)
metrics.precision_recall_fscore_support(y_train, l2_gbm.predict(x_train), average='weighted')

[0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0
 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0
 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]