Blending


In [25]:
from __future__ import division
from IPython.display import display
from matplotlib import pyplot as plt
%matplotlib inline

import numpy  as np
import pandas as pd
import random, sys, os, re

from sklearn.ensemble         import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model     import LogisticRegression

from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search      import RandomizedSearchCV, GridSearchCV
from sklearn.cross_validation import cross_val_predict, permutation_test_score
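
A note on the imports above: sklearn.cross_validation and sklearn.grid_search are the pre-0.18 module names; on scikit-learn 0.18+ the same classes live in sklearn.model_selection (with a changed StratifiedKFold constructor, noted below). A sketch of the equivalent modern imports, should you run this notebook on a newer version:

In [ ]:
from sklearn.model_selection import (StratifiedKFold, RandomizedSearchCV,
                                     GridSearchCV, cross_val_predict,
                                     permutation_test_score)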

In [26]:
SEED   = 97
scale  = False 
minmax = False
norm   = False
nointercept = True
engineering = True

N_CLASSES = 2

submission_filename = "../submissions/submission_blending_ensemble.csv"

Load the training data


In [27]:
from load_blood_data import load_blood_data

y_train, X_train = load_blood_data(train=True, SEED   = SEED, 
                                               scale  = scale,
                                               minmax = minmax,
                                               norm   = norm,
                                               nointercept = nointercept,
                                               engineering = engineering)

Load the test data


In [28]:
from load_blood_data import load_blood_data

X_test, IDs = load_blood_data(train=False, SEED   = SEED, 
                                           scale  = scale,
                                           minmax = minmax,
                                           norm   = norm,
                                           nointercept = nointercept,
                                           engineering = engineering)

Fit the model


In [29]:
StratifiedCV = StratifiedKFold(y            = y_train, 
                               n_folds      = 10, 
                               shuffle      = True, 
                               random_state = SEED)
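
StratifiedKFold yields (train, test) index pairs that preserve the class balance of y_train in every fold. A quick sanity check of the folds (same pre-0.18 API as above):

In [ ]:
folds = list(StratifiedCV)
print len(folds)                        # 10 folds
print sum(len(te) for _, te in folds)   # every training row held out exactly once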

In [30]:
%%time

random.seed(SEED)

X_train = X_train.values.astype(np.float32)
X_test  = X_test.values.astype(np.float32)

skf = list(StratifiedCV)   # freeze the fold indices so every base model sees identical splits

# popular non-linear base learners: GBM, RF, and extra trees
# (KNN or a neural net would be natural additions)

clfs = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
        RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
        ExtraTreesClassifier(n_estimators=100,   n_jobs=-1, criterion='gini'),
        ExtraTreesClassifier(n_estimators=10,    n_jobs=-1, criterion='entropy', max_depth=7, max_features=None),
        GradientBoostingClassifier(learning_rate=0.05, subsample=0.50, max_depth=6, n_estimators=50),
        GradientBoostingClassifier(learning_rate=0.15, subsample=0.75, max_depth=1, n_estimators=175,
                                   loss='exponential')
       ]

print "Creating train and test sets for blending."

dataset_blend_train = np.zeros((X_train.shape[0], len(clfs)))
dataset_blend_test  = np.zeros((X_test.shape[0],  len(clfs)))

for j, clf in enumerate(clfs):
    print("\n {}, {}".format(j, clf))
    dataset_blend_test_j = np.zeros((X_test.shape[0], len(skf)))
    
    for i, (train, test) in enumerate(skf):
        print "Fold", i
        X_b_train = X_train[train]
        y_b_train = y_train[train]
        X_b_test  = X_train[test]
        y_b_test  = y_train[test]   # not used below; kept for symmetry
        
        clf.fit(X_b_train, y_b_train)
        
        # out-of-fold P(class 1): each training row is scored by a model
        # that never saw it, so the level-1 features are leakage-free
        y_submission = clf.predict_proba(X_b_test)[:,1]
        dataset_blend_train[test, j] = y_submission
        
        # this fold's model also scores the real test set; averaged below
        dataset_blend_test_j[:, i]   = clf.predict_proba(X_test)[:,1]
        
    # average the per-fold test-set predictions into one column per base model
    dataset_blend_test[:,j] = dataset_blend_test_j.mean(1)


Creating train and test sets for blending.

 0, RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9

 1, RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9

 2, ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9

 3, ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=7, max_features=None, max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9

 4, GradientBoostingClassifier(init=None, learning_rate=0.05, loss='deviance',
              max_depth=6, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              presort='auto', random_state=None, subsample=0.5, verbose=0,
              warm_start=False)
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9

 5, GradientBoostingClassifier(init=None, learning_rate=0.15, loss='exponential',
              max_depth=1, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=175,
              presort='auto', random_state=None, subsample=0.75, verbose=0,
              warm_start=False)
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9
CPU times: user 9.27 s, sys: 1.3 s, total: 10.6 s
Wall time: 19.8 s
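
For a single base model, the out-of-fold column built by the loop above can also be produced in one call with cross_val_predict; this needs scikit-learn 0.18+, where it accepts method='predict_proba' (not available in the older API this notebook imports). A minimal self-contained sketch with stand-in data:

In [ ]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_predict

X = np.random.rand(100, 4).astype(np.float32)   # stand-in feature matrix
y = np.random.randint(0, 2, size=100)           # stand-in binary labels

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=97)
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)

# one column of out-of-fold P(class 1) -- the analogue of a single
# j-column of dataset_blend_train above
oof = cross_val_predict(rf, X, y, cv=cv, method='predict_proba')[:, 1]
print(oof.shape)   # (100,)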

In [31]:
%%time

print "Blending."
# level-1 meta-model: a logistic regression fit on the six columns of
# out-of-fold base-model probabilities
clf = LogisticRegression()
clf.fit(dataset_blend_train, y_train)
y_submission = clf.predict_proba(dataset_blend_test)[:,1]

print "Linear stretch of predictions to [0,1]\n"
# rank-preserving min-max rescale: ranks (and hence AUC) are unchanged
y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())


Blending.
Linear stretch of predictions to [0,1]

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 1.14 ms
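
The stretch is a rank-preserving min-max rescale: the smallest blended probability maps to 0 and the largest to 1, with no reordering, so rank metrics such as AUC are unchanged. A toy illustration:

In [ ]:
p = np.array([0.2, 0.5, 0.9])
stretched = (p - p.min()) / (p.max() - p.min())
print stretched   # [ 0.          0.42857143  1.        ]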

Save the submission file


In [32]:
y_pred_probs  = y_submission
print(y_pred_probs[:10])
donate_probs  = list(y_pred_probs)

print "Saving Results."

with open(submission_filename, "w") as f:
    f.write(",Made Donation in March 2007\n")
    for ID, prob in zip(IDs, donate_probs):
        f.write("{},{}\n".format(ID, prob))


[ 0.70195683  0.17863846  0.09775343  0.26805966  0.78357102  1.
  0.33982939  0.02038799  0.03125441  0.01466595]
Saving Results.
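
An equivalent, tidier way to write the same file with pandas (a sketch, assuming IDs and donate_probs as built above; an unnamed index yields the same leading empty header cell):

In [ ]:
submission = pd.DataFrame({"Made Donation in March 2007": donate_probs},
                          index=IDs)
submission.to_csv(submission_filename)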
