My thanks to
In [25]:
from __future__ import division
from IPython.display import display
from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import random, sys, os, re
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import RandomizedSearchCV, GridSearchCV
from sklearn.cross_validation import cross_val_predict, permutation_test_score
In [26]:
SEED = 97                     # random seed for reproducibility

# preprocessing / feature-engineering options passed to load_blood_data
scale       = False
minmax      = False
norm        = False
nointercept = True
engineering = True

N_CLASSES = 2                 # binary target: made a donation in March 2007 or not

submission_filename = "../submissions/submission_blending_ensemble.csv"
In [27]:
from load_blood_data import load_blood_data

y_train, X_train = load_blood_data(train=True, SEED=SEED,
                                   scale=scale,
                                   minmax=minmax,
                                   norm=norm,
                                   nointercept=nointercept,
                                   engineering=engineering)
In [28]:
from load_blood_data import load_blood_data

X_test, IDs = load_blood_data(train=False, SEED=SEED,
                              scale=scale,
                              minmax=minmax,
                              norm=norm,
                              nointercept=nointercept,
                              engineering=engineering)
In [29]:
StratifiedCV = StratifiedKFold(y=y_train,
                               n_folds=10,
                               shuffle=True,
                               random_state=SEED)
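Note that sklearn.cross_validation is the pre-0.18 module path and was removed in later scikit-learn releases. A minimal sketch of the equivalent folds with the newer sklearn.model_selection API is below; it assumes a more recent scikit-learn than the one this notebook actually uses.
# Sketch only: in scikit-learn 0.18+ StratifiedKFold lives in sklearn.model_selection,
# takes n_splits instead of n_folds, and the labels are passed to .split().
from sklearn.model_selection import StratifiedKFold as NewStratifiedKFold

new_cv = NewStratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
folds  = list(new_cv.split(X_train, y_train))   # list of (train_idx, test_idx) pairs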
In [30]:
%%time

random.seed(SEED)

X_train = X_train.values.astype(np.float32)
X_test  = X_test.values.astype(np.float32)

skf = list(StratifiedCV)

# base learners: popular non-linear choices
# GBM, RF, XT (KNN and NN would also be natural candidates)
clfs = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
        RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
        ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
        ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy', max_depth=7, max_features=None),
        GradientBoostingClassifier(learning_rate=0.05, subsample=0.50, max_depth=6, n_estimators=50),
        GradientBoostingClassifier(learning_rate=0.15, subsample=0.75, max_depth=1, n_estimators=175,
                                   loss='exponential')
       ]

print "Creating train and test sets for blending."

# out-of-fold predictions from each base learner become the features
# for the second-stage (blending) model
dataset_blend_train = np.zeros((X_train.shape[0], len(clfs)))
dataset_blend_test  = np.zeros((X_test.shape[0],  len(clfs)))

for j, clf in enumerate(clfs):
    print("\n {}, {}".format(j, clf))
    dataset_blend_test_j = np.zeros((X_test.shape[0], len(skf)))

    for i, (train, test) in enumerate(skf):
        print "Fold", i
        X_b_train = X_train[train]
        y_b_train = y_train[train]
        X_b_test  = X_train[test]
        y_b_test  = y_train[test]

        clf.fit(X_b_train, y_b_train)

        # out-of-fold probabilities for the training rows held out in this fold
        y_submission = clf.predict_proba(X_b_test)[:, 1]
        dataset_blend_train[test, j] = y_submission

        # test-set probabilities from the model fit on this fold's training rows
        dataset_blend_test_j[:, i] = clf.predict_proba(X_test)[:, 1]

    # average the per-fold test-set predictions for this base learner
    dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)
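At this point dataset_blend_train holds one column of out-of-fold probabilities per base learner, and dataset_blend_test holds the corresponding fold-averaged test-set probabilities. A quick sanity check, not part of the original notebook and assuming sklearn.metrics is available, would be to score each column against the training labels:
# Sketch only: per-model out-of-fold scores on the training labels.
from sklearn.metrics import log_loss, roc_auc_score

for j in range(dataset_blend_train.shape[1]):
    col = dataset_blend_train[:, j]
    print("model {}: log loss {:.4f}, AUC {:.4f}".format(
        j, log_loss(y_train, col), roc_auc_score(y_train, col)))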
In [31]:
%%time

print "Blending."

# second-stage model: a logistic regression fit on the
# out-of-fold predictions of the base learners
clf = LogisticRegression()
clf.fit(dataset_blend_train, y_train)
y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

print "Linear stretch of predictions to [0,1]\n"
y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())
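The stretch is a monotonic rescaling, so it maps the smallest prediction to 0 and the largest to 1 without changing the ranking of the predictions (rank-based metrics such as AUC are unaffected). A minimal sketch of the same transform as a reusable helper (the name is hypothetical, not part of the notebook):
def minmax_stretch(p):
    """Linearly rescale an array of scores to the [0, 1] interval."""
    p = np.asarray(p, dtype=np.float64)
    return (p - p.min()) / (p.max() - p.min())

# e.g. minmax_stretch([0.2, 0.5, 0.8]) -> array([0. , 0.5, 1. ])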
In [32]:
y_pred_probs = y_submission
print(y_pred_probs[:10])

donate_probs = [prob for prob in y_pred_probs]

print "Saving Results."
with open(submission_filename, "w") as f:
    f.write(",Made Donation in March 2007\n")
    for ID, prob in zip(IDs, donate_probs):
        f.write("{},{}\n".format(ID, prob))
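An equivalent way to write the same submission file with pandas, sketched here under the assumption that IDs and donate_probs are aligned row for row:
# Sketch only: pandas alternative to the manual file writing above.
submission = pd.DataFrame({"Made Donation in March 2007": donate_probs}, index=IDs)
submission.index.name = ""          # the submission format has an unnamed ID column
submission.to_csv(submission_filename)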