In [1]:
from __future__ import division
from IPython.display import display
from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import random, sys, os, re
from sknn.mlp import Classifier, Layer
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import RandomizedSearchCV, GridSearchCV
from sklearn.cross_validation import cross_val_predict, permutation_test_score
In [23]:
SEED = 97
scale = True
minmax = False
norm = False
nointercept = True
engineering = True
N_CLASSES = 2
max_epochs = 100
submission_filename = "../submissions/submission_bagged_scikit_nn.csv"
In [3]:
from load_blood_data import load_blood_data
y_train, X_train = load_blood_data(train=True, SEED = SEED,
scale = scale,
minmax = minmax,
norm = norm,
nointercept = nointercept,
engineering = engineering)
In [4]:
StatifiedCV = StratifiedKFold(y = y_train,
n_folds = 10,
shuffle = True,
random_state = SEED)
In [31]:
"""
["Bagging Predictors" Department of Statistics UC Berkeley 1994; Leo Breiman]
"""
from sklearn.base import BaseEstimator, ClassifierMixin, clone
class BaggedScikitClassifier(BaseEstimator, ClassifierMixin):
"""
Runs a scikit-learn base estimator 'n_estimators' times, fitting with
the X_train indices drawn from a random sample of size X_train.shape[0]
with replacement
sklearn.ensemble.BaggingClassifier is more convenient and has more tunable
features but it requires the base estimator to have a 'fit' method with a
'sample_weight' parameter and many estimators do not
(as of sklearn.__version__ == '0.17').
Usage
-----
clf = BaggedScikitClassifier(base_estimator, n_estimators)
base_estimator is an instance of a scikit-learn object
n_estimators is the number of bagged replicas to use
clf.fit(X_train, y_train)
a no-op; merely stores X_train and y_train for use in the
predict and predict_proba methods
clf.predict(X_test)
the value returned is the majority vote of each of the
fitted base_estimators' class predictions
clf.predict_proba(X_test)
the value returned is the average of each of the
fitted base_estimators' probabilities
"""
def __init__(self, base_estimator, n_estimators=15):
self.n_estimators = n_estimators
self.base_estimator = base_estimator
def fit(self, X_train, y_train):
"""
The actual fitting is done in the predict & predict_proba methods
"""
self.X_train = X_train
self.y_train = y_train
return self
def predict(self, X_test):
"""
Return the majority vote of 'n_estimators' fits
"""
predictions = []
for i in range(self.n_estimators):
clf = clone(self.base_estimator)
idx = np.random.choice(a = range(self.X_train.shape[0]),
size = self.X_train.shape[0],
replace=True, p=None)
clf.fit(self.X_train[idx,:], self.y_train[idx])
predictions.append(clf.predict(X_test))
from scipy.stats import mode
return mode(predictions)[0][0]
def predict_proba(self, X_test):
"""
Return the average probability matrix of 'n_estimators' fits
"""
completed_bags = 0
predictions = np.empty((X_test.shape[0], N_CLASSES), dtype=np.float32)
for i in range(self.n_estimators):
clf = clone(self.base_estimator)
idx = np.random.choice(a = range(self.X_train.shape[0]),
size = self.X_train.shape[0],
replace=True, p=None)
clf.fit(self.X_train[idx,:], self.y_train[idx])
predictions += clf.predict_proba(X_test)
completed_bags += 1
return predictions / float(completed_bags)
In [32]:
%%time
random.seed(SEED)
# ----------------------------------- base estimator -------------------------------
nn_layers = [
Layer(type = 'Rectifier', name = 'hidden',
units = 100,
weight_decay = None,
pieces = None,
dropout = None),
Layer(type = 'Softmax', name = 'output')
]
base = Classifier(layers = nn_layers,
learning_rate = 0.01,
learning_rule = 'nesterov',
learning_momentum = 0.9,
loss_type = u'mse',
mutator = None, # data augmentation function
regularize = None,
weight_decay = None,
dropout_rate = None,
batch_size = 10,
valid_size = None,
valid_set = None,
n_stable = 10, # early stopping after ...
f_stable = 0.001, # validation error change threshold
n_iter = max_epochs,
random_state = SEED,
debug = False,
verbose = True)
# ----------------------------------- BaggedScikitClassifier -------------------------------
clf = BaggedScikitClassifier(base_estimator = base,
n_estimators = 15)
In [7]:
# from sklearn_utilities import GridSearchHeatmap
# GridSearchHeatmap(grid_clf, y_key='learning_rate', x_key='n_estimators')
# from sklearn_utilities import plot_validation_curves
# plot_validation_curves(grid_clf, param_grid, X_train, y_train, ylim = (0.0, 1.05))
In [8]:
%%time
try:
from sklearn_utilities import plot_learning_curve
except:
import imp, os
util = imp.load_source('sklearn_utilities', os.path.expanduser('~/Dropbox/Python/sklearn_utilities.py'))
from sklearn_utilities import plot_learning_curve
plot_learning_curve(estimator = clf,
title = 'Learning Curves',
X = X_train.values.astype(np.float32),
y = y_train,
ylim = (0.0, 1.10),
cv = StatifiedCV,
train_sizes = np.linspace(.1, 1.0, 5),
n_jobs = -1)
plt.show()
In [9]:
%%time
train_preds = cross_val_predict(estimator = clf,
X = X_train.values.astype(np.float32),
y = y_train,
cv = StatifiedCV,
n_jobs = -1,
verbose = 0,
fit_params = None,
pre_dispatch = '2*n_jobs')
y_true, y_pred = y_train, train_preds
In [10]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true, y_pred, labels=None)
print cm
try:
from sklearn_utilities import plot_confusion_matrix
except:
import imp, os
util = imp.load_source('sklearn_utilities', os.path.expanduser('~/Dropbox/Python/sklearn_utilities.py'))
from sklearn_utilities import plot_confusion_matrix
plot_confusion_matrix(cm, ['Did not Donate','Donated'])
accuracy = round(np.trace(cm)/float(np.sum(cm)),4)
misclass = 1 - accuracy
print("Accuracy {}, mis-class rate {}".format(accuracy,misclass))
In [11]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss
from sklearn.metrics import f1_score
fpr, tpr, thresholds = roc_curve(y_true, y_pred, pos_label=None)
plt.figure(figsize=(10,6))
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr)
AUC = roc_auc_score(y_true, y_pred, average='macro')
plt.text(x=0.6,y=0.4,s="AUC {:.4f}"\
.format(AUC),
fontsize=16)
plt.text(x=0.6,y=0.3,s="accuracy {:.2f}%"\
.format(accuracy*100),
fontsize=16)
logloss = log_loss(y_true, y_pred)
plt.text(x=0.6,y=0.2,s="LogLoss {:.4f}"\
.format(logloss),
fontsize=16)
f1 = f1_score(y_true, y_pred)
plt.text(x=0.6,y=0.1,s="f1 {:.4f}"\
.format(f1),
fontsize=16)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.show()
In [12]:
%%time
# score : The true score without permuting targets.
# permutation_scores : array, shape (n_permutations,)
# The scores obtained for each permutation.
# pvalue : The returned value equals
# > p-value if scoring returns bigger numbers for better scores (e.g., accuracy_score).
# > If scoring is a loss function (i.e. when lower is better such as with mean_squared_error)
# then this is actually the complement of the p-value: 1 - p-value.
score, permutation_scores, pvalue = permutation_test_score(estimator = clf,
X = X_train.values.astype(np.float32),
y = y_train,
cv = StatifiedCV,
labels = None,
random_state = SEED,
verbose = 0,
n_permutations = 100,
scoring = None,
n_jobs = -1)
# find mean and stdev of the scores
from scipy.stats import norm
mu, std = norm.fit(permutation_scores)
plt.figure(figsize=(20,8))
plt.hist(permutation_scores, 20, label='Permutation scores')
ylim = plt.ylim()
plt.plot(2 * [score], ylim, '--g', linewidth=3,
label='Classification Score (pvalue {:.4f})'.format(pvalue))
plt.plot(2 * [1. / N_CLASSES], ylim, 'r', linewidth=7, label='Luck')
plt.ylim(ylim)
plt.title('mu={:.4f}, std={:.4f}'.format(mu,std), fontsize=20)
plt.legend(loc='center',fontsize=16)
plt.xlabel('Score')
plt.show()
In [13]:
# format for scores.csv file
import re
algo = re.search(r"submission_(.*?)\.csv", submission_filename).group(1)
print("{: <26} , , {:.4f} , {:.4f} , {:.4f} , {:.4f} , {:.4f} , {:.4f}"\
.format(algo,accuracy,logloss,AUC,f1,mu,std))
In [14]:
# load the R extension
%load_ext rpy2.ipython
# see http://ipython.readthedocs.org/en/stable/config/extensions/index.html?highlight=rmagic
# see http://rpy.sourceforge.net/rpy2/doc-2.4/html/interactive.html#module-rpy2.ipython.rmagic
In [15]:
# Import python variables into R
%R -i accuracy,logloss,AUC,f1,mu,std
In [16]:
%%R
# read in the scores.csv file and perform a linear regression with it using this process's variables
score_data = read.csv('../input/scores.csv')
lm.fit = lm(leaderboard_score ~ accuracy + logloss + AUC + f1 + mu + std,
data = score_data,
na.action = na.omit)
slm.fit = step(lm.fit, direction = "both", trace=0)
predicted_leaderboard_score = predict(object = slm.fit,
newdata = data.frame(accuracy,logloss,AUC,f1,mu,std),
interval = "prediction", level = 0.99)
print(round(predicted_leaderboard_score,4))
In [33]:
#clf.set_params(**clf_params)
clf.fit(X_train.values.astype(np.float32), y_train)
Out[33]:
In [34]:
from load_blood_data import load_blood_data
X_test, IDs = load_blood_data(train=False, SEED = SEED,
scale = scale,
minmax = minmax,
norm = norm,
nointercept = nointercept,
engineering = engineering)
In [35]:
y_pred = clf.predict(X_test.values.astype(np.float32)).ravel()
print(y_pred[:10])
try:
y_pred_probs = clf.predict_proba(X_test.values.astype(np.float32))
print(y_pred_probs[:10])
donate_probs = [prob[1] for prob in y_pred_probs]
except Exception,e:
print(e)
donate_probs = [0.65 if x>0 else 1-0.65 for x in y_pred]
print(donate_probs[:10])
In [37]:
assert len(IDs)==len(donate_probs)
f = open(submission_filename, "w")
f.write(",Made Donation in March 2007\n")
for ID, prob in zip(IDs, donate_probs):
f.write("{},{}\n".format(ID,prob))
f.close()
In [ ]: