bagged_nolearn


In [1]:
from __future__      import division
from IPython.display import display
from matplotlib      import pyplot as plt
%matplotlib inline

import numpy  as np
import pandas as pd
import random, sys, os, re

from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search      import RandomizedSearchCV, GridSearchCV
from sklearn.cross_validation import cross_val_predict, permutation_test_score

In [2]:
SEED   = 1091
scale  = True 
minmax = False
norm   = False
nointercept = True
engineering = False

max_epochs = 100
N_CLASSES  = 2

submission_filename = "../submissions/submission_bagged_nolearnSEED1091.csv"

In [3]:
global early_stop_type
early_stop_type = 'unset'

In [4]:
import theano
from theano import tensor as T

import lasagne
from lasagne                import layers, nonlinearities
from lasagne.nonlinearities import rectify, softmax, sigmoid
from lasagne.updates        import nesterov_momentum, adam, adagrad, rmsprop
from lasagne.layers         import InputLayer, DenseLayer, DropoutLayer
from lasagne.layers         import get_all_params

try:
    from lasagne.layers.cuda_convnet import Conv2DCCLayer    as Conv2DLayer
    from lasagne.layers.cuda_convnet import MaxPool2DCCLayer as MaxPool2DLayer
    from lasagne.layers             import Pool2DLayer
    print("Conv2DCCLayer and MaxPool2DCCLayer used")
    DCC = True
except ImportError:
    print("ImportError: Conv2DLayer and MaxPool2DLayer used")
    DCC = False
    Conv2DLayer    = lasagne.layers.Conv2DLayer
    MaxPool2DLayer = lasagne.layers.MaxPool2DLayer
    Pool2DLayer    = lasagne.layers.Pool2DLayer


ImportError: Conv2DLayer and MaxPool2DLayer used

In [5]:
from nolearn.lasagne           import NeuralNet, TrainSplit, BatchIterator
from nolearn.lasagne           import objective
from nolearn.lasagne           import PrintLayerInfo

from nolearn.lasagne.visualize import plot_loss
from nolearn.lasagne.visualize import plot_conv_weights
from nolearn.lasagne.visualize import plot_conv_activity
from nolearn.lasagne.visualize import plot_occlusion

In [6]:
print theano.config.device


cpu

Load the training data


In [7]:
from load_blood_data import load_blood_data

y_train, X_train = load_blood_data(train=True, SEED   = SEED, 
                                               scale  = scale,
                                               minmax = minmax,
                                               norm   = norm,
                                               nointercept = nointercept,
                                               engineering = engineering)

Fit the model


In [8]:
StratifiedCV = StratifiedKFold(y            = y_train, 
                               n_folds      = 10, 
                               shuffle      = True, 
                               random_state = SEED)

In [9]:
"""
[Using convolutional neural nets to detect facial keypoints; Daniel Nouri]
"""

def float32(k):
    """
    Cast a variable for inclusion in a theano.shared variable
    for subsequent use by AdjustVariable
    """
    return np.cast['float32'](k)

class AdjustVariable(object):
    """
    Change the 'name' parameter each epoch
    on a linspace(start, stop) schedule
    """
    def __init__(self, name, start=0.03, stop=0.001):
        self.name = name
        self.start, self.stop = start, stop
        self.ls = None

    def __call__(self, nn, train_history):
        if self.ls is None:
            self.ls = np.linspace(self.start, self.stop, nn.max_epochs)

        epoch = train_history[-1]['epoch']
        new_value = float32(self.ls[epoch - 1])
        getattr(nn, self.name).set_value(new_value)
        
class EarlyStopping_loss(object):
    """
    Stop the training process if the validation loss has not declined
    for 'patience' epochs.
    The model returned has the weights from the epoch with the lowest validation loss.
    """
    def __init__(self, patience=100):
        self.patience         = patience
        self.best_valid       = np.inf
        self.best_valid_epoch = 0
        self.best_weights     = None
        global early_stop_type
        early_stop_type = 'loss'
        

    def __call__(self, nn, train_history):
        current_valid = train_history[-1]['valid_loss']
        current_epoch = train_history[-1]['epoch']
        if current_valid < self.best_valid:
            self.best_valid = current_valid
            self.best_valid_epoch = current_epoch
            self.best_weights = nn.get_all_params_values()
        elif self.best_valid_epoch + self.patience < current_epoch:
            print("\nEarly stopping.")
            print("Best valid loss was {:.6f} at epoch {}.\n".format(
                self.best_valid, self.best_valid_epoch))
            nn.load_params_from(self.best_weights)
            raise StopIteration()
            
class EarlyStopping_acc(object):
    """
    Stop the training process if the validation accuracy has not increased
    for 'patience' epochs.
    The model returned has the weights from the epoch with the highest validation accuracy.
    """
    def __init__(self, patience=100):
        self.patience         = patience
        self.best_valid_acc   = 0
        self.best_valid_epoch = 0
        self.best_weights     = None
        global early_stop_type
        early_stop_type = 'acc'
        

    def __call__(self, nn, train_history):
        current_valid_acc = train_history[-1]['valid_accuracy']
        current_epoch     = train_history[-1]['epoch']
        if current_valid_acc > self.best_valid_acc:
            self.best_valid_acc   = current_valid_acc
            self.best_valid_epoch = current_epoch
            self.best_weights     = nn.get_all_params_values()
        elif self.best_valid_epoch + self.patience < current_epoch:
            print("\nEarly stopping.")
            print("Best valid accuracy was {:.6f} at epoch {}.\n".format(
                self.best_valid_acc, self.best_valid_epoch))
            nn.load_params_from(self.best_weights)
            raise StopIteration()
            
def elastic_net(layers, lambdaL1=0., lambdaL2=0., *args, **kwargs):
    losses = objective(layers, *args, **kwargs)
    # get the layers' weights, but not the biases
    weights = get_all_params(layers[-1], regularizable=True)
    # sum of absolute weights for L1
    sum_abs_weights = sum([abs(w).sum() for w in weights])
    # sum of squared weights for L2
    sum_squared_weights = sum([(w ** 2).sum() for w in weights])
    # add weights to regular loss
    losses += lambdaL1 * sum_abs_weights + lambdaL2 * sum_squared_weights
    return losses

class FlipBatchIterator(BatchIterator):

    def transform(self, Xb, yb):
        Xb, yb = super(FlipBatchIterator, self).transform(Xb, yb)

        # Flip half of the images in this batch at random:
        bs = Xb.shape[0]
        indices = np.random.choice(bs, bs // 2, replace=False)
        Xb[indices] = Xb[indices, :, :, ::-1]

        return Xb, yb
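
A small illustration (not part of the original run): the schedule AdjustVariable steps through is simply a linspace from start to stop over max_epochs epochs. For the learning-rate settings used further below it looks like this:

In [ ]:
# Hypothetical peek at the decay schedule AdjustVariable will walk through
schedule = np.linspace(0.003, 0.0003, max_epochs)
print(schedule[:3])    # first few learning rates
print(schedule[-3:])   # last few learning rates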

In [10]:
"""
[Understanding the difficulty of training deep feedforward neural networks; Xavier Glorot, Yoshua Bengio]
"""

def xavierReLU(fan_in,fan_out):
    """
    Xavier Glorot and Yoshua Bengio recommended weight initialization
    for layers using a ReLU activation function
    
    W ~ U[ +/- sqrt(6 / (fan_in + fan_out)) ]
    """
    import numpy as np
    return np.sqrt(6. / (fan_in + fan_out))

def xavierTanH(fan_in,fan_out):
    """
    Xavier Glorot and Yoshua Bengio recommended weight initialization
    for layers using a tanh activation function
    
    W ~ U[ +/- 4 * sqrt(6 / (fan_in + fan_out)) ]
    """
    import numpy as np
    return 4. * np.sqrt(6. / (fan_in + fan_out))
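
As a quick sanity check of the formula (a sketch, assuming X_train from the cell above is in scope), the uniform ranges used for the 500-unit hidden layer below can be printed directly:

In [ ]:
# Hypothetical check of the Glorot uniform ranges for this problem's layer sizes
print(xavierReLU(X_train.shape[1], 500))   # range used for dense1_W below
print(xavierTanH(X_train.shape[1], 500))   # the 4x variant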

In [11]:
"""
["Bagging Predictors" Department of Statistics UC Berkeley 1994; Leo Breiman]
"""

class BaggedNolearn(object):
    """
    Fits the nolearn base estimator 'n_estimators' times, each time on a
    bootstrap sample of X_train (indices of size X_train.shape[0] drawn
    with replacement)
    
    Usage
    -----
    clf = BaggedNolearn(base_estimator, n_estimators)
    
        base_estimator is an instance of a nolearn object
        n_estimators   is the number of bagged replicas to use
        
    clf.fit(X_train, y_train)
    
        a no-op; merely stores X_train and y_train for use in the
        predict and predict_proba methods
        
    clf.predict(X_test)
    
        the value returned is the majority vote of each of the
        fitted base_estimators' class predictions
        
    clf.predict_proba(X_test)
    
        the value returned is the average of each of the
        fitted base_estimators' probabilities
    """
    def __init__(self, base_estimator, n_estimators=11):
        from copy import copy, deepcopy
        self.n_estimators   = n_estimators
        
        # Daniel Nouri didn't give me a __copy__ method
        # ---------------------------------------------
        # see 
        # http://stackoverflow.com/questions/1500718/ \
        # what-is-the-right-way-to-override-the-copy-deepcopy-operations-on-an-object-in-p
        def shallow_copy(self):
            cls = self.__class__
            result = cls.__new__(cls)
            result.__dict__.update(self.__dict__)
            return result

        def deep_copy(self, memo):
            cls = self.__class__
            result = cls.__new__(cls)
            memo[id(self)] = result
            for k, v in self.__dict__.items():
                setattr(result, k, deepcopy(v, memo))  
            return result

        base_estimator.__copy__     = shallow_copy
        base_estimator.__deepcopy__ = deep_copy
        
        self.base_estimator = base_estimator

        
    def fit(self, X_train, y_train):
        """
        The actual fitting is done in the predict & predict_proba methods
        """
        self.X_train = X_train
        self.y_train = y_train
        
            
    def predict(self, X_test):
        """
        Return the majority vote of 'n_estimators' fits
        """
        from copy import copy, deepcopy
        predictions = []
        for i in range(self.n_estimators):
            clf = copy(self.base_estimator)
            # this is required since it isn't reset by the copy operation
            clf.train_history_ = []
            idx = np.random.choice(a    = range(self.X_train.shape[0]), 
                                   size = self.X_train.shape[0], 
                                   replace=True, p=None)

            clf.fit(self.X_train[idx,:], self.y_train[idx], epochs=max_epochs)
            predictions.append(clf.predict(X_test))
            
        from scipy.stats import mode
        
        return mode(predictions)[0][0]
    
    def predict_proba(self, X_test):
        """
        Return the average probability matrix of 'n_estimators' fits
        """
        completed_bags = 0
        from copy import copy, deepcopy
        # accumulate probabilities here; start from zeros, not np.empty
        predictions = np.zeros((X_test.shape[0], N_CLASSES), dtype=np.float32)
        for i in range(self.n_estimators):
            clf = copy(self.base_estimator)
            # this is required since it isn't reset by the copy operation
            clf.train_history_ = []
            idx = np.random.choice(a    = range(self.X_train.shape[0]), 
                                   size = self.X_train.shape[0], 
                                   replace=True, p=None)
            
            clf.fit(self.X_train[idx,:], self.y_train[idx], epochs=max_epochs)
            predictions += clf.predict_proba(X_test)
            completed_bags += 1
            
        return predictions / float(completed_bags)
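
A toy illustration of the two aggregation rules (hypothetical votes and probabilities, not produced by the estimator above): predict takes the per-sample majority vote across estimators, while predict_proba averages the probability matrices.

In [ ]:
from scipy.stats import mode

# hypothetical class votes from 3 estimators on 4 samples
votes = np.array([[0, 1, 1, 0],
                  [0, 0, 1, 0],
                  [1, 1, 1, 0]])
print(mode(votes)[0][0])               # majority vote per sample -> [0 1 1 0]

# hypothetical probability matrices from 2 estimators on 2 samples
probs = [np.array([[0.8, 0.2], [0.3, 0.7]]),
         np.array([[0.6, 0.4], [0.5, 0.5]])]
print(sum(probs) / float(len(probs)))  # averaged probabilities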

In [12]:
%%time
"""
[Neural Networks and Deep Learning; Michael Nielsen ]
"""

random.seed(SEED)

# ----------------------------------- base estimator -------------------------------
clf_layers=[
            ('input',    InputLayer),
    
    
#             ('dropout1', DropoutLayer),
            ('dense1',   DenseLayer),
    
#             ('dropout2', DropoutLayer),
#             ('dense2',   DenseLayer),
    

            ('output',   DenseLayer),
            ]



base = NeuralNet(
    
    layers=clf_layers,
    
    regression=False,
    
    # --------------------------------------------------------------------------
    
    input_shape=(None, X_train.shape[1]), 
    
    # Fully Connected
    # ===============
#     dropout1_p=0.20,
    
    dense1_num_units    = 500,
    dense1_nonlinearity = lasagne.nonlinearities.LeakyRectify(leakiness=1./3.),
    dense1_W = lasagne.init.Uniform(range = xavierReLU(X_train.shape[1], 500)),
    dense1_b = lasagne.init.Constant(1.),
    
#     dropout2_p=0.02,
    
#     dense2_num_units    = 10,
#     dense2_nonlinearity = lasagne.nonlinearities.LeakyRectify(leakiness=1./3.),
#     dense2_W = lasagne.init.Uniform(range = xavierReLU(500, 10)),
#     dense2_b = lasagne.init.Constant(1.),
    

    # Output
    # ======
    output_num_units    = N_CLASSES, 
    output_nonlinearity = nonlinearities.softmax,
    output_W = lasagne.init.Normal(std=1./np.sqrt(500 + N_CLASSES), mean=0.0),
    output_b = lasagne.init.Constant(1.),
    
    # --------------------------------------------------------------------------
    
    # Loss Function
    # =============
    objective_loss_function = lasagne.objectives.categorical_crossentropy,
    
    # L1 & L2 Regularization
    # ======================
    objective          = elastic_net,
    objective_lambdaL1 = 0.0000,
    objective_lambdaL2 = 0.0000,
    
    # Update Function
    # ===============
    update               = lasagne.updates.momentum,
    update_learning_rate = theano.shared(float32(0.003)),
    update_momentum      = theano.shared(float32(0.9)),
    
    # Early Stopping
    # ==============
    on_epoch_finished=[
        AdjustVariable('update_learning_rate', start=0.003,  stop=0.0003),
        AdjustVariable('update_momentum',      start=0.9,    stop=0.99),
        
        EarlyStopping_loss(patience = 50)
        ],
    
    
    # Mini-Batch Size
    # ===============
    batch_iterator_train = BatchIterator(batch_size=1), 
    batch_iterator_test  = BatchIterator(batch_size=1),

    # Max Epochs
    # ==========
    max_epochs = max_epochs,
    
    verbose=0,
    train_split=TrainSplit(0.2, stratify=True)
    )

# ----------------------------------- BaggedNolearn -------------------------------
clf = BaggedNolearn(base_estimator = base, 
                    n_estimators   = 11)


CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 1.17 ms
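
Before training, the assembled layers can be inspected with nolearn's PrintLayerInfo (imported above but otherwise unused here); a quick sketch, assuming the net compiles on this machine:

In [ ]:
# Hypothetical inspection of the base estimator's layer shapes
base.initialize()
layer_info = PrintLayerInfo()
layer_info(base)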

In [13]:
clf.fit(X_train.values.astype(np.float32)[range(X_train.shape[0]),:], y_train)

Training set predictions


In [14]:
%%time

predicted_values = clf.predict(X_train.values.astype(np.float32))
y_true, y_pred   = y_train, predicted_values

predicted_probs  = clf.predict_proba(X_train.values.astype(np.float32))
donate_probs     = [prob[1] for prob in predicted_probs]


CPU times: user 1min 17s, sys: 140 ms, total: 1min 17s
Wall time: 1min 17s

In [15]:
print(y_pred[:10])
print(predicted_probs[:10])
for i in range(10):
    a,b = predicted_probs[i]
    print a+b,
print
print(donate_probs[:10])


[0 0 0 1 1 0 1 0 0 0]
[[ 0.74874151  0.25125855]
 [ 0.92206496  0.07793499]
 [ 0.68541878  0.31458119]
 [ 0.42154506  0.57845491]
 [ 0.45503062  0.54496938]
 [ 0.78299659  0.21700343]
 [ 0.43890858  0.56109142]
 [ 0.79450411  0.20549589]
 [ 0.86417872  0.13582134]
 [ 0.77164102  0.22835892]]
1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
[0.25125855, 0.077934988, 0.31458119, 0.57845491, 0.54496938, 0.21700343, 0.56109142, 0.20549589, 0.13582134, 0.22835892]

In [16]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true, y_pred, labels=None)
print cm

try:
    from sklearn_utilities import plot_confusion_matrix
except:
    import imp, os
    util = imp.load_source('sklearn_utilities', os.path.expanduser('~/Dropbox/Python/sklearn_utilities.py'))
    from sklearn_utilities import plot_confusion_matrix


plot_confusion_matrix(cm, ['Did not Donate','Donated'])

accuracy = round(np.trace(cm)/float(np.sum(cm)),4)
misclass = 1 - accuracy
print("Accuracy {}, mis-class rate {}".format(accuracy,misclass))


[[412  26]
 [ 89  49]]
Accuracy 0.8003, mis-class rate 0.1997

In [17]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss
from sklearn.metrics import f1_score

fpr, tpr, thresholds = roc_curve(y_true, y_pred, pos_label=None)


plt.figure(figsize=(10,6))
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr)

AUC = roc_auc_score(y_true, y_pred, average='macro')
plt.text(x=0.6,y=0.4,s="AUC         {:.4f}"\
         .format(AUC),
        fontsize=16)

plt.text(x=0.6,y=0.3,s="accuracy {:.2f}%"\
         .format(accuracy*100),
        fontsize=16)

logloss = log_loss(y_true, y_pred)
plt.text(x=0.6,y=0.2,s="LogLoss   {:.4f}"\
         .format(logloss),
        fontsize=16)

f1 = f1_score(y_true, y_pred)
plt.text(x=0.6,y=0.1,s="f1             {:.4f}"\
         .format(f1),
        fontsize=16)

plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.show()



In [18]:
# format for scores.csv file
import re
algo = re.search(r"submission_(.*?)\.csv", submission_filename).group(1)
print("{: <26} ,        ,   {:.4f} ,  {:.4f} , {:.4f} , {:.4f} ,  , "\
      .format(algo,accuracy,logloss,AUC,f1))


bagged_nolearnSEED1091     ,        ,   0.8003 ,  6.8958 , 0.6479 , 0.4601 ,  , 

Confirm that sklearn.metrics.log_loss conforms to the contest formula


In [19]:
def log_loss(y_true, y_pred):
    """
    logloss formula
    """
    import scipy
    import math
    
    assert len(y_true)==len(y_pred)
    
    n     = len(y_true)
    comps = []
    
    # clip y_pred to [epsilon, 1 - epsilon] to avoid log(0)
    epsilon = 1e-15
    y_pred = scipy.maximum(epsilon, y_pred)
    y_pred = scipy.minimum(1-epsilon, y_pred)
    
    for i in range(n):
        comp1 =    y_true[i]    * math.log(  y_pred[i],     math.e)
        comp2 = (1 - y_true[i]) * math.log((1 - y_pred[i]), math.e)
        comps.append(comp1+comp2)
    
    return round(-sum(comps)/float(n),4)

log_loss(y_true,y_pred)


Out[19]:
6.8958
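
The score is this large because the metric is being fed the hard 0/1 class predictions rather than the donation probabilities: with clipping at 1e-15, every misclassified row contributes -ln(1e-15), about 34.54, and the confusion matrix above shows 115 of the 576 training rows misclassified. A back-of-the-envelope check (not part of the original run):

In [ ]:
# (26 + 89) misclassified rows out of (412 + 26 + 89 + 49) = 576
round((115 / 576.) * -np.log(1e-15), 4)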

--------------------------------------------------------------------------------------------

Test Set Predictions

Re-fit with the full training set (skipped here: BaggedNolearn.fit only stores the training data, and the full training set was already passed to fit above)


In [20]:
# clf.fit(X_train.values.astype(np.float32)[range(X_train.shape[0]),:], y_train)

Read the test data


In [21]:
from load_blood_data import load_blood_data

X_test, IDs = load_blood_data(train=False, SEED   = SEED, 
                                           scale  = scale,
                                           minmax = minmax,
                                           norm   = norm,
                                           nointercept = nointercept,
                                           engineering = engineering)

Predict the test set with the fitted model


In [22]:
y_pred        = clf.predict(X_test.values.astype(np.float32))
y_pred_probs  = clf.predict_proba(X_test.values.astype(np.float32))
donate_probs  = [prob[1] for prob in y_pred_probs]


Early stopping.
Best valid loss was 0.356069 at epoch 5.


Early stopping.
Best valid loss was 0.356069 at epoch 5.


In [23]:
print(y_pred[:10])
print(y_pred_probs[:10])
for i in range(10):
    a,b = y_pred_probs[i]
    print a+b,
print
print(donate_probs[:10])


[0 0 0 0 1 1 0 0 0 0]
[[ 0.46867129  0.5313288 ]
 [ 0.89952683  0.10047319]
 [ 0.78125536  0.21874465]
 [ 0.69834393  0.30165613]
 [ 0.38270131  0.61729872]
 [ 0.24454579  0.75545412]
 [ 0.71210158  0.28789848]
 [ 0.89409262  0.10590737]
 [ 0.98916507  0.01083487]
 [ 0.93410969  0.06589027]]
1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
[0.5313288, 0.10047319, 0.21874465, 0.30165613, 0.61729872, 0.75545412, 0.28789848, 0.10590737, 0.010834872, 0.065890275]

Create the submission file


In [24]:
assert len(IDs)==len(donate_probs)

f = open(submission_filename, "w")

f.write(",Made Donation in March 2007\n")
for ID, prob in zip(IDs, donate_probs):
    f.write("{},{}\n".format(ID,prob))
    
f.close()
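
A quick read-back of the file just written (a sanity check, not part of the original run; assumes pandas parses the unnamed ID column as the index):

In [ ]:
pd.read_csv(submission_filename, index_col=0).head()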

In [ ]: