In [1]:
%matplotlib inline

In [2]:
"""
A deep neural network with or w/o dropout in one notebook.

To train the networks you just need to call the run() function
You need to run each cell before doing so.
To do so, you can go to "Cell -> Run All"
Feel free to edit individually each cell and rerun modified ones.

You can change global parameters here before running
"""

# Which experiment to run
MNIST = True
DIGITS = False
FACES = False
TWENTYNEWSGROUPS = False

BATCH_SIZE = 100  # default batch size
L2_LAMBDA = 1.    # default L2 regularization parameter
INIT_LR = 0.01    # initial learning rate, try making it larger

In [3]:
import numpy
import theano
import sys
import math
from theano import tensor as T
from theano import shared
from theano.tensor.shared_randomstreams import RandomStreams
from collections import OrderedDict

In [4]:
#Activation and helper functions

def relu_f(vec):
    """ Wrapper to quickly change the rectified linear unit function """
    return (vec + abs(vec)) / 2.


def softplus_f(v):
    return T.log(1 + T.exp(v))


def dropout(rng, x, p=0.5):
    """ Zero-out random values in x with probability p using rng """
    if p > 0. and p < 1.:
        seed = rng.randint(2 ** 30)
        srng = theano.tensor.shared_randomstreams.RandomStreams(seed)
        mask = srng.binomial(n=1, p=1.-p, size=x.shape,
                dtype=theano.config.floatX)
        return x * mask
    return x


def build_shared_zeros(shape, name):
    """ Builds a theano shared variable filled with a zeros numpy array """
    return shared(value=numpy.zeros(shape, dtype=theano.config.floatX),
            name=name, borrow=True)


class Linear(object):
    """ Basic linear transformation layer (W.X + b) """
    def __init__(self, rng, input, n_in, n_out, W=None, b=None):
        if W is None:
            W_values = numpy.asarray(rng.uniform(
                low=-numpy.sqrt(6. / (n_in + n_out)),
                high=numpy.sqrt(6. / (n_in + n_out)),
                size=(n_in, n_out)), dtype=theano.config.floatX)
            W_values *= 4  # This works for sigmoid activated networks!
            W = theano.shared(value=W_values, name='W', borrow=True)
        if b is None:
            b = build_shared_zeros((n_out,), 'b')
        self.input = input
        self.W = W
        self.b = b
        self.params = [self.W, self.b]
        self.output = T.dot(self.input, self.W) + self.b

    def __repr__(self):
        return "Linear"


class SigmoidLayer(Linear):
    """ Sigmoid activation layer (sigmoid(W.X + b)) """
    def __init__(self, rng, input, n_in, n_out, W=None, b=None):
        super(SigmoidLayer, self).__init__(rng, input, n_in, n_out, W, b)
        self.pre_activation = self.output
        self.output = T.nnet.sigmoid(self.pre_activation)


class ReLU(Linear):
    """ Rectified Linear Unit activation layer (max(0, W.X + b)) """
    def __init__(self, rng, input, n_in, n_out, W=None, b=None):
        if b is None:
            b = build_shared_zeros((n_out,), 'b')
        super(ReLU, self).__init__(rng, input, n_in, n_out, W, b)
        self.pre_activation = self.output
        self.output = relu_f(self.pre_activation)


class SoftPlus(Linear):
    def __init__(self, rng, input, n_in, n_out, W=None, b=None):
        if b is None:
            b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
            b = theano.shared(value=b_values, name='b', borrow=True)
        super(SoftPlus, self).__init__(rng, input, n_in, n_out, W, b)
        self.pre_activation = self.output
        self.output = softplus_f(self.pre_activation)


class DatasetMiniBatchIterator(object):
    """ Basic mini-batch iterator """
    def __init__(self, x, y, batch_size=BATCH_SIZE, randomize=False):
        self.x = x
        self.y = y
        self.batch_size = batch_size
        self.randomize = randomize
        from sklearn.utils import check_random_state
        self.rng = check_random_state(42)

    def __iter__(self):
        n_samples = self.x.shape[0]
        if self.randomize:
            for _ in xrange(n_samples / BATCH_SIZE):
                if BATCH_SIZE > 1:
                    i = int(self.rng.rand(1) * ((n_samples+BATCH_SIZE-1) / BATCH_SIZE))
                else:
                    i = int(math.floor(self.rng.rand(1) * n_samples))
                yield (i, self.x[i*self.batch_size:(i+1)*self.batch_size],
                       self.y[i*self.batch_size:(i+1)*self.batch_size])
        else:
            for i in xrange((n_samples + self.batch_size - 1)
                            / self.batch_size):
                yield (self.x[i*self.batch_size:(i+1)*self.batch_size],
                       self.y[i*self.batch_size:(i+1)*self.batch_size])

In [5]:
class LogisticRegression:
    """ _Multi-class_ Logistic Regression """
    def __init__(self, rng, input, n_in, n_out, W=None, b=None):
        if W != None:
            self.W = W
        else:
            self.W = build_shared_zeros((n_in, n_out), 'W')
        if b != None:
            self.b = b
        else:
            self.b = build_shared_zeros((n_out,), 'b')
        self.input = input
        self.p_y_given_x = T.nnet.softmax(T.dot(self.input, self.W) + self.b)
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)
        self.output = self.y_pred
        self.params = [self.W, self.b]

    def negative_log_likelihood(self, y):
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])

    def negative_log_likelihood_sum(self, y):
        return -T.sum(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])

    def training_cost(self, y):
        """ Wrapper for standard name """
        return self.negative_log_likelihood(y)

    def errors(self, y):
        if y.ndim != self.y_pred.ndim:
            raise TypeError("!!! 'y' should have the same shape as 'self.y_pred'",
                ("y", y.type, "y_pred", self.y_pred.type))
        if y.dtype.startswith('int'):
            return T.mean(T.neq(self.y_pred, y))
        else:
            print("!!! y should be of int type")
            return T.mean(T.neq(self.y_pred, numpy.asarray(y, dtype='int')))
        
        
#class PerceptronLoss: # TODO

In [12]:
class NeuralNet(object):
    """ Neural network (not regularized, without dropout) """
    def __init__(self, numpy_rng, theano_rng=None, 
                 n_ins=40*3,
                 layers_types=[ReLU, ReLU, ReLU, ReLU, LogisticRegression],
                 layers_sizes=[1024, 1024, 1024, 1024],
                 n_outs=62*3,
                 rho=0.95, eps=1.E-6,
                 momentum=0.9, step_adapt_alpha=1.E-4,
                 b1=0.1, b2=0.001,
                 debugprint=False):
        """
        Basic Neural Net class
        """
        self.layers = []
        self.params = []
        self.n_layers = len(layers_types)
        self.layers_types = layers_types
        assert self.n_layers > 0
        self._rho = rho  # ``momentum'' for adadelta (and discount/decay for RMSprop), alpha for Adam
        self._eps = eps  # epsilon for adadelta (and for RMSprop and Adam)
        self._momentum = momentum  # for RMSProp
        self._b1 = b1
        self._b2 = b2
        self._accugrads = []  # for adadelta
        self._accudeltas = []  # for adadelta
        self._avggrads = []  # for RMSprop in the Alex Graves' variant
        self._stepadapts = []  # for RMSprop with step adaptations
        self._stepadapt_alpha = step_adapt_alpha

        if theano_rng == None:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        self.x = T.fmatrix('x')
        self.y = T.ivector('y')
        
        self.layers_ins = [n_ins] + layers_sizes
        self.layers_outs = layers_sizes + [n_outs]
        
        layer_input = self.x
        
        for layer_type, n_in, n_out in zip(layers_types,
                self.layers_ins, self.layers_outs):
            this_layer = layer_type(rng=numpy_rng,
                    input=layer_input, n_in=n_in, n_out=n_out)
            assert hasattr(this_layer, 'output')
            self.params.extend(this_layer.params)
            self._accugrads.extend([build_shared_zeros(t.shape.eval(),
                'accugrad') for t in this_layer.params])
            self._accudeltas.extend([build_shared_zeros(t.shape.eval(),
                'accudelta') for t in this_layer.params])
            self._avggrads.extend([build_shared_zeros(t.shape.eval(),
                'avggrad') for t in this_layer.params])
            self._stepadapts.extend([shared(value=numpy.ones(t.shape.eval(),
                dtype=theano.config.floatX),
                name='stepadapt', borrow=True) for t in this_layer.params])
            self.layers.append(this_layer)
            layer_input = this_layer.output

        assert hasattr(self.layers[-1], 'training_cost')
        assert hasattr(self.layers[-1], 'errors')
        self.mean_cost = self.layers[-1].negative_log_likelihood(self.y)
        self.cost = self.layers[-1].training_cost(self.y)
        if debugprint:
            theano.printing.debugprint(self.cost)

        self.errors = self.layers[-1].errors(self.y)

    def __repr__(self):
        dimensions_layers_str = map(lambda x: "x".join(map(str, x)),
                                    zip(self.layers_ins, self.layers_outs))
        return "_".join(map(lambda x: "_".join((x[0].__name__, x[1])),
                            zip(self.layers_types, dimensions_layers_str)))


    def get_SGD_trainer(self):
        """ Returns a plain SGD minibatch trainer with learning rate as param. """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        learning_rate = T.fscalar('lr')  # learning rate
        gparams = T.grad(self.mean_cost, self.params)  # all the gradients
        updates = OrderedDict()
        for param, gparam in zip(self.params, gparams):
            updates[param] = param - gparam * learning_rate

        train_fn = theano.function(inputs=[theano.Param(batch_x),
                                           theano.Param(batch_y),
                                           theano.Param(learning_rate)],
                                   outputs=self.mean_cost,
                                   updates=updates,
                                   givens={self.x: batch_x, self.y: batch_y})

        return train_fn

    def get_adagrad_trainer(self):
        """ Returns an Adagrad (Duchi et al. 2010) trainer using a learning rate.
        """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        learning_rate = T.fscalar('lr')  # learning rate
        gparams = T.grad(self.mean_cost, self.params)  # all the gradients
        updates = OrderedDict()
        for accugrad, param, gparam in zip(self._accugrads, self.params, gparams):
            # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
            agrad = accugrad + gparam * gparam
            dx = - (learning_rate / T.sqrt(agrad + self._eps)) * gparam
            updates[param] = param + dx
            updates[accugrad] = agrad

        train_fn = theano.function(inputs=[theano.Param(batch_x), 
            theano.Param(batch_y),
            theano.Param(learning_rate)],
            outputs=self.mean_cost,
            updates=updates,
            givens={self.x: batch_x, self.y: batch_y})

        return train_fn

    def get_adadelta_trainer(self):
        """ Returns an Adadelta (Zeiler 2012) trainer using self._rho and
        self._eps params. """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        gparams = T.grad(self.mean_cost, self.params)
        updates = OrderedDict()
        for accugrad, accudelta, param, gparam in zip(self._accugrads,
                self._accudeltas, self.params, gparams):
            # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
            agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam
            dx = - T.sqrt((accudelta + self._eps)
                          / (agrad + self._eps)) * gparam
            updates[accudelta] = (self._rho * accudelta
                                  + (1 - self._rho) * dx * dx)
            updates[param] = param + dx
            updates[accugrad] = agrad

        train_fn = theano.function(inputs=[theano.Param(batch_x),
                                           theano.Param(batch_y)],
                                   outputs=self.mean_cost,
                                   updates=updates,
                                   givens={self.x: batch_x, self.y: batch_y})

        return train_fn

    def get_rmsprop_trainer(self, with_step_adapt=True, nesterov=False):  # TODO Nesterov momentum
        """ Returns an RmsProp (possibly Nesterov) (Sutskever 2013) trainer
        using self._rho, self._eps and self._momentum params. """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        learning_rate = T.fscalar('lr')  # learning rate
        gparams = T.grad(self.mean_cost, self.params)
        updates = OrderedDict()
        for accugrad, avggrad, accudelta, sa, param, gparam in zip(
                self._accugrads, self._avggrads, self._accudeltas,
                self._stepadapts, self.params, gparams):
            acc_grad = self._rho * accugrad + (1 - self._rho) * gparam * gparam
            avg_grad = self._rho * avggrad + (1 - self._rho) * gparam  # this decay/discount (self._rho) should differ from the one of the line above
            ###scaled_grad = gparam / T.sqrt(acc_grad + self._eps)  # original RMSprop gradient scaling
            scaled_grad = gparam / T.sqrt(acc_grad - avg_grad**2 + self._eps)  # Alex Graves' RMSprop variant (divide by a "running stddev" of the updates)
            if with_step_adapt:
                incr = sa * (1. + self._stepadapt_alpha)
                #decr = sa * (1. - self._stepadapt_alpha)
                decr = sa * (1. - 2*self._stepadapt_alpha)
                ###steps = sa * T.switch(accudelta * -gparam >= 0, incr, decr)
                steps = T.clip(T.switch(accudelta * -gparam >= 0, incr, decr), self._eps, 1./self._eps)  # bad overloading of self._eps!
                scaled_grad = steps * scaled_grad
                updates[sa] = steps
            dx = self._momentum * accudelta - learning_rate * scaled_grad
            updates[param] = param + dx
            updates[accugrad] = acc_grad
            updates[avggrad] = avg_grad
            updates[accudelta] = dx

        train_fn = theano.function(inputs=[theano.Param(batch_x),
                                           theano.Param(batch_y),
                                           theano.Param(learning_rate)],
                                   outputs=self.mean_cost,
                                   updates=updates,
                                   givens={self.x: batch_x, self.y: batch_y})

        return train_fn
    
    def get_adam_trainer(self):
        """ Returns an Adam trainer as described in 
        ADAM: A METHOD FOR STOCHASTIC OPTIMIZATION, Kingma and Ba 2015 """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        self._i_t = shared(numpy.float32(0.))
        self.lr = 0.0002
        i_t = self._i_t + 1.
        fix1 = 1. - (1. - self._b1)**i_t
        fix2 = 1. - (1. - self._b2)**i_t
        lr_t = self.lr * (T.sqrt(fix2) / fix1)
        gparams = T.grad(self.mean_cost, self.params)
        updates = OrderedDict()
        for accugrad, accudelta, param, gparam in zip(self._accugrads, self._accudeltas, self.params, gparams):
            m_t = (self._b1 * gparam) + ((1. - self._b1) * accugrad)
            v_t = (self._b2 * T.sqr(gparam)) + ((1. - self._b2) * accudelta)
            #m_t_h = m_t / (1 - (1-self._b1)**i_t)
            #v_t_h = v_t / (1 - (1-self._b2)**i_t)
            #g_t = m_t_h / (T.sqrt(v_t_h) + self._eps)
            g_t = m_t / (T.sqrt(v_t) + self._eps)
            updates[param] = param - (lr_t * g_t)
            updates[accugrad] = m_t
            updates[accudelta] = v_t
        updates[self._i_t] = i_t
        
        train_fn = theano.function(inputs=[theano.Param(batch_x),
                                           theano.Param(batch_y)],
                                   outputs=self.mean_cost,
                                   updates=updates,
                                   givens={self.x: batch_x, self.y: batch_y})

        return train_fn
        

    def score_classif(self, given_set):
        """ Returns functions to get current classification errors. """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        score = theano.function(inputs=[theano.Param(batch_x),
                                        theano.Param(batch_y)],
                                outputs=self.errors,
                                givens={self.x: batch_x, self.y: batch_y})

        def scoref():
            """ returned function that scans the entire set given as input """
            return [score(batch_x, batch_y) for batch_x, batch_y in given_set]

        return scoref

In [18]:
class RegularizedNet(NeuralNet):
    """ Neural net with L1 and L2 regularization """
    def __init__(self, numpy_rng, theano_rng=None,
                 n_ins=100,
                 layers_types=[ReLU, ReLU, ReLU, LogisticRegression],
                 layers_sizes=[1024, 1024, 1024],
                 n_outs=2,
                 rho=0.95, eps=1.E-8,
                 L1_reg=0.1,
                 L2_reg=0.1,
                 debugprint=False):
        """
        A deep neural net with possible L1 and/or L2 regularization.
        """
        super(RegularizedNet, self).__init__(numpy_rng, theano_rng, n_ins,
                layers_types, layers_sizes, n_outs, rho, eps, debugprint)

        self.L1_reg = L1_reg
        self.L2_reg = L2_reg
        L1 = shared(0.)
        for param in self.params:
            L1 += T.sum(abs(param))
        if L1_reg > 0.:
            self.cost = self.cost + L1_reg * L1
        L2 = shared(0.)
        for param in self.params:
            L2 += T.sum(param ** 2)
        if L2_reg > 0.:
            self.cost = self.cost + L2_reg * L2

In [19]:
class DropoutNet(NeuralNet):
    """ Neural net with dropout (see Hinton's et al. paper) """
    def __init__(self, numpy_rng, theano_rng=None,
                 n_ins=40*3,
                 layers_types=[ReLU, ReLU, ReLU, ReLU, LogisticRegression],
                 layers_sizes=[4000, 4000, 4000, 4000],
                 dropout_rates=[0.2, 0.5, 0.5, 0.5, 0.5],
                 n_outs=62 * 3,
                 rho=0.98, eps=1.E-6,
                 debugprint=False):
        """
        A dropout-regularized neural net.
        """
        super(DropoutNet, self).__init__(numpy_rng, theano_rng, n_ins,
                layers_types, layers_sizes, n_outs, rho, eps, debugprint)

        self.dropout_rates = dropout_rates
        dropout_layer_input = dropout(numpy_rng, self.x, p=dropout_rates[0])
        self.dropout_layers = []

        for layer, layer_type, n_in, n_out, dr in zip(self.layers,
                layers_types, self.layers_ins, self.layers_outs,
                dropout_rates[1:] + [0]):  # !!! we do not dropout anything
                                           # from the last layer !!!
            if dr:
                this_layer = layer_type(rng=numpy_rng,
                        input=dropout_layer_input, n_in=n_in, n_out=n_out,
                        W=layer.W * 1. / (1. - dr),
                        b=layer.b * 1. / (1. - dr))
                # N.B. dropout with dr==1 does not dropanything!!
                this_layer.output = dropout(numpy_rng, this_layer.output, dr)
            else:
                this_layer = layer_type(rng=numpy_rng,
                        input=dropout_layer_input, n_in=n_in, n_out=n_out,
                        W=layer.W, b=layer.b)

            assert hasattr(this_layer, 'output')
            self.dropout_layers.append(this_layer)
            dropout_layer_input = this_layer.output

        assert hasattr(self.layers[-1], 'training_cost')
        assert hasattr(self.layers[-1], 'errors')
        # TODO standardize cost
        # these are the dropout costs
        self.mean_cost = self.dropout_layers[-1].negative_log_likelihood(self.y)
        self.cost = self.dropout_layers[-1].training_cost(self.y)

        # these is the non-dropout errors
        self.errors = self.layers[-1].errors(self.y)

    def __repr__(self):
        return super(DropoutNet, self).__repr__() + "\n"\
                + "dropout rates: " + str(self.dropout_rates)

In [20]:
def add_fit_and_score(class_to_chg):
    """ Mutates a class to add the fit() and score() functions to a NeuralNet.
    """
    from types import MethodType
    def fit(self, x_train, y_train, x_dev=None, y_dev=None,
            max_epochs=20, early_stopping=True, split_ratio=0.1, # TODO 100+ epochs
            method='adadelta', verbose=False, plot=False):
        """
        TODO
        """
        import time, copy
        if x_dev == None or y_dev == None:
            from sklearn.cross_validation import train_test_split
            x_train, x_dev, y_train, y_dev = train_test_split(x_train, y_train,
                    test_size=split_ratio, random_state=42)
        if method == 'sgd':
            train_fn = self.get_SGD_trainer()
        elif method == 'adagrad':
            train_fn = self.get_adagrad_trainer()
        elif method == 'adadelta':
            train_fn = self.get_adadelta_trainer()
        elif method == 'adam':
            train_fn = self.get_adam_trainer()
        elif method == 'rmsprop':
            train_fn = self.get_rmsprop_trainer(with_step_adapt=True,
                    nesterov=False)
        train_set_iterator = DatasetMiniBatchIterator(x_train, y_train)
        dev_set_iterator = DatasetMiniBatchIterator(x_dev, y_dev)
        train_scoref = self.score_classif(train_set_iterator)
        dev_scoref = self.score_classif(dev_set_iterator)
        best_dev_loss = numpy.inf
        epoch = 0
        # TODO early stopping (not just cross val, also stop training)
        if plot:
            verbose = True
            self._costs = []
            self._train_errors = []
            self._dev_errors = []
            self._updates = []

        init_lr = INIT_LR
        if method == 'rmsprop':
            init_lr = 1.E-6  # TODO REMOVE HACK
        n_seen = 0
        while epoch < max_epochs:
            #first TODO: update learning rates when not using adadelta!
            #lr = init_lr / (1 + init_lr * L2_LAMBDA * math.log(1+n_seen))
            #lr = init_lr / math.sqrt(1 + init_lr * L2_LAMBDA * n_seen/BATCH_SIZE) # try these
            lr = init_lr
            if not verbose:
                sys.stdout.write("\r%0.2f%%" % (epoch * 100./ max_epochs))
                sys.stdout.flush()
            avg_costs = []
            timer = time.time()
            for x, y in train_set_iterator:
                if method == 'sgd' or method == 'adagrad' or method == 'rmsprop':
                    #avg_cost = train_fn(x, y, lr=1.E-2)
                    avg_cost = train_fn(x, y, lr=lr)
                elif method == 'adadelta' or method == 'adam':
                    avg_cost = train_fn(x, y)
                if type(avg_cost) == list:
                    avg_costs.append(avg_cost[0])
                else:
                    avg_costs.append(avg_cost)
            if verbose:
                mean_costs = numpy.mean(avg_costs)
                mean_train_errors = numpy.mean(train_scoref())
                print('  epoch %i took %f seconds' %
                      (epoch, time.time() - timer))
                print('  epoch %i, avg costs %f' %
                      (epoch, mean_costs))
                print('  epoch %i, training error %f' %
                      (epoch, mean_train_errors))
                if plot:
                    self._costs.append(mean_costs)
                    self._train_errors.append(mean_train_errors)
            dev_errors = numpy.mean(dev_scoref())
            if plot:
                self._dev_errors.append(dev_errors)
            if dev_errors < best_dev_loss:
                best_dev_loss = dev_errors
                best_params = copy.deepcopy(self.params)
                if verbose:
                    print('!!!  epoch %i, validation error of best model %f' %
                          (epoch, dev_errors))
            epoch += 1
            n_seen += x_train.shape[0]
        if not verbose:
            print("")
        for i, param in enumerate(best_params):
            self.params[i] = param

    def score(self, x, y):
        """ error rates """
        iterator = DatasetMiniBatchIterator(x, y)
        scoref = self.score_classif(iterator)
        return numpy.mean(scoref())

    class_to_chg.fit = MethodType(fit, None, class_to_chg)
    class_to_chg.score = MethodType(score, None, class_to_chg)


def train_models(x_train, y_train, x_test, y_test, n_features, n_outs,
        x_dev=None, y_dev=None,
        use_dropout=False, n_epochs=100, numpy_rng=None,
        svms=False, nb=False, deepnn=True,
        verbose=False, plot=False, name=''):
    if svms:
        print("Linear SVM")
        classifier = svm.SVC(gamma=0.001)
        print(classifier)
        classifier.fit(x_train, y_train)
        print("score: %f" % classifier.score(x_test, y_test))

        print("RBF-kernel SVM")
        classifier = svm.SVC(kernel='rbf', class_weight='auto')
        print(classifier)
        classifier.fit(x_train, y_train)
        print("score: %f" % classifier.score(x_test, y_test))

    if nb:
        print("Multinomial Naive Bayes")
        classifier = naive_bayes.MultinomialNB()
        print(classifier)
        classifier.fit(x_train, y_train)
        print("score: %f" % classifier.score(x_test, y_test))

    if deepnn:
        import warnings
        warnings.filterwarnings("ignore")  # TODO remove

        if use_dropout:
            n_epochs *= 4
            pass

        def new_dnn(dropout=False):
            if dropout:
                print("Dropout DNN")
                return DropoutNet(numpy_rng=numpy_rng, n_ins=n_features,
                    layers_types=[ReLU, ReLU, ReLU, ReLU, LogisticRegression],
                    layers_sizes=[2000, 2000, 2000, 2000],
                    dropout_rates=[0.2, 0.5, 0.5, 0.5, 0.5],
                    n_outs=n_outs,
                    debugprint=0)
            else:
                print("Simple (regularized) DNN")
                return RegularizedNet(numpy_rng=numpy_rng, n_ins=n_features,
                    layers_types=[LogisticRegression],
                    layers_sizes=[],
                    #layers_types=[ReLU, ReLU, ReLU, LogisticRegression],
                    #layers_sizes=[1000, 1000, 1000],
                    #layers_types=[ReLU, LogisticRegression],
                    #layers_sizes=[200],
                    n_outs=n_outs,
                    L1_reg=0.,
                    L2_reg=L2_LAMBDA,
                    debugprint=1)

        import matplotlib.pyplot as plt
        fig = plt.gcf() #plt.figure()
        fig.set_size_inches(16,10)
        ax1 = plt.subplot(221)
        ax2 = plt.subplot(222)
        ax3 = plt.subplot(223)
        ax4 = plt.subplot(224)  # TODO updates of the weights
        #methods = ['sgd', 'adagrad', 'adadelta']
        methods = ['adam', 'adadelta']
        #methods = ['rmsprop', 'adadelta', 'adagrad']
        for method in methods:
            dnn = new_dnn(use_dropout)
            print dnn
            dnn.fit(x_train, y_train, x_dev, y_dev, max_epochs=n_epochs,
                    method=method, verbose=verbose, plot=plot)
            test_error = dnn.score(x_test, y_test)
            print("score: %f" % (1. - test_error))
            ax1.plot(numpy.log10(dnn._costs), label=method)
            #ax2.plot(numpy.log10(dnn._train_errors), label=method)
            #ax3.plot(numpy.log10(dnn._dev_errors), label=method)
            ax2.plot(dnn._train_errors, label=method)
            ax3.plot(dnn._dev_errors, label=method)
            #ax4.plot(dnn._updates, label=method) TODO
            ax4.plot([test_error for _ in range(10)], label=method)
        ax1.set_xlabel('epoch')
        ax1.set_ylabel('cost (log10)')
        ax2.set_xlabel('epoch')
        ax2.set_ylabel('train error')
        ax3.set_xlabel('epoch')
        ax3.set_ylabel('dev error')
        ax4.set_ylabel('test error')
        plt.legend()
        plt.tight_layout()
        plt.show()

In [21]:
#main function

def run():
    add_fit_and_score(DropoutNet)
    add_fit_and_score(RegularizedNet)
    from sklearn import datasets, svm, naive_bayes
    from sklearn import cross_validation, preprocessing
    

    if MNIST:
        from sklearn.datasets import fetch_mldata
        mnist = fetch_mldata('MNIST original')
        X = numpy.asarray(mnist.data, dtype='float32')
        #X = preprocessing.scale(X)
        X /= 255.
        y = numpy.asarray(mnist.target, dtype='int32')
        print("Total dataset size:")
        print("n samples: %d" % X.shape[0])
        print("n features: %d" % X.shape[1])
        print("n classes: %d" % len(set(y)))
        x_train, x_test = X[:-10000], X[-10000:]
        y_train, y_test = y[:-10000], y[-10000:]

        train_models(x_train, y_train, x_test, y_test, X.shape[1],
                     len(set(y)), numpy_rng=numpy.random.RandomState(123),
                     use_dropout=False, n_epochs=30,
                     verbose=True, plot=True, name='mnist_L2')
        #train_models(x_train, y_train, x_test, y_test, X.shape[1],
        #             len(set(y)), numpy_rng=numpy.random.RandomState(123),
        #             use_dropout=True,
        #             verbose=True, plot=True, name='mnist_dropout')

    if DIGITS:
        digits = datasets.load_digits()
        data = numpy.asarray(digits.data, dtype='float32')
        target = numpy.asarray(digits.target, dtype='int32')
        x = preprocessing.scale(data)
        x_train, x_test, y_train, y_test = cross_validation.train_test_split(
                x, target, test_size=0.2, random_state=42)
        train_models(x_train, y_train, x_test, y_test, x.shape[1],
                     len(set(target)), numpy_rng=numpy.random.RandomState(123),
                     verbose=True, plot=True, name='digits')

    if FACES:
        import logging
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s %(message)s')
        lfw_people = datasets.fetch_lfw_people(min_faces_per_person=70,
                                               resize=0.4)
        X = numpy.asarray(lfw_people.data, dtype='float32')
        X = preprocessing.scale(X)
        y = numpy.asarray(lfw_people.target, dtype='int32')
        target_names = lfw_people.target_names
        print("Total dataset size:")
        print("n samples: %d" % X.shape[0])
        print("n features: %d" % X.shape[1])
        print("n classes: %d" % target_names.shape[0])
        x_train, x_test, y_train, y_test = cross_validation.train_test_split(
                    X, y, test_size=0.2, random_state=42)

        train_models(x_train, y_train, x_test, y_test, X.shape[1],
                     len(set(y)), numpy_rng=numpy.random.RandomState(123),
                     verbose=True, plot=True, name='faces')

    if TWENTYNEWSGROUPS:
        from sklearn.feature_extraction.text import TfidfVectorizer
        newsgroups_train = datasets.fetch_20newsgroups(subset='train')
        vectorizer = TfidfVectorizer(encoding='latin-1', max_features=10000)
        #vectorizer = HashingVectorizer(encoding='latin-1')
        x_train = vectorizer.fit_transform(newsgroups_train.data)
        x_train = numpy.asarray(x_train.todense(), dtype='float32')
        y_train = numpy.asarray(newsgroups_train.target, dtype='int32')
        newsgroups_test = datasets.fetch_20newsgroups(subset='test')
        x_test = vectorizer.transform(newsgroups_test.data)
        x_test = numpy.asarray(x_test.todense(), dtype='float32')
        y_test = numpy.asarray(newsgroups_test.target, dtype='int32')
        train_models(x_train, y_train, x_test, y_test, x_train.shape[1],
                     len(set(y_train)),
                     numpy_rng=numpy.random.RandomState(123),
                     svms=False, nb=True, deepnn=True,
                     verbose=True, plot=True, name='20newsgroups')

In [22]:
run()


Total dataset size:
n samples: 70000
n features: 784
n classes: 10
Simple (regularized) DNN
LogisticRegression_784x10
  epoch 0 took 0.269893 seconds
  epoch 0, avg costs 1.243363
  epoch 0, training error 0.155833
!!!  epoch 0, validation error of best model 0.153500
  epoch 1 took 0.297565 seconds
  epoch 1, avg costs 0.633141
  epoch 1, training error 0.126481
!!!  epoch 1, validation error of best model 0.126500
  epoch 2 took 0.285746 seconds
  epoch 2, avg costs 0.492538
  epoch 2, training error 0.112778
!!!  epoch 2, validation error of best model 0.115833
  epoch 3 took 0.299956 seconds
  epoch 3, avg costs 0.428124
  epoch 3, training error 0.104481
!!!  epoch 3, validation error of best model 0.109833
  epoch 4 took 0.291172 seconds
  epoch 4, avg costs 0.390489
  epoch 4, training error 0.099389
!!!  epoch 4, validation error of best model 0.103000
  epoch 5 took 0.316594 seconds
  epoch 5, avg costs 0.365602
  epoch 5, training error 0.095093
!!!  epoch 5, validation error of best model 0.099667
  epoch 6 took 0.281905 seconds
  epoch 6, avg costs 0.347869
  epoch 6, training error 0.091426
!!!  epoch 6, validation error of best model 0.097167
  epoch 7 took 0.281674 seconds
  epoch 7, avg costs 0.334582
  epoch 7, training error 0.089111
!!!  epoch 7, validation error of best model 0.094500
  epoch 8 took 0.288380 seconds
  epoch 8, avg costs 0.324257
  epoch 8, training error 0.086981
!!!  epoch 8, validation error of best model 0.093000
  epoch 9 took 0.286906 seconds
  epoch 9, avg costs 0.316005
  epoch 9, training error 0.085222
!!!  epoch 9, validation error of best model 0.092000
  epoch 10 took 0.292030 seconds
  epoch 10, avg costs 0.309255
  epoch 10, training error 0.083815
!!!  epoch 10, validation error of best model 0.090500
  epoch 11 took 0.279655 seconds
  epoch 11, avg costs 0.303626
  epoch 11, training error 0.082685
!!!  epoch 11, validation error of best model 0.089000
  epoch 12 took 0.294217 seconds
  epoch 12, avg costs 0.298852
  epoch 12, training error 0.081741
!!!  epoch 12, validation error of best model 0.088667
  epoch 13 took 0.304461 seconds
  epoch 13, avg costs 0.294745
  epoch 13, training error 0.080519
!!!  epoch 13, validation error of best model 0.087167
  epoch 14 took 0.279682 seconds
  epoch 14, avg costs 0.291166
  epoch 14, training error 0.079500
!!!  epoch 14, validation error of best model 0.086667
  epoch 15 took 0.277173 seconds
  epoch 15, avg costs 0.288012
  epoch 15, training error 0.078667
!!!  epoch 15, validation error of best model 0.086167
  epoch 16 took 0.293082 seconds
  epoch 16, avg costs 0.285206
  epoch 16, training error 0.078111
!!!  epoch 16, validation error of best model 0.085667
  epoch 17 took 0.302496 seconds
  epoch 17, avg costs 0.282688
  epoch 17, training error 0.077463
!!!  epoch 17, validation error of best model 0.085167
  epoch 18 took 0.283735 seconds
  epoch 18, avg costs 0.280410
  epoch 18, training error 0.076889
!!!  epoch 18, validation error of best model 0.084667
  epoch 19 took 0.287550 seconds
  epoch 19, avg costs 0.278337
  epoch 19, training error 0.076574
!!!  epoch 19, validation error of best model 0.084500
  epoch 20 took 0.281636 seconds
  epoch 20, avg costs 0.276438
  epoch 20, training error 0.076111
!!!  epoch 20, validation error of best model 0.083833
  epoch 21 took 0.289907 seconds
  epoch 21, avg costs 0.274690
  epoch 21, training error 0.075519
  epoch 22 took 0.290426 seconds
  epoch 22, avg costs 0.273072
  epoch 22, training error 0.075111
!!!  epoch 22, validation error of best model 0.083500
  epoch 23 took 0.283991 seconds
  epoch 23, avg costs 0.271569
  epoch 23, training error 0.074741
  epoch 24 took 0.276365 seconds
  epoch 24, avg costs 0.270167
  epoch 24, training error 0.074352
  epoch 25 took 0.326764 seconds
  epoch 25, avg costs 0.268854
  epoch 25, training error 0.073870
  epoch 26 took 0.307149 seconds
  epoch 26, avg costs 0.267622
  epoch 26, training error 0.073667
!!!  epoch 26, validation error of best model 0.083167
  epoch 27 took 0.313843 seconds
  epoch 27, avg costs 0.266461
  epoch 27, training error 0.073296
!!!  epoch 27, validation error of best model 0.082833
  epoch 28 took 0.289039 seconds
  epoch 28, avg costs 0.265364
  epoch 28, training error 0.073111
  epoch 29 took 0.280149 seconds
  epoch 29, avg costs 0.264326
  epoch 29, training error 0.072611
!!!  epoch 29, validation error of best model 0.082667
score: 0.924200
Simple (regularized) DNN
LogisticRegression_784x10
  epoch 0 took 0.311120 seconds
  epoch 0, avg costs 0.923582
  epoch 0, training error 0.128704
!!!  epoch 0, validation error of best model 0.130167
  epoch 1 took 0.320057 seconds
  epoch 1, avg costs 0.452976
  epoch 1, training error 0.105963
!!!  epoch 1, validation error of best model 0.109500
  epoch 2 took 0.326817 seconds
  epoch 2, avg costs 0.378660
  epoch 2, training error 0.097426
!!!  epoch 2, validation error of best model 0.102000
  epoch 3 took 0.308853 seconds
  epoch 3, avg costs 0.348301
  epoch 3, training error 0.092667
!!!  epoch 3, validation error of best model 0.097333
  epoch 4 took 0.327724 seconds
  epoch 4, avg costs 0.331119
  epoch 4, training error 0.089056
!!!  epoch 4, validation error of best model 0.094667
  epoch 5 took 0.311835 seconds
  epoch 5, avg costs 0.319697
  epoch 5, training error 0.086667
!!!  epoch 5, validation error of best model 0.092833
  epoch 6 took 0.324327 seconds
  epoch 6, avg costs 0.311373
  epoch 6, training error 0.084944
!!!  epoch 6, validation error of best model 0.091500
  epoch 7 took 0.327300 seconds
  epoch 7, avg costs 0.304943
  epoch 7, training error 0.083370
!!!  epoch 7, validation error of best model 0.089667
  epoch 8 took 0.313637 seconds
  epoch 8, avg costs 0.299771
  epoch 8, training error 0.082148
!!!  epoch 8, validation error of best model 0.088500
  epoch 9 took 0.306343 seconds
  epoch 9, avg costs 0.295485
  epoch 9, training error 0.081315
!!!  epoch 9, validation error of best model 0.087833
  epoch 10 took 0.329969 seconds
  epoch 10, avg costs 0.291853
  epoch 10, training error 0.080315
!!!  epoch 10, validation error of best model 0.087667
  epoch 11 took 0.311047 seconds
  epoch 11, avg costs 0.288719
  epoch 11, training error 0.079463
!!!  epoch 11, validation error of best model 0.087333
  epoch 12 took 0.306574 seconds
  epoch 12, avg costs 0.285976
  epoch 12, training error 0.078556
!!!  epoch 12, validation error of best model 0.087000
  epoch 13 took 0.318012 seconds
  epoch 13, avg costs 0.283545
  epoch 13, training error 0.077926
!!!  epoch 13, validation error of best model 0.086500
  epoch 14 took 0.318956 seconds
  epoch 14, avg costs 0.281369
  epoch 14, training error 0.077148
!!!  epoch 14, validation error of best model 0.085833
  epoch 15 took 0.307148 seconds
  epoch 15, avg costs 0.279405
  epoch 15, training error 0.076315
!!!  epoch 15, validation error of best model 0.085667
  epoch 16 took 0.307458 seconds
  epoch 16, avg costs 0.277618
  epoch 16, training error 0.075815
!!!  epoch 16, validation error of best model 0.085333
  epoch 17 took 0.312092 seconds
  epoch 17, avg costs 0.275983
  epoch 17, training error 0.075426
!!!  epoch 17, validation error of best model 0.085000
  epoch 18 took 0.304722 seconds
  epoch 18, avg costs 0.274479
  epoch 18, training error 0.074926
  epoch 19 took 0.311247 seconds
  epoch 19, avg costs 0.273086
  epoch 19, training error 0.074463
  epoch 20 took 0.340049 seconds
  epoch 20, avg costs 0.271793
  epoch 20, training error 0.074241
  epoch 21 took 0.307233 seconds
  epoch 21, avg costs 0.270587
  epoch 21, training error 0.073889
!!!  epoch 21, validation error of best model 0.084833
  epoch 22 took 0.347523 seconds
  epoch 22, avg costs 0.269458
  epoch 22, training error 0.073574
  epoch 23 took 0.324835 seconds
  epoch 23, avg costs 0.268398
  epoch 23, training error 0.073204
!!!  epoch 23, validation error of best model 0.083667
  epoch 24 took 0.360615 seconds
  epoch 24, avg costs 0.267400
  epoch 24, training error 0.072704
!!!  epoch 24, validation error of best model 0.083500
  epoch 25 took 0.377638 seconds
  epoch 25, avg costs 0.266457
  epoch 25, training error 0.072611
  epoch 26 took 0.306555 seconds
  epoch 26, avg costs 0.265566
  epoch 26, training error 0.072370
!!!  epoch 26, validation error of best model 0.083167
  epoch 27 took 0.309302 seconds
  epoch 27, avg costs 0.264720
  epoch 27, training error 0.072185
  epoch 28 took 0.333637 seconds
  epoch 28, avg costs 0.263916
  epoch 28, training error 0.071907
  epoch 29 took 0.324232 seconds
  epoch 29, avg costs 0.263151
  epoch 29, training error 0.071722
score: 0.924300