Exercise 4

The idea of this exercise is to study the phenomenon of overfitting. Suppose we have a data series that we want to fit with a neural network (one hidden layer). The data are in the file datos_guia4_ej4.mat. Our goal is to determine the optimal number of neurons in the hidden layer.

  • a) Devise a strategy to split the data into two sets: {ptrain,ttrain} and {ptest,ttest} (a split sketch is given right after the data are loaded below).
  • b) Fit Nest neural networks with Nneuro neurons in the hidden layer (a sketch covering (b) and (c) appears at the end of the notebook).

  • c) Plot the mean absolute error for the fits on the training data {ptrain,ttrain} and for the fits on the test data {ptest,ttest}.

  • d) Explain the behaviour of both errors.


In [186]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio
%matplotlib inline
plt.rcParams['figure.figsize'] = (8, 6) # default figure size

In [187]:
# Load the data file
datos = sio.loadmat('datos_guia4_ej4.mat')

In [188]:
t = datos['t']
p = datos['p']
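
A minimal sketch of one possible strategy for part (a): a random 80/20 split of the samples. This assumes p and t are loaded as 1xN row vectors (as the commented reshapes below suggest); the split fraction and the fixed seed are choices, not requirements.

In [ ]:
# Part (a): random 80/20 train/test split (sketch; the 80/20 fraction is an assumption)
np.random.seed(0)                 # fixed seed so the split is reproducible
p_flat = p.ravel()                # flatten the 1xN row vectors to 1-D
t_flat = t.ravel()
n = p_flat.size
idx = np.random.permutation(n)    # shuffle the sample indices
n_train = int(0.8 * n)            # 80% of the samples go to training
ptrain, ttrain = p_flat[idx[:n_train]], t_flat[idx[:n_train]]
ptest, ttest = p_flat[idx[n_train:]], t_flat[idx[n_train:]]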

In [189]:
#size_t = t.shape
#size_p = p.shape
#t = t.reshape(size_t[1])
#p = p.reshape(size_p[1])

In [190]:
#plt.plot(p)
#plt.plot(t)

In [191]:
# (c) 2014 Reid Johnson
#
# Modified from:
# Daniel Rodriguez (http://danielfrg.com/blog/2013/07/03/basic-neural-network-python/)
#
# Functions to generate an artificial neural network (ANN) with one hidden 
# layer. The ANN is a collection of artificial neurons arranged in layers 
# that are trained using backpropagation.
#
# The perceptron algorithm is defined by Rosenblatt in:
# The Perceptron: A Probabilistic Model for Information Storage and Organization in the Brain.
# Psychological Review 65 (6): 386–408. 1958.
#
# The backpropagation algorithm is defined by Werbos in:
# Beyond Regression: New Tools for Prediction and Analysis in the Behavioral Sciences. 1975.


from __future__ import division # must precede other statements in Python 2
from scipy import optimize

class NN_1HL(object):
    def __init__(self, reg_lambda=0, epsilon_init=0.12, hidden_layer_size=25, opti_method='TNC', maxiter=500):
        self.reg_lambda = reg_lambda # regularization strength for the cost function
        self.epsilon_init = epsilon_init # half-width of the interval used to randomly initialize the thetas
        self.hidden_layer_size = hidden_layer_size # size of the hidden layer
        self.activation_func = self.sigmoid # activation function
        self.activation_func_prime = self.sigmoid_prime # derivative of the activation function
        self.method = opti_method # optimization method
        self.maxiter = maxiter # maximum number of iterations

    def sigmoid(self, z):
        ''' Logistic function.

        Returns: 
          The logistic function.

        '''
        return 1 / (1 + np.exp(-z))

    def sigmoid_prime(self, z):
        ''' Derivative of the logistic function.

        Returns: 
          The derivative of the logistic function.

        '''
        sig = self.sigmoid(z)

        return sig * (1-sig)

    def sumsqr(self, a):
        ''' Sum of squared values.

        Returns: 
          The sum of squared values.

        '''
        return np.sum(a**2)

    def rand_init(self, l_in, l_out):
        ''' Generates an (l_out x l_in+1) array of thetas (threshold 
        values), each initialized to a random number between minus 
        epsilon and epsilon.

        Note that there is one theta matrix per layer. The size of 
        each theta matrix depends on the number of activation units 
        in its corresponding layer, so each matrix may be of a 
        different size.

        Returns:
          Randomly initialized thetas (threshold values).

        '''
        return np.random.rand(l_out, l_in+1) * 2 * self.epsilon_init - self.epsilon_init

    # Pack thetas (threshold values) into a one-dimensional array.
    def pack_thetas(self, t1, t2):
        ''' Packs (unrolls) thetas (threshold values) from matrices 
        into a single vector.

        Note that there is one theta matrix per layer. To use an 
        optimization technique that minimizes the error, we need to 
        pack (unroll) the matrices into a single vector.

        Args:
          t1 (array): Unpacked (rolled) thetas (threshold values) between the input layer and hidden layer.
          t2 (array): Unpacked (rolled) thetas (threshold values) between the hidden layer and output layer.

        Returns:
          Packed (unrolled) thetas (threshold values).

        '''
        return np.concatenate((t1.reshape(-1), t2.reshape(-1)))

    def unpack_thetas(self, thetas, input_layer_size, hidden_layer_size, num_labels):
        ''' Unpacks (rolls) thetas (threshold values) from a single
        vector into a multi-dimensional array (matrices).

         Args:
          thetas (vector): Packed (unrolled) thetas (threshold values).
          input_layer_size (int): Number of nodes in the input layer.
          hidden_layer_size (int): Number of nodes in the hidden layer.
          num_labels (int): Number of classes.

        Returns:
          t1 (array): Unpacked (rolled) thetas (threshold values) between the input layer and hidden layer.
          t2 (array): Unpacked (rolled) thetas (threshold values) between the hidden layer and output layer.

        '''
        t1_start = 0
        t1_end = hidden_layer_size * (input_layer_size + 1)
        t1 = thetas[t1_start:t1_end].reshape((hidden_layer_size, input_layer_size + 1))
        t2 = thetas[t1_end:].reshape((num_labels, hidden_layer_size + 1))

        return t1, t2

    def _forward(self, X, t1, t2):
        ''' Forward propagation.

        Args:
          X ((n,m) array): The feature data for which to compute the predicted output probabilities.
          t1 (array): Unpacked (rolled) thetas (threshold values) between the input layer and hidden layer.
          t2 (array): Unpacked (rolled) thetas (threshold values) between the hidden layer and output layer.

        Returns:
          a1: The output activation of units in the input layer.
          z2: The input activation of units in the hidden layer.
          a2: The output activation of units in the hidden layer.
          z3: The input activation of units in the output layer.
          a3: The output activation of units in the output layer.

        '''
        m = X.shape[0]
        ones = None
        if len(X.shape) == 1:
            ones = np.array(1).reshape(1,)
        else:
            ones = np.ones(m).reshape(m,1)

        # Input layer.
        a1 = np.hstack((ones, X))

        # Hidden Layer.
        z2 = np.dot(t1, a1.T)
        a2 = self.activation_func(z2)
        a2 = np.hstack((ones, a2.T))

        # Output layer.
        z3 = np.dot(t2, a2.T)
        a3 = self.activation_func(z3)

        return a1, z2, a2, z3, a3

    def function(self, thetas, input_layer_size, hidden_layer_size, num_labels, X, y, reg_lambda):
        ''' Generates the cost using a generalization of the regularized logistic regression cost function. 

        Returns:
          J: the scalar cost, summed over the output units and averaged over the m instances.

        '''
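        # Cross-entropy cost:
        #   J = (1/m) * sum_i sum_k [ -Y_ik*log(h_ik) - (1-Y_ik)*log(1-h_ik) ]
        # plus, when reg_lambda != 0, the term lambda/(2m) * (sum of squared non-bias thetas).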
        t1, t2 = self.unpack_thetas(thetas, input_layer_size, hidden_layer_size, num_labels)

        m = X.shape[0]
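        # One-hot encode the targets; this indexing assumes y holds integer class labels.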
        Y = np.eye(num_labels)[y]

        _, _, _, _, h = self._forward(X, t1, t2)
        costPositive = -Y * np.log(h).T # the cost when y is 1
        costNegative = (1 - Y) * np.log(1 - h).T # the cost when y is 0
        cost = costPositive - costNegative # the total cost
        J = np.sum(cost) / m # the (unregularized) cost function

        # For regularization.
        if reg_lambda != 0:
            t1f = t1[:, 1:]
            t2f = t2[:, 1:]
            reg = (self.reg_lambda / (2*m)) * (self.sumsqr(t1f) + self.sumsqr(t2f)) # regularization term
            J = J + reg # the regularized cost function

        return J

    def function_prime(self, thetas, input_layer_size, hidden_layer_size, num_labels, X, y, reg_lambda):
        ''' Computes the gradient of the cost function via backpropagation. 

        Returns:
          Packed (unrolled) gradients for each theta.

          '''
        t1, t2 = self.unpack_thetas(thetas, input_layer_size, hidden_layer_size, num_labels)

        m = X.shape[0]
        t1f = t1[:, 1:] # threshold values between the input layer and hidden layer (excluding the bias input)
        t2f = t2[:, 1:] # threshold values between the hidden layer and output layer (excluding the bias input)
        Y = np.eye(num_labels)[y]

        Delta1, Delta2 = 0, 0 # initialize matrix Deltas (cost function gradients)
        # Iterate over the instances.
        for i, row in enumerate(X):
            a1, z2, a2, z3, a3 = self._forward(row, t1, t2)

            # Backprop.
            d3 = a3 - Y[i, :].T # activation error (delta) in the output layer nodes
            d2 = np.dot(t2f.T, d3) * self.activation_func_prime(z2) # activation error (delta) in the hidden layer nodes

            # Update matrix Deltas (cost function gradients).
            Delta2 += np.dot(d3[np.newaxis].T, a2[np.newaxis])
            Delta1 += np.dot(d2[np.newaxis].T, a1[np.newaxis])

        # The (unregularized) gradients for each theta.
        Theta1_grad = (1 / m) * Delta1 
        Theta2_grad = (1 / m) * Delta2

        # For regularization.
        if reg_lambda != 0:
            # The regularized gradients for each theta (excluding the bias input).
            Theta1_grad[:, 1:] = Theta1_grad[:, 1:] + (reg_lambda / m) * t1f
            Theta2_grad[:, 1:] = Theta2_grad[:, 1:] + (reg_lambda / m) * t2f

        return self.pack_thetas(Theta1_grad, Theta2_grad)

    def fit(self, X, y):
        ''' Fit the model given predictor(s) X and target y.

        Args:
          X ((n,m) array): The feature data for which to compute the predicted output.
          y ((n,1) array): The actual outputs (class data).

        '''
        num_features = X.shape[1]
        input_layer_size = num_features # one input unit per feature (column of X)
        num_labels = int(len(y)) # caution: the cost function one-hot encodes y, so integer class labels are expected

        theta1_0 = self.rand_init(input_layer_size, self.hidden_layer_size)
        theta2_0 = self.rand_init(self.hidden_layer_size, num_labels)
        thetas0 = self.pack_thetas(theta1_0, theta2_0)

        options = {'maxiter': self.maxiter}
        _res = optimize.minimize(self.function, thetas0, jac=self.function_prime, method=self.method, 
                                 args=(input_layer_size, self.hidden_layer_size, num_labels, X, y, 0), options=options)

        self.t1, self.t2 = self.unpack_thetas(_res.x, input_layer_size, self.hidden_layer_size, num_labels)

    def predict(self, X):
        ''' Predict labels with the fitted model on predictor(s) X.

        Args:
          X ((n,m) array): The feature data for which to compute the predicted outputs.

        Returns:
          The predicted labels for each instance X.
    
        '''
        return self.predict_proba(X).argmax(0)

    def predict_proba(self, X):
        ''' Predict label probabilities with the fitted model on predictor(s) X.

        The probabilities are computed as the output activation of units in the output layer.

        Args:
          X ((n,m) array): The feature data for which to compute the predicted output probabilities.

        Returns:
          h: The predicted label probabilities for each instance X.

        '''
        _, _, _, _, h = self._forward(X, self.t1, self.t2)

        return h

In [192]:
nn = NN_1HL(hidden_layer_size=2)

In [193]:
nn.fit(p.T,t.T)


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-193-d3c481ff68bf> in <module>()
----> 1 nn.fit(p.T,t.T)

<ipython-input-191-7c025a046de9> in fit(self, X, y)
    236         options = {'maxiter': self.maxiter}
    237         _res = optimize.minimize(self.function, thetas0, jac=self.function_prime, method=self.method, 
--> 238                                  args=(input_layer_size, self.hidden_layer_size, num_labels, X, y, 0), options=options)
    239 
    240         self.t1, self.t2 = self.unpack_thetas(_res.x, input_layer_size, self.hidden_layer_size, num_labels)

/usr/local/lib/python2.7/dist-packages/scipy/optimize/_minimize.pyc in minimize(fun, x0, args, method, jac, hess, hessp, bounds, constraints, tol, callback, options)
    428     elif meth == 'tnc':
    429         return _minimize_tnc(fun, x0, args, jac, bounds, callback=callback,
--> 430                              **options)
    431     elif meth == 'cobyla':
    432         return _minimize_cobyla(fun, x0, args, constraints, **options)

/usr/local/lib/python2.7/dist-packages/scipy/optimize/tnc.pyc in _minimize_tnc(fun, x0, args, jac, bounds, eps, scale, offset, mesg_num, maxCGit, maxiter, eta, stepmx, accuracy, minfev, ftol, xtol, gtol, rescale, disp, callback, **unknown_options)
    396                                         offset, messages, maxCGit, maxfun,
    397                                         eta, stepmx, accuracy, fmin, ftol,
--> 398                                         xtol, pgtol, rescale, callback)
    399 
    400     funv, jacv = func_and_grad(x)

/usr/local/lib/python2.7/dist-packages/scipy/optimize/tnc.pyc in func_and_grad(x)
    358     else:
    359         def func_and_grad(x):
--> 360             f = fun(x, *args)
    361             g = jac(x, *args)
    362             return f, g

<ipython-input-191-7c025a046de9> in function(self, thetas, input_layer_size, hidden_layer_size, num_labels, X, y, reg_lambda)
    162 
    163         m = X.shape[0]
--> 164         Y = np.eye(num_labels)[y]
    165 
    166         _, _, _, _, h = self._forward(X, t1, t2)

IndexError: arrays used as indices must be of integer (or boolean) type

In [ ]:
p.T.shape

In [ ]:
t.T.shape

In [ ]:
len(t.T)

In [ ]:
t.shape

In [ ]:
type(t)

In [ ]:
a = p.T.shape

In [ ]:
a[0]
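
The IndexError above comes from Y = np.eye(num_labels)[y] in the cost function: NN_1HL was written as a classifier and expects y to contain integer class labels, whereas t holds continuous target values, so the fancy indexing fails. One way to carry out parts (b) and (c) on this regression problem is with a regression network instead. The sketch below is an assumption, not part of the original notebook: it uses scikit-learn's MLPRegressor as a stand-in single-hidden-layer network, fits Nest = 10 networks per hidden-layer size, tries sizes 1 to 10, and reuses the ptrain/ttest split sketched earlier.

In [ ]:
# Parts (b)-(c) sketch: average the mean absolute error of Nest fits per hidden-layer size
# and compare the training and test curves. MLPRegressor is a stand-in choice (assumption).
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error

Nest = 10                              # networks fitted per hidden-layer size (assumed)
Nneuro = range(1, 11)                  # hidden-layer sizes to try (assumed)
Xtrain = ptrain.reshape(-1, 1)         # scikit-learn expects a 2-D (n_samples, n_features) array
Xtest = ptest.reshape(-1, 1)

err_train, err_test = [], []
for nh in Nneuro:
    e_tr, e_te = [], []
    for _ in range(Nest):
        net = MLPRegressor(hidden_layer_sizes=(nh,), solver='lbfgs', max_iter=2000)
        net.fit(Xtrain, ttrain)
        e_tr.append(mean_absolute_error(ttrain, net.predict(Xtrain)))
        e_te.append(mean_absolute_error(ttest, net.predict(Xtest)))
    err_train.append(np.mean(e_tr))
    err_test.append(np.mean(e_te))

plt.plot(Nneuro, err_train, 'o-', label='train')
plt.plot(Nneuro, err_test, 's-', label='test')
plt.xlabel('neurons in the hidden layer')
plt.ylabel('mean absolute error')
plt.legend()

Regarding part (d), the expected pattern is that the training error keeps decreasing as neurons are added, while the test error reaches a minimum and then grows again once the network starts fitting noise in the training data, i.e. it overfits; the minimum of the test curve marks the optimal number of hidden neurons.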
