In [1]:
%pylab inline
import numpy as np
from cs224d.data_utils import *
%load_ext autoreload
%autoreload 2


Populating the interactive namespace from numpy and matplotlib

q1 softmax


In [2]:
def softmax(x):
    """
    Compute the softmax function for each row of the input x.

    It is crucial that this function is optimized for speed because
    it will be used frequently in later code.
    You might find numpy functions np.exp, np.sum, np.reshape,
    np.max, and numpy broadcasting useful for this task. (numpy
    broadcasting documentation:
    http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)

    You should also make sure that your code works for one
    dimensional inputs (treat the vector as a row), you might find
    it helpful for your later problems.

    You must implement the optimization in problem 1(a) of the 
    written assignment!
    """

    ### YOUR CODE HERE
    if x.ndim > 1:
        # matrix input: subtract each row's max for numerical stability
        # (the optimization from problem 1(a)), then normalize each row
        c = np.max(x, axis=1, keepdims=True)
        expx = np.exp(x - c)
        x = expx / np.sum(expx, axis=1, keepdims=True)
    else:
        # one-dimensional input: treat the vector as a single row
        c = np.max(x)
        expx = np.exp(x - c)
        x = expx / np.sum(expx)
    ### END YOUR CODE
    
    return x

In [3]:
def test_softmax_basic():
    """
    Some simple tests to get you started. 
    Warning: these are not exhaustive.
    """
    print("Running basic tests...")
    test1 = softmax(np.array([1,2]))
    print(test1)
    assert np.amax(np.fabs(test1 - np.array(
        [0.26894142,  0.73105858]))) <= 1e-6

    test2 = softmax(np.array([[1001,1002],[3,4]]))
    print(test2)
    assert np.amax(np.fabs(test2 - np.array(
        [[0.26894142, 0.73105858], [0.26894142, 0.73105858]]))) <= 1e-6

    test3 = softmax(np.array([[-1001,-1002]]))
    print(test3)
    assert np.amax(np.fabs(test3 - np.array(
        [0.73105858, 0.26894142]))) <= 1e-6

    print("You should verify these results!\n")

In [4]:
test_softmax_basic()


Running basic tests...
[ 0.26894142  0.73105858]
[[ 0.26894142  0.73105858]
 [ 0.26894142  0.73105858]]
[[ 0.73105858  0.26894142]]
You should verify these results!
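
As an extra check (not part of the starter tests), the shift invariance from problem 1(a), softmax(x + c) = softmax(x) for any constant c, is easy to verify numerically; the cell below is a minimal sketch of such a check.

In [ ]:
x = np.random.randn(4, 6)
# adding the same constant to every score should not change the result
assert np.allclose(softmax(x), softmax(x + 100.0))
# each row of the output should be a probability distribution
assert np.allclose(np.sum(softmax(x), axis=1), np.ones(4))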

q2 sigmoid


In [5]:
def sigmoid(x):
    """
    Compute the sigmoid function for the input here.
    """
    
    x = 1./(1 + np.exp(-x))
    
    return x

def sigmoid_grad(f):
    """
    Compute the gradient for the sigmoid function here. Note that
    for this implementation, the input f should be the sigmoid
    function value of your original input x. 
    """
    
    f = f*(1-f)
    
    return f

In [6]:
def test_sigmoid_basic():
    """
    Some simple tests to get you started. 
    Warning: these are not exhaustive.
    """
    # print("Running basic tests...")
    x = np.array([[1, 2], [-1, -2]])
    f = sigmoid(x)
    g = sigmoid_grad(f)
    print(f)
    assert np.amax(np.fabs(f - np.array([[0.73105858, 0.88079708],
        [0.26894142, 0.11920292]]))) <= 1e-6
    print(g)
    assert np.amax(np.fabs(g - np.array([[0.19661193, 0.10499359],
        [0.19661193, 0.10499359]]))) <= 1e-6
    print("You should verify these results!\n")

In [7]:
test_sigmoid_basic()


[[ 0.73105858  0.88079708]
 [ 0.26894142  0.11920292]]
[[ 0.19661193  0.10499359]
 [ 0.19661193  0.10499359]]
You should verify these results!
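
Since sigmoid_grad takes the sigmoid value f = sigmoid(x) rather than x itself, a quick finite-difference comparison (not part of the starter code) can confirm the identity sigma'(x) = sigma(x)(1 - sigma(x)):

In [ ]:
x = np.random.randn(3, 4)
h = 1e-5
# central finite difference of sigmoid, compared against sigmoid_grad(sigmoid(x))
numeric = (sigmoid(x + h) - sigmoid(x - h)) / (2 * h)
analytic = sigmoid_grad(sigmoid(x))
assert np.amax(np.fabs(numeric - analytic)) <= 1e-6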

q2 gradcheck


In [8]:
def gradcheck_naive(f, x):
    """ 
    Gradient check for a function f 
    - f should be a function that takes a single argument and outputs the cost and its gradients
    - x is the point (numpy array) to check the gradient at
    """ 

    rndstate = random.getstate()  # save the RNG state so every evaluation of f is reproducible
    random.setstate(rndstate)
    fx, grad = f(x) # Evaluate function value at original point
    h = 1e-4

    # Iterate over all indexes in x
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index

        ### try modifying x[ix] with h defined above to compute numerical gradients
        ### make sure you call random.setstate(rndstate) before calling f(x) each time, this will make it 
        ### possible to test cost functions with built in randomness later

        ### YOUR CODE HERE:
        # central difference: perturb x[ix] by +/- h on copies of x
        x_b = x.copy()
        x_a = x.copy()

        x_b[ix] = x[ix] + h
        x_a[ix] = x[ix] - h

        random.setstate(rndstate)
        fx_b, _ = f(x_b)
        random.setstate(rndstate)
        fx_a, _ = f(x_a)

        numgrad = (fx_b - fx_a) / (2 * h)
        ### END YOUR CODE

        # Compare gradients
        reldiff = abs(numgrad - grad[ix]) / max(1, abs(numgrad), abs(grad[ix]))
        if reldiff > 1e-5:
            print ("Gradient check failed.")
            print ("First gradient error found at index %s" % str(ix))
            print ("Your gradient: %f \t Numerical gradient: %f" % (grad[ix], numgrad))
            return
    
        it.iternext() # Step to next dimension
        
    print("Gradient check passed!")

In [9]:
def sanity_check():
    """
    Some basic sanity checks.
    """
    quad = lambda x: (np.sum(x ** 2), x * 2)

    print("Running sanity checks...")
    gradcheck_naive(quad, np.array(123.456))      # scalar test
    gradcheck_naive(quad, np.random.randn(3,))    # 1-D test
    gradcheck_naive(quad, np.random.randn(4,5))   # 2-D test
    print ("")

In [10]:
sanity_check()


Running sanity checks...
Gradient check passed!
Gradient check passed!
Gradient check passed!
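
The calls to random.setstate(rndstate) matter once the cost function itself uses randomness. The cell below is a hypothetical example (not part of the assignment): a toy cost that draws a random scale on every call; the check only passes because gradcheck_naive resets the RNG state before each evaluation of f.

In [ ]:
def noisy_quad(x):
    # draws a random scale; the same value is seen on every call only because
    # gradcheck_naive resets the RNG state before calling f
    s = 1.0 + random.random()
    return np.sum(s * x ** 2), 2 * s * x

gradcheck_naive(noisy_quad, np.random.randn(3,))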

q2 neural


In [11]:
def forward_backward_prop(data, labels, params, dimensions):
    """ 
    Forward and backward propagation for a two-layer sigmoidal network.

    Compute the forward pass for the cross-entropy cost, and the
    backward pass for the gradients of all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    N = data.shape[0]

    # forward pass: hidden activations and softmax output probabilities
    h = sigmoid(data.dot(W1) + b1)
    y = softmax(h.dot(W2) + b2)

    # average cross-entropy cost over the N examples (labels are one-hot)
    cost = -np.sum(np.log(y[labels == 1])) / N

    # backward pass: gradient w.r.t. the pre-softmax scores is (y - labels) / N
    yp = (y - labels) / N
    gradW2 = h.T.dot(yp)
    gradb2 = np.sum(yp, axis=0)

    # backpropagate through the hidden sigmoid layer
    hp = yp.dot(W2.T) * sigmoid_grad(h)
    gradW1 = data.T.dot(hp)
    gradb1 = np.sum(hp, axis=0)
    
    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), 
        gradW2.flatten(), gradb2.flatten()))
    
    return cost, grad

In [12]:
N = 20
dimensions = [10, 5, 10]
data = np.random.randn(N, dimensions[0])   # each row will be a datum
labels = np.zeros((N, dimensions[2]))
for i in range(N):
    labels[i,random.randint(0,dimensions[2]-1)] = 1

params = np.random.randn((dimensions[0] + 1) * dimensions[1] + (
    dimensions[1] + 1) * dimensions[2], )
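
As an optional sanity check, the flat parameter vector should have exactly the (Dx + 1) * H + (H + 1) * Dy entries that forward_backward_prop unpacks:

In [ ]:
Dx, H, Dy = dimensions
assert len(params) == (Dx + 1) * H + (H + 1) * Dy   # (10+1)*5 + (5+1)*10 = 115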

In [13]:
gradcheck_naive(lambda params: forward_backward_prop(data, labels, params,
    dimensions), params)


Gradient check passed!
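
With the gradient check passing, a few plain gradient-descent steps should also drive the cost down. The cell below is an optional sketch of such a check; the learning rate 0.5 is an arbitrary choice, not specified by the assignment.

In [ ]:
p = params.copy()
for step in range(5):
    cost, grad = forward_backward_prop(data, labels, p, dimensions)
    p -= 0.5 * grad
    print(step, cost)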
