In [1]:
%pylab inline
import numpy as np
import random
from cs224d.data_utils import *
%load_ext autoreload
%autoreload 2
In [2]:
def softmax(x):
    """
    Compute the softmax function for each row of the input x.

    It is crucial that this function is optimized for speed because
    it will be used frequently in later code.

    You might find numpy functions np.exp, np.sum, np.reshape,
    np.max, and numpy broadcasting useful for this task. (numpy
    broadcasting documentation:
    http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)

    You should also make sure that your code works for one-
    dimensional inputs (treat the vector as a row); you might find
    it helpful for your later problems.

    You must implement the optimization in problem 1(a) of the
    written assignment!
    """
    ### YOUR CODE HERE
    try:
        # 2-D input: subtract each row's maximum before exponentiating
        # for numerical stability, then normalize each row.
        c = np.max(x, axis=1)
        expx = np.exp((x.T - c).T)
        x = expx / np.sum(expx, axis=1).reshape(len(expx), 1)
    except:
        # 1-D input (axis=1 is invalid): treat the vector as a single row.
        c = np.max(x)
        expx = np.exp(x - c)
        x = expx / np.sum(expx)
    ### END YOUR CODE
    return x
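As an optional cross-check (not part of the required solution), the same max-subtraction trick can be written without the try/except by using keepdims=True, which handles 1-D and 2-D inputs through a single code path; the name softmax_keepdims below is purely illustrative.
In [ ]:
# Optional cross-check (illustrative only): stabilized softmax via keepdims.
def softmax_keepdims(x):
    shifted = x - np.max(x, axis=-1, keepdims=True)   # subtract row max for stability
    expx = np.exp(shifted)
    return expx / np.sum(expx, axis=-1, keepdims=True)

# Should agree with softmax() on both 1-D and 2-D inputs
print(np.allclose(softmax_keepdims(np.array([1, 2])), softmax(np.array([1, 2]))))
print(np.allclose(softmax_keepdims(np.array([[1001, 1002], [3, 4]])),
                  softmax(np.array([[1001, 1002], [3, 4]]))))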
In [3]:
def test_softmax_basic():
    """
    Some simple tests to get you started.
    Warning: these are not exhaustive.
    """
    print("Running basic tests...")
    test1 = softmax(np.array([1, 2]))
    print(test1)
    assert np.amax(np.fabs(test1 - np.array(
        [0.26894142, 0.73105858]))) <= 1e-6

    test2 = softmax(np.array([[1001, 1002], [3, 4]]))
    print(test2)
    assert np.amax(np.fabs(test2 - np.array(
        [[0.26894142, 0.73105858], [0.26894142, 0.73105858]]))) <= 1e-6

    test3 = softmax(np.array([[-1001, -1002]]))
    print(test3)
    assert np.amax(np.fabs(test3 - np.array(
        [0.73105858, 0.26894142]))) <= 1e-6

    print("You should verify these results!\n")
In [4]:
test_softmax_basic()
In [5]:
def sigmoid(x):
    """
    Compute the sigmoid function for the input here.
    """
    x = 1. / (1 + np.exp(-x))
    return x

def sigmoid_grad(f):
    """
    Compute the gradient for the sigmoid function here. Note that
    for this implementation, the input f should be the sigmoid
    function value of your original input x.
    """
    f = f * (1 - f)
    return f
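A quick optional check (not required by the assignment): because sigmoid_grad takes the sigmoid value f rather than the original input x, it can be compared against a centered finite difference of sigmoid itself.
In [ ]:
# Optional check: sigmoid_grad(sigmoid(x)) should match a centered
# finite-difference estimate of d(sigmoid)/dx.
x_check = np.linspace(-5, 5, 11)
eps = 1e-5
numeric = (sigmoid(x_check + eps) - sigmoid(x_check - eps)) / (2 * eps)
analytic = sigmoid_grad(sigmoid(x_check))
print(np.max(np.abs(numeric - analytic)))   # expect a very small value, around 1e-10 or less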
In [6]:
def test_sigmoid_basic():
    """
    Some simple tests to get you started.
    Warning: these are not exhaustive.
    """
    print("Running basic tests...")
    x = np.array([[1, 2], [-1, -2]])
    f = sigmoid(x)
    g = sigmoid_grad(f)
    print(f)
    assert np.amax(np.fabs(f - np.array([[0.73105858, 0.88079708],
        [0.26894142, 0.11920292]]))) <= 1e-6
    print(g)
    assert np.amax(np.fabs(g - np.array([[0.19661193, 0.10499359],
        [0.19661193, 0.10499359]]))) <= 1e-6
    print("You should verify these results!\n")
In [7]:
test_sigmoid_basic()
In [8]:
def gradcheck_naive(f, x):
    """
    Gradient check for a function f
    - f should be a function that takes a single argument and outputs the cost and its gradients
    - x is the point (numpy array) to check the gradient at
    """
    rndstate = random.getstate()
    random.setstate(rndstate)
    fx, grad = f(x)  # Evaluate function value at original point
    h = 1e-4

    # Iterate over all indices in x
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index

        ### try modifying x[ix] with h defined above to compute numerical gradients
        ### make sure you call random.setstate(rndstate) before calling f(x) each time, this will make it
        ### possible to test cost functions with built in randomness later
        ### YOUR CODE HERE:
        x_b = x.copy()
        x_a = x.copy()
        x_b[ix] = x[ix] + h
        x_a[ix] = x[ix] - h
        random.setstate(rndstate)
        fx_b, grad_new = f(x_b)
        random.setstate(rndstate)
        fx_a, grad_new = f(x_a)
        numgrad = (fx_b - fx_a) / (2 * h)
        ### END YOUR CODE

        # Compare gradients
        reldiff = abs(numgrad - grad[ix]) / max(1, abs(numgrad), abs(grad[ix]))
        if reldiff > 1e-5:
            print("Gradient check failed.")
            print("First gradient error found at index %s" % str(ix))
            print("Your gradient: %f \t Numerical gradient: %f" % (grad[ix], numgrad))
            return

        it.iternext()  # Step to next dimension

    print("Gradient check passed!")
In [9]:
def sanity_check():
    """
    Some basic sanity checks.
    """
    quad = lambda x: (np.sum(x ** 2), x * 2)

    print("Running sanity checks...")
    gradcheck_naive(quad, np.array(123.456))       # scalar test
    gradcheck_naive(quad, np.random.randn(3,))     # 1-D test
    gradcheck_naive(quad, np.random.randn(4, 5))   # 2-D test
    print("")
In [10]:
sanity_check()
In [11]:
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network.
    Compute the forward propagation and the cross entropy cost, and
    the backward propagation for the gradients of all parameters.
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    N = data.shape[0]

    # Forward propagation
    h = sigmoid(data.dot(W1) + b1)               # hidden activations, shape (N, H)
    y = softmax(h.dot(W2) + b2)                  # predicted class probabilities, shape (N, Dy)
    cost = -np.sum(np.log(y[labels == 1])) / N   # mean cross-entropy over the batch

    # Backward propagation
    yp = (y - labels) / N                        # gradient of the cost w.r.t. the softmax input
    gradW2 = h.T.dot(yp)
    gradb2 = np.sum(yp, axis=0)
    hp = yp.dot(W2.T) * sigmoid_grad(h)          # back-propagate through the sigmoid layer
    gradW1 = data.T.dot(hp)
    gradb1 = np.sum(hp, axis=0)

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
        gradW2.flatten(), gradb2.flatten()))

    return cost, grad
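As a small illustrative aside (the function name forward_only is an assumption, not part of the starter code), the same parameter unpacking can be reused for a forward-only pass that returns the predicted class probabilities, which is handy for inspecting the network's output on the random data set up below.
In [ ]:
# Illustrative sketch: forward pass only, reusing the parameter layout above.
def forward_only(data, params, dimensions):
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])
    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))
    hidden = sigmoid(data.dot(W1) + b1)
    return softmax(hidden.dot(W2) + b2)      # shape (N, Dy); each row sums to 1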
In [12]:
N = 20
dimensions = [10, 5, 10]
data = np.random.randn(N, dimensions[0])    # each row will be a datum
labels = np.zeros((N, dimensions[2]))
for i in range(N):
    labels[i, random.randint(0, dimensions[2] - 1)] = 1

params = np.random.randn((dimensions[0] + 1) * dimensions[1] + (
    dimensions[1] + 1) * dimensions[2], )
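A quick optional consistency check: the length of the random parameter vector should match the layout unpacked in forward_backward_prop, i.e. Dx*H + H + H*Dy + Dy = (Dx + 1)*H + (H + 1)*Dy.
In [ ]:
# Optional check: the flat parameter vector length matches the unpacking layout.
Dx, H, Dy = dimensions
assert params.size == Dx * H + H + H * Dy + Dy    # 10*5 + 5 + 5*10 + 10 = 115
print(params.size)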
In [13]:
gradcheck_naive(lambda params: forward_backward_prop(data, labels, params,
    dimensions), params)