In [ ]:
import numpy as np
import random
import glob
import os.path as op
import cPickle as pickle
q1_softmax.py
In [ ]:
def softmax(x):
    """
    Compute the softmax function for each row of the input x.

    It is crucial that this function is optimized for speed because
    it will be used frequently in later code.

    You might find numpy functions np.exp, np.sum, np.reshape,
    np.max, and numpy broadcasting useful for this task. (numpy
    broadcasting documentation:
    http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)

    You should also make sure that your code works for one
    dimensional inputs (treat the vector as a row); you might find
    it helpful for your later problems.

    You must implement the optimization in problem 1(a) of the
    written assignment!
    """
    ### YOUR CODE HERE
    # Subtract the (row-wise) max before exponentiating: this keeps np.exp
    # from overflowing and does not change the result, since softmax is
    # invariant to adding a constant to every component of a row.
    if len(x.shape) == 1:
        shifted = np.exp(x - np.max(x))
        x = shifted / np.sum(shifted)
    else:
        rows = x.shape[0]
        shifted = np.exp(x - np.max(x, axis=1).reshape(rows, 1))
        x = shifted / np.sum(shifted, axis=1).reshape(rows, 1)
    ### END YOUR CODE
    return x
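A quick sanity check (illustrative, not part of the starter code): each row of the output should sum to one, and the result should be shift-invariant, i.e. softmax(x) == softmax(x + c) for any constant c.
In [ ]:
# Illustrative check: rows sum to 1, and adding a constant per row changes nothing.
test = softmax(np.array([[1001., 1002.], [3., 4.]]))
print test                          # each row sums to 1
print softmax(np.array([1., 2.]))   # a 1-D input is treated as a single row
assert np.allclose(test, softmax(np.array([[1., 2.], [3., 4.]])))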
q2_gradcheck.py
In [ ]:
def gradcheck_naive(f, x):
    """
    Gradient check for a function f
    - f should be a function that takes a single argument and outputs the
      cost and its gradients
    - x is the point (numpy array) to check the gradient at
    """

    rndstate = random.getstate()
    random.setstate(rndstate)
    fx, grad = f(x)  # Evaluate function value at original point
    h = 1e-4

    # Iterate over all indexes in x
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index

        ### try modifying x[ix] with h defined above to compute numerical gradients
        ### make sure you call random.setstate(rndstate) before calling f(x) each
        ### time, this will make it possible to test cost functions with built in
        ### randomness later
        ### YOUR CODE HERE:
        old = x[ix]

        x[ix] = old + h
        random.setstate(rndstate)
        fxh1, grad1 = f(x)

        x[ix] = old - h
        random.setstate(rndstate)
        fxh2, grad2 = f(x)

        # Central-difference approximation of the partial derivative at ix
        numgrad = (fxh1 - fxh2) / (2 * h)

        x[ix] = old  # restore the original value
        ### END YOUR CODE

        # Compare gradients
        reldiff = abs(numgrad - grad[ix]) / max(1, abs(numgrad), abs(grad[ix]))
        if reldiff > 1e-5:
            print "Gradient check failed."
            print "First gradient error found at index %s" % str(ix)
            print "Your gradient: %f \t Numerical gradient: %f" % (grad[ix], numgrad)
            print "reldiff: %f Needed: %f" % (reldiff, 1e-5)
            return

        it.iternext()  # Step to next dimension

    print "Gradient check passed!"
q2_sigmoid.py
In [ ]:
def sigmoid(x):
    """
    Compute the sigmoid function for the input here.
    """
    ### YOUR CODE HERE
    x = 1.0 / (1.0 + np.exp(-x))
    ### END YOUR CODE
    return x

def sigmoid_grad(f):
    """
    Compute the gradient for the sigmoid function here. Note that
    for this implementation, the input f should be the sigmoid
    function value of your original input x.
    """
    ### YOUR CODE HERE
    # If f = sigmoid(x), then d sigmoid / dx = sigmoid(x) * (1 - sigmoid(x)) = f * (1 - f)
    f = f * (1 - f)
    ### END YOUR CODE
    return f
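A quick numerical check (illustrative): sigmoid(1) is about 0.7311 and the gradient there, computed from the sigmoid value itself, is about 0.1966.
In [ ]:
# Illustrative check of sigmoid and its gradient at a few points.
x = np.array([[1, 2], [-1, -2]], dtype=float)
f = sigmoid(x)
print f                 # approx [[0.7311, 0.8808], [0.2689, 0.1192]]
print sigmoid_grad(f)   # approx [[0.1966, 0.1050], [0.1966, 0.1050]]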
q2_neural.py
In [ ]:
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network.

    Compute the forward propagation and the cross entropy cost, and the
    backward propagation for the gradients of all parameters.
    """

    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])
    # Example shapes below assume dimensions = [10, 5, 10] and N = 20 examples.
    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))  # W1: (Dx, H) = (10, 5)
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))         # b1: (1, H) = (1, 5)
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))   # W2: (H, Dy) = (5, 10)
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))       # b2: (1, Dy) = (1, 10)

    ### YOUR CODE HERE: forward propagation
    z1 = np.dot(data, W1) + b1   # (N, Dx) x (Dx, H) + (1, H) -> (N, H)
    h = sigmoid(z1)
    z2 = np.dot(h, W2) + b2      # (N, H) x (H, Dy) + (1, Dy) -> (N, Dy)
    y = softmax(z2)
    ### END YOUR CODE

    # Cross-entropy cost: with one-hot labels, only the log-probabilities at
    # the positions where labels == 1 contribute to the sum.
    cost = np.sum(-np.log(y[labels == 1]))

    ### YOUR CODE HERE: backward propagation
    delta = y - labels                                                    # gradient w.r.t. z2
    gradb2 = np.sum(delta, axis=0)
    gradW2 = np.dot(h.transpose(), delta)
    delta1 = np.multiply(sigmoid_grad(h), np.dot(delta, W2.transpose()))  # gradient w.r.t. z1
    gradb1 = np.sum(delta1, axis=0)
    gradW1 = np.dot(data.transpose(), delta1)
    ### END YOUR CODE

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
        gradW2.flatten(), gradb2.flatten()))

    return cost, grad
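A hedged sanity check (the shapes and random one-hot labels are illustrative): verify the analytic gradients of the two-layer network numerically with gradcheck_naive.
In [ ]:
# Illustrative gradient check for the two-layer net on random one-hot data.
N = 20
dimensions = [10, 5, 10]
data = np.random.randn(N, dimensions[0])
labels = np.zeros((N, dimensions[2]))
for i in xrange(N):
    labels[i, random.randint(0, dimensions[2] - 1)] = 1
params = np.random.randn((dimensions[0] + 1) * dimensions[1] +
                         (dimensions[1] + 1) * dimensions[2])
gradcheck_naive(lambda params: forward_backward_prop(data, labels, params,
    dimensions), params)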
q3_sgd.py
In [ ]:
SAVE_PARAMS_EVERY = 1000
In [ ]:
def load_saved_params():
    """ A helper function that loads previously saved parameters and resets iteration start """
    st = 0
    for f in glob.glob("saved_params_*.npy"):
        iter = int(op.splitext(op.basename(f))[0].split("_")[2])
        if iter > st:
            st = iter

    if st > 0:
        with open("saved_params_%d.npy" % st, "r") as f:
            params = pickle.load(f)
            state = pickle.load(f)
        return st, params, state
    else:
        return st, None, None

def save_params(iter, params):
    with open("saved_params_%d.npy" % iter, "w") as f:
        pickle.dump(params, f)
        pickle.dump(random.getstate(), f)
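A small illustrative roundtrip of the checkpoint helpers above (it writes a saved_params_*.npy file to the working directory; the printed iteration is 100 only if this is the most recent checkpoint on disk).
In [ ]:
# Illustrative save/load roundtrip for the checkpoint helpers.
save_params(100, np.arange(5, dtype=float))
st, params, state = load_saved_params()
print st        # highest saved iteration found on disk (100 here if no later file exists)
print params    # [ 0.  1.  2.  3.  4.]
random.setstate(state)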
def sgd(f, x0, step, iterations, postprocessing=None, useSaved=False,
        PRINT_EVERY=10):
    """ Stochastic Gradient Descent """
    # Implement the stochastic gradient descent method in this
    # function.

    # Inputs:
    #   - f: the function to optimize, it should take a single
    #        argument and yield two outputs, a cost and the gradient
    #        with respect to the arguments
    #   - x0: the initial point to start SGD from
    #   - step: the step size for SGD
    #   - iterations: total iterations to run SGD for
    #   - postprocessing: postprocessing function for the parameters
    #        if necessary. In the case of word2vec we will need to
    #        normalize the word vectors to have unit length.
    #   - PRINT_EVERY: specifies every how many iterations to output

    # Output:
    #   - x: the parameter value after SGD finishes

    # Anneal learning rate every several iterations
    ANNEAL_EVERY = 20000

    if useSaved:
        start_iter, oldx, state = load_saved_params()
        if start_iter > 0:
            x0 = oldx
            step *= 0.5 ** (start_iter / ANNEAL_EVERY)
        if state:
            random.setstate(state)
    else:
        start_iter = 0

    x = x0

    if not postprocessing:
        postprocessing = lambda x: x

    expcost = None

    for iter in xrange(start_iter + 1, iterations + 1):
        ### Don't forget to apply the postprocessing after every iteration!
        ### You might want to print the progress every few iterations.
        cost = None
        ### YOUR CODE HERE
        cost, grad = f(x)
        x = x - step * grad  # vanilla SGD update
        x = postprocessing(x)
        ### END YOUR CODE

        if iter % PRINT_EVERY == 0:
            # Report an exponential moving average of the cost to smooth out
            # the noise of individual minibatches.
            if not expcost:
                expcost = cost
            else:
                expcost = .95 * expcost + .05 * cost
            print "iter %d: %f" % (iter, expcost)

        if iter % SAVE_PARAMS_EVERY == 0 and useSaved:
            save_params(iter, x)

        if iter % ANNEAL_EVERY == 0:
            step *= 0.5

    return x
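A minimal usage sketch (the quadratic objective is illustrative): SGD on sum(x**2) has its minimum at x = 0, so the final parameter should be close to zero.
In [ ]:
# Illustrative run of sgd on f(x) = sum(x**2), whose gradient is 2x.
quad = lambda x: (np.sum(x ** 2), 2 * x)
t1 = sgd(quad, np.array(0.5), 0.01, 1000, PRINT_EVERY=100)
print "final parameter:", t1   # should be close to 0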
q3_word2vec.py
In [ ]:
def normalizeRows(x):
    """ Row normalization function """
    # Implement a function that normalizes each row of a matrix to have unit length
    ### YOUR CODE HERE
    # Divide each row by its L2 norm; keepdims=True preserves the (N, 1) shape
    # so broadcasting divides every element of a row by that row's norm.
    x = x / np.sqrt(np.sum(x ** 2, axis=1, keepdims=True))
    ### END YOUR CODE
    return x
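A quick check (illustrative): after normalization every row should have unit L2 norm.
In [ ]:
# Illustrative check: each row of the result has unit L2 norm.
x = normalizeRows(np.array([[3.0, 4.0], [1.0, 2.0]]))
print x                                # approx [[0.6, 0.8], [0.4472, 0.8944]]
print np.sqrt(np.sum(x ** 2, axis=1))  # approx [1., 1.]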