## CS224d Assignment 1

Q1, Q2 Softmax Layer Gradient Checking Neural network Sigmoid Layer

``````

In [ ]:

import numpy as np
import random
import glob
import os.path as op
import cPickle as pickle

``````

q1_softmax.py

``````

In [ ]:

def softmax(x):
"""
Compute the softmax function for each row of the input x.

It is crucial that this function is optimized for speed because
it will be used frequently in later code.
You might find numpy functions np.exp, np.sum, np.reshape,

You should also make sure that your code works for one
dimensional inputs (treat the vector as a row), you might find

You must implement the optimization in problem 1(a) of the
written assignment!
"""

# here we subtract max from each row to make the computation more robust, it doesn't affect overall answer

if(len(x.shape)==1):
x = (np.exp(x - np.max(x)))/(np.sum(np.exp(x - np.max(x))))
else:
rows = x.shape[0]
x = (np.exp(x - np.max(x,axis=1).reshape(rows,1)))/(np.sum(np.exp(x - np.max(x,axis=1).reshape(rows,1)),axis=1).reshape((rows,1)))

return x

``````

``````

In [ ]:

"""
Gradient check for a function f
- f should be a function that takes a single argument and outputs the cost and its gradients
- x is the point (numpy array) to check the gradient at
"""

rndstate = random.getstate()
random.setstate(rndstate)
fx, grad = f(x) # Evaluate function value at original point
h = 1e-4

# Iterate over all indexes in x
while not it.finished:
ix = it.multi_index

### try modifying x[ix] with h defined above to compute numerical gradients
### make sure you call random.setstate(rndstate) before calling f(x) each time, this will make it
### possible to test cost functions with built in randomness later

old = x[ix]
x[ix] = old + h
random.setstate(rndstate)
x[ix] = old - h
random.setstate(rndstate)
numgrad = (fxh1 - fxh2)/(2*h) # classic way of finding gradient for any function

x[ix]=old

if reldiff > 1e-5:
print "First gradient error found at index %s" % str(ix)
print "reldiff: %f Needed: %f" % (reldiff,1e-5)
return

it.iternext() # Step to next dimension

``````

q2_sigmoid.py

``````

In [ ]:

def sigmoid(x):
"""
Compute the sigmoid function for the input here.
"""

x = 1.0/(1.0 + np.exp(-x))

return x

"""
Compute the gradient for the sigmoid function here. Note that
for this implementation, the input f should be the sigmoid
function value of your original input x.
"""

f = f*(1-f)

return f

``````

q2_neural.py

``````

In [ ]:

def forward_backward_prop(data, labels, params, dimensions):
"""
Forward and backward propagation for a two-layer sigmoidal network

Compute the forward propagation and for the cross entropy cost,
and backward propagation for the gradients for all parameters.
"""

### Unpack network parameters (do not modify)
ofs = 0
Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])
# [10,5,10] N = 20
W1 = np.reshape(params[ofs:ofs+ Dx * H], (Dx, H))   # W1 (10,5)
ofs += Dx * H
b1 = np.reshape(params[ofs:ofs + H], (1, H))        # b1 (1,5)
ofs += H
W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))  # W2 (5,10)
ofs += H * Dy
b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))      # b2 (1,10)

### YOUR CODE HERE: forward propagation
z1 = np.dot(data,W1) + b1                           # data(20,10) * W1(10,5) + b1(1,5)= z1(20,5)
h = sigmoid(z1)

z2 = np.dot(h,W2) + b2                              # h(20,5) * W2 (5,10) + b2(1,10) = z2(20,10)
y = softmax(z2)

cost = np.sum(-np.log(y[labels==1]))   # We only need to add those balues of y corresponding to 1's in labels
### YOUR CODE HERE: backward propagation
delta = y - labels

### Stack gradients (do not modify)

``````
``````

In [ ]:

SAVE_PARAMS_EVERY = 1000

``````

q3_sgd.py

``````

In [ ]:

""" A helper function that loads previously saved parameters and resets iteration start """
st = 0
for f in glob.glob("saved_params_*.npy"):
iter = int(op.splitext(op.basename(f))[0].split("_")[2])
if (iter > st):
st = iter

if st > 0:
with open("saved_params_%d.npy" % st, "r") as f:
return st, params, state
else:
return st, None, None

def save_params(iter, params):
with open("saved_params_%d.npy" % iter, "w") as f:
pickle.dump(params, f)
pickle.dump(random.getstate(), f)

def sgd(f, x0, step, iterations, postprocessing = None, useSaved = False, PRINT_EVERY=10):
# Implement the stochastic gradient descent method in this
# function.

# Inputs:
# - f: the function to optimize, it should take a single
#     argument and yield two outputs, a cost and the gradient
#     with respect to the arguments
# - x0: the initial point to start SGD from
# - step: the step size for SGD
# - iterations: total iterations to run SGD for
# - postprocessing: postprocessing function for the parameters
#     if necessary. In the case of word2vec we will need to
#     normalize the word vectors to have unit length.
# - PRINT_EVERY: specifies every how many iterations to output

# Output:
# - x: the parameter value after SGD finishes

# Anneal learning rate every several iterations
ANNEAL_EVERY = 20000

if useSaved:
if start_iter > 0:
x0 = oldx;
step *= 0.5 ** (start_iter / ANNEAL_EVERY)

if state:
random.setstate(state)
else:
start_iter = 0

x = x0

if not postprocessing:
postprocessing = lambda x: x

expcost = None

for iter in xrange(start_iter + 1, iterations + 1):
### Don't forget to apply the postprocessing after every iteration!
### You might want to print the progress every few iterations.

cost = None
x = postprocessing(x)

if iter % PRINT_EVERY == 0:
if not expcost:
expcost = cost
else:
expcost = .95 * expcost + .05 * cost
print "iter %d: %f" % (iter, expcost)

if iter % SAVE_PARAMS_EVERY == 0 and useSaved:
save_params(iter, x)

if iter % ANNEAL_EVERY == 0:
step *= 0.5

return x

``````

q3_word2vec.py

``````

In [ ]:

def normalizeRows(x):
""" Row normalization function """
# Implement a function that normalizes each row of a matrix to have unit length