This is a cs231n module on understanding neural networks. Andrej Karpathy argues that backprop is ultimately a leaky abstraction: without understanding how it works, you can't protect yourself from the problems you can silently cause yourself, like vanishing gradients, exploding gradients and dead ReLUs.
This is, again, my attempt to implement a neural network to understand backprop and truly appreciate it.
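As a tiny illustration of that last point (a toy example of my own, not part of the cs231n code): a ReLU unit whose pre-activation is negative outputs zero and passes zero gradient backwards, so a unit that goes negative for every input never recovers, which is what "dead ReLU" means.
In [ ]:
import numpy as np

z = np.array([-2.0, 0.5])        # pre-activations of two ReLU units
a = np.maximum(0, z)             # ReLU forward pass: [0. , 0.5]
upstream = np.array([1.0, 1.0])  # gradient flowing in from the layer above
dz = upstream * (z > 0)          # ReLU backward pass: [0. , 1.]
print(a, dz)                     # the first unit receives no gradient at all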
In [38]:
#Generating Data
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
In [39]:
N = 100  # the number of training examples per class
D = 2    # the dimensionality, i.e. the number of features per example
K = 3    # the number of classes to classify into
X = np.zeros((N*K, D))  # the data matrix, each row is a single example
# in our case 100*3 examples and each example has two dimensions,
# so the matrix is 300 x 2
Y = np.zeros(N*K, dtype='uint8')  # class labels
for j in range(K):
    ix = range(N*j, N*(j+1))  # indices of the datapoints belonging to class j
    r = np.linspace(0.0, 1, N)  # radius
    t = np.linspace(j*4, (j+1)*4, N) + np.random.randn(N)*0.2  # theta
    # np.linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None)
    # returns `num` evenly spaced samples over the interval [start, stop]
    X[ix] = np.c_[r*np.sin(t), r*np.cos(t)]
    # the line above gives the dataset its spiral structure,
    # which will be clear in the plot below
    # other kinds of datasets could be plugged in here to see how the classifier reacts
    # np.c_ translates slice objects to concatenation along the second axis,
    # i.e. it stacks the two 1-D arrays as the columns of an (N, 2) block
    Y[ix] = j
plt.scatter(X[:, 0], X[:, 1], c=Y, s=40, cmap = plt.cm.Spectral)
plt.xlim([-1,1])
plt.ylim([-1,1])
plt.show()
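A tiny toy check (my own addition, not part of the dataset code) of what np.c_ is doing in the loop above: it stacks 1-D arrays as the columns of a 2-D array.
In [ ]:
u = np.array([1, 2, 3])
v = np.array([4, 5, 6])
print(np.c_[u, v])   # [[1 4] [2 5] [3 6]] -> shape (3, 2)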
In [40]:
# initialize parameters randomly
W = 0.01 * np.random.randn(D,K) #randn(D,K) D * K Matrix
# in our case 2 * 3 Matrix
b = np.zeros((1,K)) # in our case 1 * 3
# some hyperparameters
step_size = 1e-0
reg = 1e-3 # regularization strength
In [41]:
# compute class scores for a linear classifier
scores = np.dot(X, W) + b  # scores is 300 x 3; b is broadcast across the rows
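A quick shape sanity check (my own addition) to make the broadcasting explicit: X is (300, 2) and W is (2, 3), so X.dot(W) is (300, 3), and the (1, 3) bias b is broadcast across all 300 rows when added.
In [ ]:
# verify the shapes involved in scores = np.dot(X, W) + b
print(X.shape, W.shape, b.shape)         # (300, 2) (2, 3) (1, 3)
print(np.dot(X, W).shape, scores.shape)  # (300, 3) (300, 3)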
In [42]:
# Using the cross-entropy (softmax) loss
# full loss = data loss + regularization loss
# It helps to read up on the softmax loss first, but in this case it should
# be fairly intuitive.
num_examples = X.shape[0]
# get unnormalized probabilities
exp_scores = np.exp(scores)
# normalize them for each example
probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
'''We now have an array probs of size [300 x 3], where each row contains the class
probabilities. In particular, since we have normalized them, every row sums to one. We
can now query for the log probabilities assigned to the correct classes in each example.
The array correct_logprobs below is a 1D array of the negative log probabilities
assigned to the correct class of each example; the full loss is then the average of
these values plus the regularization loss:'''
correct_logprobs = -np.log(probs[range(num_examples), Y])
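One practical aside (my own addition, not part of the code above): np.exp can overflow for large scores. Because softmax is invariant to subtracting a constant from every score of an example, a common trick is to subtract the per-row maximum before exponentiating; the probabilities come out identical.
In [ ]:
# numerically stable version of the softmax above; probs_stable is a name
# introduced here, and it should match probs up to floating point error
shifted = scores - np.max(scores, axis=1, keepdims=True)
exp_shifted = np.exp(shifted)
probs_stable = exp_shifted / np.sum(exp_shifted, axis=1, keepdims=True)
print(np.allclose(probs_stable, probs))  # expect True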
In [43]:
# compute the loss: average cross-entropy loss plus regularization
data_loss = np.sum(correct_logprobs) / num_examples
reg_loss = 0.5 * reg * np.sum(W*W)
loss = data_loss + reg_loss
In [44]:
'''The gradient of the loss for example i with respect to the score of class k is:

    dL_i/df_k = p_k - 1(y_i = k)

where 1(.) is the indicator function. Notice how elegant and simple this expression is.
Suppose the probabilities we computed were p = [0.2, 0.3, 0.5], and that the correct
class was the middle one (with probability 0.3). According to this derivation the
gradient on the scores would be df = [0.2, -0.7, 0.5]. Recalling the interpretation of
the gradient, this result is highly intuitive: increasing the first or last element of
the score vector f (the scores of the incorrect classes) leads to an increased loss
(due to the positive signs +0.2 and +0.5) - and increasing the loss is bad, as expected.
However, increasing the score of the correct class has a negative influence on the loss.
The gradient of -0.7 is telling us that increasing the correct class score would lead
to a decrease of the loss L_i, which makes sense.'''
dscores = probs
dscores[range(num_examples), Y] -= 1
dscores /= num_examples
# backprop the gradient from the scores into the parameters (W, b)
dW = np.dot(X.T, dscores)
db = np.sum(dscores, axis=0, keepdims=True)
dW += reg*W  # not forgetting the regularization gradient
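Before updating the parameters, it is worth sanity-checking the analytic gradient numerically. This check is my own addition, not part of the notes' code, and the helper loss_fn and the constant eps are names I made up. A centered finite difference on a couple of entries of W should closely match dW computed above.
In [ ]:
# numerically estimate dL/dW[i,j] for a few entries and compare with dW
def loss_fn(W_, b_):
    s = np.dot(X, W_) + b_
    e = np.exp(s - np.max(s, axis=1, keepdims=True))
    p = e / np.sum(e, axis=1, keepdims=True)
    data = -np.mean(np.log(p[range(num_examples), Y]))
    return data + 0.5 * reg * np.sum(W_ * W_)

eps = 1e-5
for (i_, j_) in [(0, 0), (1, 2)]:
    W_plus, W_minus = W.copy(), W.copy()
    W_plus[i_, j_] += eps
    W_minus[i_, j_] -= eps
    num_grad = (loss_fn(W_plus, b) - loss_fn(W_minus, b)) / (2 * eps)
    print("dW[%d,%d]: analytic %f, numerical %f" % (i_, j_, dW[i_, j_], num_grad))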
In [45]:
# updating the parameters
W += -step_size * dW
b += -step_size * db
In [49]:
#Train a Linear Classifier
# initialize parameters randomly
W = 0.01 * np.random.randn(D,K)
b = np.zeros((1,K))
# some hyperparameters
step_size = 1e-0
reg = 1e-3 # regularization strength
# gradient descent loop
num_examples = X.shape[0]
for i in range(200):

    # evaluate class scores, [N x K]
    scores = np.dot(X, W) + b

    # compute the class probabilities
    exp_scores = np.exp(scores)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)  # [N x K]

    # compute the loss: average cross-entropy loss and regularization
    correct_logprobs = -np.log(probs[range(num_examples), Y])
    data_loss = np.sum(correct_logprobs) / num_examples
    reg_loss = 0.5 * reg * np.sum(W*W)
    loss = data_loss + reg_loss
    if i % 10 == 0:
        print("iteration %d: loss %f" % (i, loss))

    # compute the gradient on the scores
    dscores = probs
    dscores[range(num_examples), Y] -= 1
    dscores /= num_examples

    # backpropagate the gradient to the parameters (W, b)
    dW = np.dot(X.T, dscores)
    db = np.sum(dscores, axis=0, keepdims=True)
    dW += reg*W  # regularization gradient

    # perform a parameter update
    W += -step_size * dW
    b += -step_size * db
In [50]:
# evaluate the training set accuracy
scores = np.dot(X,W) + b
predicted_class = np.argmax(scores, axis=1)
print("training_accuracy: %.2f" % (np.mean(predicted_class==Y)))
In [52]:
# plot the resulting classifier
h = 0.02
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
Z = np.dot(np.c_[xx.ravel(), yy.ravel()], W) + b
Z = np.argmax(Z, axis=1)
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=Y, s=40, cmap=plt.cm.Spectral)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.show()
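A linear classifier cannot carve up the spirals, which is why the training accuracy above stays around 50% (the cs231n notes report about 0.49 on this data). The natural next step in the case study is a small two-layer network with a ReLU hidden layer, trained with exactly the same kind of backprop. Below is a minimal sketch of that extension; the hidden size h, the iteration count and the hyperparameters are my choices here, not taken from the code above.
In [ ]:
h = 100  # size of the hidden layer (my choice)
W1 = 0.01 * np.random.randn(D, h)
b1 = np.zeros((1, h))
W2 = 0.01 * np.random.randn(h, K)
b2 = np.zeros((1, K))

step_size = 1e-0
reg = 1e-3

for i in range(10000):

    # forward pass: ReLU hidden layer, then class scores
    hidden_layer = np.maximum(0, np.dot(X, W1) + b1)  # [N x h]
    scores = np.dot(hidden_layer, W2) + b2            # [N x K]

    # softmax probabilities and loss, same as for the linear classifier
    exp_scores = np.exp(scores - np.max(scores, axis=1, keepdims=True))
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    data_loss = -np.mean(np.log(probs[range(num_examples), Y]))
    reg_loss = 0.5 * reg * (np.sum(W1*W1) + np.sum(W2*W2))
    loss = data_loss + reg_loss
    if i % 1000 == 0:
        print("iteration %d: loss %f" % (i, loss))

    # gradient on the scores, exactly as before
    dscores = probs
    dscores[range(num_examples), Y] -= 1
    dscores /= num_examples

    # backprop into W2, b2, then through the ReLU into W1, b1
    dW2 = np.dot(hidden_layer.T, dscores) + reg * W2
    db2 = np.sum(dscores, axis=0, keepdims=True)
    dhidden = np.dot(dscores, W2.T)
    dhidden[hidden_layer <= 0] = 0  # inactive ReLU units pass no gradient back
    dW1 = np.dot(X.T, dhidden) + reg * W1
    db1 = np.sum(dhidden, axis=0, keepdims=True)

    # parameter update
    W1 += -step_size * dW1
    b1 += -step_size * db1
    W2 += -step_size * dW2
    b2 += -step_size * db2

# evaluate training set accuracy with the learned two-layer network
hidden_layer = np.maximum(0, np.dot(X, W1) + b1)
scores = np.dot(hidden_layer, W2) + b2
predicted_class = np.argmax(scores, axis=1)
print("training_accuracy: %.2f" % (np.mean(predicted_class == Y)))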
In [ ]: