Gradient Vanishing

A simplified version of Vanishing Gradients: the regularization step and redundant code are removed to introduce gradient vanishing as simply as possible. A sketch of the underlying chain-rule argument is given below.
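
As a quick reminder of why depth causes the problem (a generic sketch; the layer notation $h_k$, $W_k$ and depth $n$ below are illustrative and not tied to the code that follows): by the chain rule, the gradient that reaches the first layer is a product of per-layer Jacobians, so if each factor has norm smaller than 1 the product shrinks roughly exponentially with depth.

$$
\frac{\partial L}{\partial W_1}
= \frac{\partial L}{\partial h_n}
  \left(\prod_{k=2}^{n}\frac{\partial h_k}{\partial h_{k-1}}\right)
  \frac{\partial h_1}{\partial W_1},
\qquad
\left\|\prod_{k=2}^{n}\frac{\partial h_k}{\partial h_{k-1}}\right\|
\le \prod_{k=2}^{n}\left\|\frac{\partial h_k}{\partial h_{k-1}}\right\|
$$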


In [15]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

np.random.seed(0)

N = 100 # number of points per class
D = 2 # dimensionality
K = 3 # number of classes

X = np.zeros((N*K,D))
y = np.zeros(N*K, dtype='uint8')

for j in range(K):
  ix = range(N*j,N*(j+1))
  r = np.linspace(0.0,1,N) # radius
  t = np.linspace(j*4,(j+1)*4,N) + np.random.randn(N)*0.2 # theta
  X[ix] = np.c_[r*np.sin(t), r*np.cos(t)]
  y[ix] = j

fig = plt.figure()
plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.Spectral)
plt.xlim([-1,1])
plt.ylim([-1,1])


Out[15]:
(-1, 1)

In [20]:
def relu(x):
    return np.maximum(0, x)

def softmax(x):
    if x.ndim > 1:
        # batched input: shift and normalize each row separately
        shiftx = x - np.amax(x, axis=1)[:, None]
        exps = np.exp(shiftx)
        x = exps / np.sum(exps, axis=1)[:, None]
    else:
        # single vector
        shiftx = x - np.amax(x)
        exps = np.exp(shiftx)
        x = exps / np.sum(exps)
    return x

def loss(y, y_):
    return - np.sum(np.log(y_[range(y.shape[0]), y])) / y.shape[0]

loss_r = []
for step in range(1):
    w1 = 0.1 * np.random.randn(D, 50)
    b1 = np.zeros((1, 50))
    w2 = 0.1 * np.random.randn(50, 50)
    b2 = np.zeros((1, 50))
    w3 = 0.1 * np.random.randn(50, K)
    b3 = np.zeros((1, K))

    # forward pass through the 3-layer network
    h1 = relu(np.dot(X, w1) + b1)
    h2 = relu(np.dot(h1, w2) + b2)
    y_ = softmax(np.dot(h2, w3) + b3)
    
    loss_r.append(loss(y, y_))
    
    # backpropagation: for softmax + cross-entropy, the gradient w.r.t. the
    # logits is (predicted probabilities - one-hot labels) / N
    grad_a3 = y_.copy()
    grad_a3[range(y.shape[0]), y] -= 1
    grad_a3 /= y.shape[0]
    grad_w3 = np.dot(h2.T, grad_a3)

In [ ]:
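# The cell above stops partway through backpropagation. What follows is a
# minimal sketch of how it could continue (an assumed continuation, not the
# original author's code), reusing X, h1, h2, w2, w3, grad_a3 and grad_w3
# from the previous cell. Printing per-layer gradient norms is one simple way
# to watch the gradient magnitude change as it flows back toward the input.
grad_h2 = np.dot(grad_a3, w3.T)   # gradient flowing into the second hidden layer
grad_a2 = grad_h2 * (h2 > 0)      # ReLU gate: zero where the unit was inactive
grad_w2 = np.dot(h1.T, grad_a2)
grad_b2 = np.sum(grad_a2, axis=0, keepdims=True)

grad_h1 = np.dot(grad_a2, w2.T)   # gradient flowing into the first hidden layer
grad_a1 = grad_h1 * (h1 > 0)
grad_w1 = np.dot(X.T, grad_a1)
grad_b1 = np.sum(grad_a1, axis=0, keepdims=True)

# Compare gradient magnitudes layer by layer: with small initial weights the
# norm tends to shrink as the gradient moves back toward the input layer.
for name, g in [('w3', grad_w3), ('w2', grad_w2), ('w1', grad_w1)]:
    print(name, np.linalg.norm(g))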