In [ ]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from IPython import display
In [ ]:
# Define the vector of input samples as x, with 20 values sampled from a uniform distribution
# between 0 and 1
x = np.random.uniform(0, 1, 20)
# Generate the target values t from x with small gaussian noise so the estimation won't
# be perfect.
# Define a function f that represents the line that generates t without noise
def f(x): return x * 2
# Create the targets t with some gaussian noise
noise_std = 0.2  # Standard deviation of the Gaussian noise
# Gaussian noise error for each sample in x
noise = np.random.randn(x.shape[0]) * noise_std
# Create targets t
t = f(x) + noise
In [ ]:
plt.plot(x, t, 'o', label='t')
# Plot the true line f(x) that generates the targets
plt.plot([0, 1], [f(0), f(1)], 'b-', label='f(x)')
plt.xlabel('$x$', fontsize=15)
plt.ylabel('$t$', fontsize=15)
plt.ylim([0,2])
plt.title('inputs (x) vs targets (t)')
plt.grid()
plt.legend(loc=2)
plt.show()
In [ ]:
# Define the neural network function y = x * w
def nn(x, w): return x * w
# Define the cost function
def cost(y, t): return ((t - y)**2).sum()
In [ ]:
# Define a vector of weights for which we want to plot the cost
ws = np.linspace(0, 4, num=100) # weight values
cost_ws = np.vectorize(lambda w: cost(nn(x, w) , t))(ws) # cost for each weight in ws
# Plot
plt.plot(ws, cost_ws, 'r-')
plt.xlabel('$w$', fontsize=15)
plt.ylabel('$\\xi$', fontsize=15)
plt.title('cost vs. weight')
plt.grid()
plt.show()
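The cost above is quadratic in w, so the grid point with the lowest cost can be read off directly (a small added check, not in the original notebook):
In [ ]:
# Weight value on the grid with the lowest cost
w_min = ws[np.argmin(cost_ws)]
print('weight with lowest cost on the grid: {:.3f}'.format(w_min))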
In [ ]:
# define the gradient function. Remember that y = nn(x, w) = x * w
def gradient(w, x, t):
    return 2 * x * (nn(x, w) - t)

# define the update function delta w
def delta_w(w_k, x, t, learning_rate):
    return learning_rate * gradient(w_k, x, t).sum()
# Set the initial weight parameter
w = 0.1
# Set the learning rate
learning_rate = 0.1
# Start performing the gradient descent updates, and print the weights and cost:
nb_of_iterations = 4 # number of gradient descent updates
w_cost = [(w, cost(nn(x, w), t))] # List to store the weight,costs values
for i in range(nb_of_iterations):
    dw = delta_w(w, x, t, learning_rate)  # Get the delta w update
    w = w - dw  # Update the current weight parameter
    w_cost.append((w, cost(nn(x, w), t)))  # Add weight,cost to list
# Print the final w, and cost
for i in range(0, len(w_cost)):
    print('w({}): {:.4f} \t cost: {:.4f}'.format(i, w_cost[i][0], w_cost[i][1]))
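As a sanity check (an added cell, not part of the original notebook), this 1-D least-squares problem has a closed-form solution: setting d/dw Σ(t - xw)² = 0 gives w* = Σ(x·t) / Σ(x²), which the gradient-descent estimate should approach:
In [ ]:
# Closed-form least-squares weight for y = x * w, to compare with gradient descent
w_closed_form = np.sum(x * t) / np.sum(x * x)
print('closed-form w: {:.4f}'.format(w_closed_form))
print('gradient-descent w: {:.4f}'.format(w_cost[-1][0]))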
In [ ]:
num_ex = 10000
x1 = np.random.randn(num_ex)
x2 = np.random.randn(num_ex)
X = np.array([x1, x2]).T
y = X[:,0]**2 + (X[:,1]-1)**2 + np.random.rand(num_ex)
y = y.reshape(-1,1)
In [ ]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X[:,0], X[:,1], y)
In [ ]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
In [ ]:
nn_input_dim = 2
hly1_n = 30
hly2_n = 30
nn_output_dim = 1
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.hl1 = nn.Linear(nn_input_dim, hly1_n)
        self.hl2 = nn.Linear(hly1_n, hly2_n)
        self.out = nn.Linear(hly2_n, nn_output_dim)

    def forward(self, x):
        x = F.leaky_relu(self.hl1(x))
        x = F.leaky_relu(self.hl2(x))
        x = self.out(x)
        return x

net = Net()
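A quick look (an added cell, not in the original) at the size of the network just defined:
In [ ]:
# Count the learnable parameters: (2*30 + 30) + (30*30 + 30) + (30*1 + 1) = 1051
n_params = sum(p.numel() for p in net.parameters())
print(net)
print('number of learnable parameters:', n_params)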
In [ ]:
params = list(net.parameters())
input = Variable(torch.from_numpy(X))
input = input.type(torch.FloatTensor)
out = net(input)
net.zero_grad()
target = Variable(torch.from_numpy(y))
target = target.type(torch.FloatTensor)
criterion = nn.MSELoss()
loss = criterion(out, target)
optimizer = optim.SGD(net.parameters(), lr=0.01)
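For intuition (a rough sketch, not from the original notebook), optimizer.step() for plain SGD is just a gradient step on each parameter; the cell below performs one such update by hand before the real training loop:
In [ ]:
# One manual SGD step, equivalent to loss.backward() followed by optimizer.step() with lr=0.01
net.zero_grad()
loss = criterion(net(input), target)
loss.backward()
for p in net.parameters():
    p.data -= 0.01 * p.grad.data  # same update rule that optim.SGD applies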
In [ ]:
n_epochs = 10000
for epoch in range(n_epochs):
    # in your training loop:
    optimizer.zero_grad()  # zero the gradient buffers
    output = net(input)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
    if epoch % 2000 == 0:
        print('epoch: {0}, loss: {1}'.format(epoch + 1, loss.item()))
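The targets were generated with additive Uniform(0, 1) noise, whose variance is 1/12 ≈ 0.083, so that is roughly the lowest MSE the network can reach once the bias absorbs the noise mean; a quick check (an added cell, not in the original):
In [ ]:
# Compare the final training MSE against the variance of the injected noise
final_mse = criterion(net(input), target).item()
print('final training MSE: {:.4f} (noise variance 1/12 ≈ 0.083)'.format(final_mse))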
In [ ]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X[:,0], X[:,1], y)
In [ ]:
preds = output.detach().numpy()  # predictions as a NumPy array for plotting
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X[:,0], X[:,1], preds)
In [ ]:
y
In [ ]:
preds
In [ ]:
net.hl1.weight
In [ ]:
def calc_loss(model, y_true, y_pred, reg):
    W1, b1, W2, b2, W3, b3 = model['W1'], model['b1'], model['W2'], model['b2'], model['W3'], model['b3']
    data_loss = np.mean(1/2*(y_true - y_pred)**2)
    reg_loss = 1/2*reg * (np.sum(W1**2) + np.sum(W2**2) + np.sum(W3**2))
    return data_loss + reg_loss
In [ ]:
net.hl1.weight
In [ ]:
model['W1']
In [ ]:
dW3
In [ ]:
a2.T
In [ ]:
a2.T[0][0]
In [ ]:
a2
In [ ]:
np.dot(a2.T,dout_1)
In [ ]:
hidden_layer_2.shape
In [ ]:
dout_1.shape
In [ ]:
W3.shape
In [ ]:
ln_rate = 0.00001
reg = 0.01
model = {}
model['W1'] = .1 * np.random.rand(nn_input_dim, hly1_n)
model['b1'] = .1 * np.zeros((1, hly1_n))
model['W2'] = .1 * np.random.rand(hly1_n, hly2_n)
model['b2'] = .1 * np.zeros((1, hly2_n))
model['W3'] = .1 * np.random.rand(hly2_n, nn_output_dim)
model['b3'] = .1 * np.zeros((1, nn_output_dim))
W1, b1, W2, b2, W3, b3 = model['W1'], model['b1'], model['W2'], model['b2'], model['W3'], model['b3']
epoch_list = []
losses_list = []
lr_list = []
for epoch in range(10000):
    # forward pass
    # z1 = np.dot(X,W1) + b1
    # a1 = np.maximum(.01*z1, z1)
    hidden_layer_1 = np.maximum(.01*(np.dot(X, W1) + b1), np.dot(X, W1) + b1)
    # z2 = np.dot(a1,W2) + b2
    # a2 = np.maximum(.01*z2, z2)
    hidden_layer_2 = np.maximum(.01*(np.dot(hidden_layer_1, W2) + b2), np.dot(hidden_layer_1, W2) + b2)
    out1 = np.dot(hidden_layer_2, W3) + b3
    # compute loss
    loss = calc_loss(model, y, out1, reg)
    # backward pass gradients
    # d____ means gradient backward at ____
    dout_1 = -(y - out1)  # grad into dout_1
    dW3 = np.dot(hidden_layer_2.T, dout_1) + (reg * W3)
    # dW3 = np.dot(a2.T,dout_1) + (reg * W3)
    db3 = np.dot(np.ones((num_ex, 1)).T, dout_1)
    # da2 = np.dot(dout_1,W3.T)
    # dz2 = np.maximum(.01*da2, da2)
    # dW2 = np.dot(a1.T, dz2) + (reg * W2)
    dhidden_layer_2 = np.dot(dout_1, W3.T)
    # leaky ReLU derivative: 1 where the pre-activation was positive, 0.01 elsewhere
    dhidden_layer_2 *= np.where(hidden_layer_2 > 0, 1.0, 0.01)
    dW2 = np.dot(hidden_layer_1.T, dhidden_layer_2) + (reg * W2)
    db2 = np.dot(np.ones((num_ex, 1)).T, dhidden_layer_2)
    dhidden_layer_1 = np.dot(dhidden_layer_2, W2.T)
    dhidden_layer_1 *= np.where(hidden_layer_1 > 0, 1.0, 0.01)  # leaky ReLU derivative for layer 1
    # da1 = np.dot(dz2, W2.T)
    # dz1 = np.maximum(.01*da1, da1)
    dW1 = np.dot(X.T, dhidden_layer_1) + (reg * W1)
    db1 = np.dot(np.ones((num_ex, 1)).T, dhidden_layer_1)
    if epoch == 0:
        lowest_loss = loss
    if loss <= lowest_loss:
        lowest_loss = loss  # track the best loss seen so far
        W1 += -ln_rate*dW1
        b1 += -ln_rate*db1
        W2 += -ln_rate*dW2
        b2 += -ln_rate*db2
        W3 += -ln_rate*dW3
        b3 += -ln_rate*db3
    # if loss > lowest_loss:
    #     break
    epoch_list.append(epoch)
    losses_list.append(loss)
    lr_list.append(ln_rate)
    if epoch % 100 == 0:
        print('epoch: {0}, loss: {1}'.format(epoch+1, loss))
# plt.plot(epoch_list, losses_list, 'k-')
# display.clear_output(wait=True)
# display.display(plt.gcf())
# #plt.pause(.05)
# # plt.plot(epoch_list, lr_list, 'r-')
# plt.show()
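The loop above records epoch_list and losses_list; plotting them shows whether the hand-rolled gradient descent is actually converging (this added cell mirrors the commented-out plotting code):
In [ ]:
# Plot the recorded training loss against the epoch number
plt.figure()
plt.plot(epoch_list, losses_list, 'k-')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.title('manual gradient descent: loss vs. epoch')
plt.grid()
plt.show()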
In [ ]:
W1
In [ ]:
W1
In [ ]:
dW1
In [ ]:
W1
In [ ]:
W2
In [ ]:
W3
In [ ]:
dW3
In [ ]:
b1
In [ ]:
b2
In [ ]:
b3
In [ ]:
def forward_pass(model):
    W1, b1, W2, b2, W3, b3 = model['W1'], model['b1'], model['W2'], model['b2'], model['W3'], model['b3']
    z1 = np.dot(X, W1) + b1
    a1 = np.maximum(0, z1)
    z2 = np.dot(a1, W2) + b2
    a2 = np.maximum(0, z2)
    y_hat = np.dot(a2, W3) + b3
    return y_hat

def calc_loss(model, y_true, y_pred, reg):
    W1, b1, W2, b2, W3, b3 = model['W1'], model['b1'], model['W2'], model['b2'], model['W3'], model['b3']
    data_loss = np.mean(1/2*(y_true - y_pred)**2)
    reg_loss = reg * (np.sum(W1**2) + np.sum(W2**2) + np.sum(W3**2))
    return data_loss + reg_loss
#forward_pass
y_hat = forward_pass(model)
# backward pass gradients
# d____ means gradient at ____
loss = calc_loss(model, y, y_hat, reg)
W1, b1, W2, b2, W3, b3 = model['W1'], model['b1'], model['W2'], model['b2'], model['W3'], model['b3']
dloss_wrt_dy_hat = -(y - y_hat) #loss wrt y_hat
dy_hat_wrt_da2 = W3.copy() #y_hat wrt a2 (copy so the masking below does not overwrite W3 in place)
dloss_wrt_da2 = np.dot(W3,dloss_wrt_dy_hat.T)
da2_wrt_dz2 = dy_hat_wrt_da2 #a2 wrt relu of z2
da2_wrt_dz2[da2_wrt_dz2 < 0] = 0
dloss_wrt_dz2 = dloss_wrt_da2.copy()
dloss_wrt_dz2[dloss_wrt_dz2 < 0] = 0
dz2_wrt_da1 = W2.copy() #z2 wrt a1 (copy to avoid modifying W2 in place)
dloss_wrt_da1 = np.dot(W2, dloss_wrt_dz2)
da1_wrt_dz1 = dz2_wrt_da1 #a1 wrt relu of z1
da1_wrt_dz1[da1_wrt_dz1 < 0] = 0
dloss_wrt_dz1 = dloss_wrt_da1.copy()
dloss_wrt_dz1[dloss_wrt_dz1 < 0] = 0
dz1_wrt_X = W1 #z1 wrt X
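When deriving backprop by hand like this, a finite-difference check is a useful way to validate an analytic gradient (an added sketch, using the model, forward_pass and calc_loss defined above):
In [ ]:
# Numerical gradient of the loss w.r.t. one entry of W3, by central differences
eps = 1e-5
orig_val = model['W3'][0, 0]
model['W3'][0, 0] = orig_val + eps
loss_plus = calc_loss(model, y, forward_pass(model), reg)
model['W3'][0, 0] = orig_val - eps
loss_minus = calc_loss(model, y, forward_pass(model), reg)
model['W3'][0, 0] = orig_val  # restore the original weight
numerical_grad = (loss_plus - loss_minus) / (2 * eps)
print('numerical d(loss)/d(W3[0,0]):', numerical_grad)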
In [ ]:
dloss_wrt_da2.shape
In [ ]:
dloss_wrt_dy_hat.T.shape
In [ ]:
W3.shape
In [ ]:
# dW3 = np.dot(a2.T, dloss_wrt_dy_hat)  # needs the hidden activation a2 from the forward pass
In [ ]:
# forward pass
z1 = X.dot(hl1_w) + hl1_b
a1 = np.maximum(0,z1)
y_hat = a1.dot(out_w) + out_b
In [ ]:
hl1_out.dot(out_w).shape
In [ ]:
out_b.shape
In [ ]:
loss = np.mean(1/2*(y-y_hat)**2)
In [ ]:
#gradients
dlossdout = -(y - y_hat)  # loss wrt the network output y_hat
doutdoutw = a1            # y_hat wrt out_w is the hidden-layer activation a1
In [ ]:
loss = calc_loss(model, y, y_hat, reg)
W1, b1, W2, b2, W3, b3 = model['W1'], model['b1'], model['W2'], model['b2'], model['W3'], model['b3']
dloss_wrt_dy_hat = -(y - y_hat) #loss wrt y_hat
dy_hat_wrt_da2 = W3.copy() #y_hat wrt a2 (copy so the masking below does not overwrite W3 in place)
dloss_wrt_da2 = np.dot(W3,dloss_wrt_dy_hat.T)
da2_wrt_dz2 = dy_hat_wrt_da2 #a2 wrt relu of z2
da2_wrt_dz2[da2_wrt_dz2 < 0] = 0
dz2_wrt_da1 = W2.copy() #z2 wrt a1 (copy to avoid modifying W2 in place)
da1_wrt_dz1 = dz2_wrt_da1 #a1 wrt relu of z1
da1_wrt_dz1[da1_wrt_dz1 < 0] = 0
dz1_wrt_X = W1 #z1 wrt X
In [ ]:
layer1 = np.random.randn(12000000).reshape(4000,3000)
layer2 = np.random.randn(15000000).reshape(3000,5000)
layer3 = np.random.randn(25000000).reshape(5000,5000)
layer4 = np.random.randn(50000000).reshape(5000,10000)
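These are fairly large float64 arrays; a quick look at their memory footprint (an added check, not in the original):
In [ ]:
# Each float64 element takes 8 bytes, so e.g. layer4 alone is 50e6 * 8 bytes = 400 MB
for name, arr in [('layer1', layer1), ('layer2', layer2), ('layer3', layer3), ('layer4', layer4)]:
    print('{}: shape {}, {:.0f} MB'.format(name, arr.shape, arr.nbytes / 1e6))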
In [ ]:
player1 = torch.from_numpy(layer1)
player2 = torch.from_numpy(layer2)
player3 = torch.from_numpy(layer3)
player4 = torch.from_numpy(layer4)
In [ ]:
%timeit layer1.dot(layer2).dot(layer3).dot(layer4)
In [ ]:
%timeit player1.mm(player2).mm(player3).mm(player4)
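The same chain of matrix products can be timed on the GPU when one is available (a sketch, assuming a CUDA device; torch.cuda.synchronize() is needed so the timing is not cut short by asynchronous execution):
In [ ]:
# Time the same product chain on the GPU, if CUDA is available
import time
if torch.cuda.is_available():
    g1, g2, g3, g4 = player1.cuda(), player2.cuda(), player3.cuda(), player4.cuda()
    torch.cuda.synchronize()
    start = time.time()
    g1.mm(g2).mm(g3).mm(g4)
    torch.cuda.synchronize()
    print('GPU time: {:.4f} s'.format(time.time() - start))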
In [ ]: