Regression


In [ ]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from IPython import display

1D


In [ ]:
# Define the vector of input samples as x, with 20 values sampled from a uniform distribution
# between 0 and 1
x = np.random.uniform(0, 1, 20)

# Generate the target values t from x with small gaussian noise so the estimation won't
# be perfect.
# Define a function f that represents the line that generates t without noise
def f(x): return x * 2

# Create the targets t with some gaussian noise
noise_variance = 0.2  # Variance of the gaussian noise
# Gaussian noise error for each sample in x
noise = np.random.randn(x.shape[0]) * noise_variance
# Create targets t
t = f(x) + noise

In [ ]:
plt.plot(x, t, 'o', label='t')
# Plot the line that generated the targets
plt.plot([0, 1], [f(0), f(1)], 'b-', label='f(x)')
plt.xlabel('$x$', fontsize=15)
plt.ylabel('$t$', fontsize=15)
plt.ylim([0,2])
plt.title('inputs (x) vs targets (t)')
plt.grid()
plt.legend(loc=2)
plt.show()

In [ ]:
# Define the neural network function y = x * w
def nn(x, w): return x * w

# Define the cost function
def cost(y, t): return ((t - y)**2).sum()

In [ ]:
# Define a vector of weights for which we want to plot the cost
ws = np.linspace(0, 4, num=100)  # weight values
cost_ws = np.vectorize(lambda w: cost(nn(x, w) , t))(ws)  # cost for each weight in ws

# Plot
plt.plot(ws, cost_ws, 'r-')
plt.xlabel('$w$', fontsize=15)
plt.ylabel('$\\xi$', fontsize=15)
plt.title('cost vs. weight')
plt.grid()
plt.show()
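
Because the model has a single parameter, the summed squared error is a parabola in w and its minimum can also be computed in closed form, w* = sum(x*t) / sum(x*x); this gives a reference value to compare the gradient-descent estimate against (a small sketch):

In [ ]:
# Closed-form least-squares optimum of the cost above
w_opt = np.sum(x * t) / np.sum(x * x)
print('closed-form optimum w*: {:.4f}'.format(w_opt))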

In [ ]:
# define the gradient function. Remember that y = nn(x, w) = x * w
def gradient(w, x, t): 
    return 2 * x * (nn(x, w) - t)

# define the update function delta w
def delta_w(w_k, x, t, learning_rate):
    return learning_rate * gradient(w_k, x, t).sum()

# Set the initial weight parameter
w = 0.1
# Set the learning rate
learning_rate = 0.1

# Start performing the gradient descent updates, and print the weights and cost:
nb_of_iterations = 4  # number of gradient descent updates
w_cost = [(w, cost(nn(x, w), t))] # List to store the weight,costs values
for i in range(nb_of_iterations):
    dw = delta_w(w, x, t, learning_rate)  # Get the delta w update
    w = w - dw  # Update the current weight parameter
    w_cost.append((w, cost(nn(x, w), t)))  # Add weight,cost to list

# Print the final w, and cost
for i in range(0, len(w_cost)):
    print('w({}): {:.4f} \t cost: {:.4f}'.format(i, w_cost[i][0], w_cost[i][1]))
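
The cost should shrink with every update; a quick way to see this is to plot each visited (w, cost) pair on top of the cost curve from the earlier cell (a minimal sketch reusing ws, cost_ws, and w_cost):

In [ ]:
# Plot the gradient descent updates on top of the cost curve
plt.plot(ws, cost_ws, 'r-')  # cost as a function of the weight
for i, (wi, ci) in enumerate(w_cost):
    plt.plot(wi, ci, 'bo')  # one marker per visited weight
    plt.text(wi, ci + 0.5, '$w({})$'.format(i))
plt.xlabel('$w$', fontsize=15)
plt.ylabel('$\\xi$', fontsize=15)
plt.title('gradient descent updates on the cost curve')
plt.grid()
plt.show()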

2D inputs


In [ ]:
num_ex = 10000
x1 = np.random.randn(num_ex)
x2 = np.random.randn(num_ex)
X = np.array([x1, x2]).T
y = X[:,0]**2 + (X[:,1]-1)**2 + np.random.rand(num_ex)
y = y.reshape(-1,1)

In [ ]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X[:,0], X[:,1], y)

PyTorch version


In [ ]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F

In [ ]:
nn_input_dim = 2
hly1_n = 30
hly2_n = 30
nn_output_dim = 1

class Net(nn.Module):
    
    def __init__(self):
        super(Net, self).__init__()
        self.hl1 = nn.Linear(nn_input_dim, hly1_n)
        self.hl2 = nn.Linear(hly1_n, hly2_n)
        self.out = nn.Linear(hly2_n, nn_output_dim)
        
    def forward(self, x):
        x = F.leaky_relu(self.hl1(x))
        x = F.leaky_relu(self.hl2(x))
        x = self.out(x)
        return x
    
net = Net()
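
Printing the module gives a quick summary of the layers and their sizes:

In [ ]:
print(net)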

In [ ]:
params = list(net.parameters())

# Convert the numpy data to single-precision tensors to match the network's weights
input = Variable(torch.from_numpy(X))
input = input.type(torch.FloatTensor)
out = net(input)
net.zero_grad()
target = Variable(torch.from_numpy(y))
target = target.type(torch.FloatTensor)
# Mean squared error loss, optimized with plain SGD
criterion = nn.MSELoss()
loss = criterion(out, target)
optimizer = optim.SGD(net.parameters(), lr=0.01)

In [ ]:
n_epochs = 10000

for epoch in range(n_epochs):
    optimizer.zero_grad()   # zero the gradient buffers
    output = net(input)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
    if epoch % 2000 == 0:
        print('epoch: {0}, loss: {1:.4f}'.format(epoch + 1, loss.item()))
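
To see what the network has learned, one option is to evaluate it on a regular grid over the input plane and plot the predicted surface together with the data (a sketch; the grid range and resolution are arbitrary choices):

In [ ]:
# Evaluate the trained network on a regular grid and plot the learned surface
grid_pts = np.linspace(-3, 3, 50)
gx, gy = np.meshgrid(grid_pts, grid_pts)
grid_X = np.column_stack([gx.ravel(), gy.ravel()]).astype(np.float32)
grid_pred = net(Variable(torch.from_numpy(grid_X))).data.numpy().reshape(gx.shape)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.plot_surface(gx, gy, grid_pred, alpha=0.5)
ax.scatter(X[:,0], X[:,1], y)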

In [ ]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X[:,0], X[:,1], y)

In [ ]:
preds = output.data.numpy()  # predictions from the last forward pass of the training loop
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X[:,0], X[:,1], preds)

In [ ]:
y

In [ ]:
preds

In [ ]:
net.hl1.weight

NumPy version


In [ ]:
def calc_loss(model, y_true, y_pred, reg):
    # Mean squared error on the data plus an L2 penalty on the weights
    W1, b1, W2, b2, W3, b3 = model['W1'], model['b1'], model['W2'], model['b2'], model['W3'], model['b3']
    data_loss = np.mean(1/2*(y_true - y_pred)**2)
    reg_loss = 1/2*reg * (np.sum(W1**2) + np.sum(W2**2) + np.sum(W3**2))
    return data_loss + reg_loss

In [ ]:
net.hl1.weight

In [ ]:
model['W1']

In [ ]:
dW3

In [ ]:
hidden_layer_2.T

In [ ]:
hidden_layer_2.T[0][0]

In [ ]:
hidden_layer_2

In [ ]:
np.dot(hidden_layer_2.T, dout_1)

In [ ]:
hidden_layer_2.shape

In [ ]:
dout_1.shape

In [ ]:
W3.shape

In [ ]:
ln_rate = 0.00001
reg = 0.01

# Initialize the weights with small random values and the biases at zero
model = {}
model['W1'] = .1 * np.random.rand(nn_input_dim, hly1_n)
model['b1'] = np.zeros((1, hly1_n))
model['W2'] = .1 * np.random.rand(hly1_n, hly2_n)
model['b2'] = np.zeros((1, hly2_n))
model['W3'] = .1 * np.random.rand(hly2_n, nn_output_dim)
model['b3'] = np.zeros((1, nn_output_dim))

W1, b1, W2, b2, W3, b3 = model['W1'], model['b1'], model['W2'], model['b2'], model['W3'], model['b3']

epoch_list = []
losses_list = []
lr_list = []

for epoch in range(10000):
    # forward pass: two leaky-ReLU hidden layers and a linear output
    z1 = np.dot(X, W1) + b1
    hidden_layer_1 = np.maximum(.01*z1, z1)
    z2 = np.dot(hidden_layer_1, W2) + b2
    hidden_layer_2 = np.maximum(.01*z2, z2)
    out1 = np.dot(hidden_layer_2, W3) + b3

    # compute loss
    loss = calc_loss(model, y, out1, reg)

    # backward pass gradients
    # d____ means gradient of the loss backpropagated to ____
    dout_1 = -(y - out1)  # gradient of the squared error w.r.t. the output

    dW3 = np.dot(hidden_layer_2.T, dout_1) + (reg * W3)
    db3 = dout_1.sum(axis=0, keepdims=True)
    dhidden_layer_2 = np.dot(dout_1, W3.T)
    # leaky-ReLU derivative: scale the gradient by 0.01 where the pre-activation was negative
    dhidden_layer_2 = np.where(z2 > 0, dhidden_layer_2, .01*dhidden_layer_2)
    dW2 = np.dot(hidden_layer_1.T, dhidden_layer_2) + (reg * W2)
    db2 = dhidden_layer_2.sum(axis=0, keepdims=True)
    dhidden_layer_1 = np.dot(dhidden_layer_2, W2.T)
    dhidden_layer_1 = np.where(z1 > 0, dhidden_layer_1, .01*dhidden_layer_1)
    dW1 = np.dot(X.T, dhidden_layer_1) + (reg * W1)
    db1 = dhidden_layer_1.sum(axis=0, keepdims=True)

    if epoch == 0:
        lowest_loss = loss

    # only take a step while the loss is not above the best loss seen so far
    if loss <= lowest_loss:
        lowest_loss = loss
        W1 += -ln_rate*dW1
        b1 += -ln_rate*db1
        W2 += -ln_rate*dW2
        b2 += -ln_rate*db2
        W3 += -ln_rate*dW3
        b3 += -ln_rate*db3

    epoch_list.append(epoch)
    losses_list.append(loss)
    lr_list.append(ln_rate)

    if epoch % 100 == 0:
        print('epoch: {0}, loss: {1}'.format(epoch + 1, loss))
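
The loop stores the loss per epoch in losses_list; plotting it is a quick check that training is converging (a minimal sketch):

In [ ]:
# Plot the training loss collected during the loop above
plt.figure()
plt.plot(epoch_list, losses_list, 'k-')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.title('numpy network training loss')
plt.grid()
plt.show()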

In [ ]:
W1

In [ ]:
dW1

In [ ]:
W1

In [ ]:
W2

In [ ]:
W3

In [ ]:
dW3

In [ ]:
b1

In [ ]:
b2

In [ ]:
b3

In [ ]:
def forward_pass(model):
    W1, b1, W2, b2, W3, b3 = model['W1'], model['b1'], model['W2'], model['b2'], model['W3'], model['b3']
    z1 = np.dot(X,W1) + b1
    a1 = np.maximum(0, z1)
    z2 = np.dot(a1,W2) + b2
    a2 = np.maximum(0, z2)
    y_hat = np.dot(a2,W3) + b3
    return y_hat

def calc_loss(model, y_true, y_pred, reg):
    W1, b1, W2, b2, W3, b3 = model['W1'], model['b1'], model['W2'], model['b2'], model['W3'], model['b3']
    data_loss = np.mean(1/2*(y_true - y_pred)**2)
    reg_loss = 1/2*reg * (np.sum(W1**2) + np.sum(W2**2) + np.sum(W3**2))
    return data_loss + reg_loss

# forward pass
y_hat = forward_pass(model)

# backward pass gradients
# d____ means gradient of the loss at ____
loss = calc_loss(model, y, y_hat, reg)
W1, b1, W2, b2, W3, b3 = model['W1'], model['b1'], model['W2'], model['b2'], model['W3'], model['b3']

# recompute the intermediate activations needed by the chain rule
z1 = np.dot(X, W1) + b1
a1 = np.maximum(0, z1)
z2 = np.dot(a1, W2) + b2
a2 = np.maximum(0, z2)

dloss_wrt_dy_hat = -(y - y_hat)                 # loss wrt y_hat (summed squared error; the 1/N from the mean only rescales the step)
dW3 = np.dot(a2.T, dloss_wrt_dy_hat) + reg * W3
db3 = dloss_wrt_dy_hat.sum(axis=0, keepdims=True)

dloss_wrt_da2 = np.dot(dloss_wrt_dy_hat, W3.T)  # loss wrt a2 (y_hat wrt a2 is W3)
dloss_wrt_dz2 = dloss_wrt_da2 * (z2 > 0)        # ReLU: the gradient only flows where z2 > 0
dW2 = np.dot(a1.T, dloss_wrt_dz2) + reg * W2
db2 = dloss_wrt_dz2.sum(axis=0, keepdims=True)

dloss_wrt_da1 = np.dot(dloss_wrt_dz2, W2.T)     # loss wrt a1 (z2 wrt a1 is W2)
dloss_wrt_dz1 = dloss_wrt_da1 * (z1 > 0)        # ReLU: the gradient only flows where z1 > 0
dW1 = np.dot(X.T, dloss_wrt_dz1) + reg * W1
db1 = dloss_wrt_dz1.sum(axis=0, keepdims=True)
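
A quick sanity check on these gradients is a finite-difference comparison. The analytic gradients above use the summed squared error (the 1/N from the mean in calc_loss is left out, as in the training loop), so the check below uses the same convention (a sketch; eps and the checked entry are arbitrary):

In [ ]:
# Finite-difference check of dW3 against the analytic gradient computed above
def sum_loss(model):
    # summed squared error plus the L2 penalty, matching the gradient convention above
    y_hat_chk = forward_pass(model)
    return (np.sum(1/2*(y - y_hat_chk)**2)
            + 1/2*reg*(np.sum(model['W1']**2) + np.sum(model['W2']**2) + np.sum(model['W3']**2)))

eps = 1e-5
i, j = 0, 0  # check a single entry of W3
w_orig = model['W3'][i, j]
model['W3'][i, j] = w_orig + eps
loss_plus = sum_loss(model)
model['W3'][i, j] = w_orig - eps
loss_minus = sum_loss(model)
model['W3'][i, j] = w_orig  # restore the original weight

print('numerical: {:.6f}  analytic: {:.6f}'.format((loss_plus - loss_minus) / (2 * eps), dW3[i, j]))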

In [ ]:
dloss_wrt_da2.shape

In [ ]:
dloss_wrt_dy_hat.T.shape

In [ ]:
W3.shape

In [ ]:
# forward pass for a smaller, single-hidden-layer network
# (hl1_w / hl1_b / out_w / out_b were never defined, so reuse the first and last layers of `model`)
hl1_w, hl1_b = model['W1'], model['b1']
out_w, out_b = model['W3'], model['b3']
z1 = X.dot(hl1_w) + hl1_b
a1 = np.maximum(0, z1)
y_hat = a1.dot(out_w) + out_b

In [ ]:
a1.dot(out_w).shape

In [ ]:
out_b.shape

In [ ]:
loss = np.mean(1/2*(y-y_hat)**2)

In [ ]:
# gradients for the single-hidden-layer network
dlossdout = -(y - y_hat)        # dloss/dy_hat
doutdoutw = a1                  # dy_hat/dout_w (per example)
dout_w = a1.T.dot(dlossdout)    # dloss/dout_w via the chain rule

speed testing PyTorch and NumPy matrix multiplication


In [ ]:
layer1 = np.random.randn(12000000).reshape(4000,3000)
layer2 = np.random.randn(15000000).reshape(3000,5000)
layer3 = np.random.randn(25000000).reshape(5000,5000)
layer4 = np.random.randn(50000000).reshape(5000,10000)

In [ ]:
player1 = torch.from_numpy(layer1)
player2 = torch.from_numpy(layer2)
player3 = torch.from_numpy(layer3)
player4 = torch.from_numpy(layer4)

In [ ]:
%timeit layer1.dot(layer2).dot(layer3).dot(layer4)

In [ ]:
%timeit player1.mm(player2).mm(player3).mm(player4)
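
As a sanity check that the two timings measure the same computation, the chained products can be compared directly; the tensors created with torch.from_numpy share memory with the numpy arrays, so both run on the same float64 data (note this recomputes both products once more):

In [ ]:
# Verify that numpy and torch produce (numerically) the same chained product
np.allclose(layer1.dot(layer2).dot(layer3).dot(layer4),
            player1.mm(player2).mm(player3).mm(player4).numpy())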
