In [195]:
import torch
from torch.autograd import Variable
from torch.autograd import gradcheck
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [323]:
# Let's try to find the equation y = 2 * x
# We have 6 examples: (x, y) = (0.1, 0.2), (1, 2), (2, 4), (3, 6), (-4, -8), (25, 50)
# Let's assume y is a linear combination of the features 1, x, x^2, x^3 (a bias plus polynomial terms)
# We know that Normal Equation gives us the exact solution so let's first use that
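# Closed form (with ridge regularization strength lambda): w = (X^T X + lambda*I)^(-1) X^T y,
# which reduces to the plain Normal Equation solution when lambda = 0 (as below).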
N = 6
x = np.array([0.1, 1, 2, 3, -4, 25])
y = np.array([0.2, 2, 4, 6, -8, 50])
x_2 = x**2
x_3 = x**3
X = np.ones((N, 4))
X[:,1] = x
X[:,2] = x_2
X[:,3] = x_3
_, D = np.shape(X)
regularization_strength = 0.0
XtX = (X.T).dot(X)
I = np.eye(D, dtype=float)
XtX_RI = XtX + regularization_strength*I
XtY = (X.T).dot(y)
w = np.linalg.solve(XtX_RI, XtY)
y_pred = X.dot(w)
loss = np.sqrt(np.mean((y_pred-y)**2))
# As expected w ~ [0 2 0 0]
print("W : ", w)
print("Predicted Y : ", y_pred)
print("RMS loss : ", loss)
In [555]:
# Setup the training and test tensors
# Let's generate 400 examples
N = 400
x = np.random.uniform(low=-75, high=100, size=N)
y = 2*x
X = np.zeros((N, 3))
X[:,0] = x
X[:,1] = x**2
X[:,2] = x**3
X_tensor = Variable(torch.FloatTensor(X), requires_grad=False)
y_tensor = Variable(torch.FloatTensor(y), requires_grad=False)
# Test set initialization
X_test = np.zeros((3, 3))
X_test[:,0] = np.array([-2.5, 0.0, 19])
X_test[:,1] = X_test[:,0]**2
X_test[:,2] = X_test[:,0]**3
X_test_tsr = Variable(torch.FloatTensor(X_test), requires_grad=False)
# Normalized features: mean-center each column, then scale by its range (max - min)
X_min = torch.min(X_tensor,0)
X_max = torch.max(X_tensor,0)
X_mean = torch.mean(X_tensor,0)
X_sub_mean = X_tensor-X_mean.expand_as(X_tensor)
X_max_min = X_max[0]-X_min[0] + 1e-7
X_norm_tsr = X_sub_mean/X_max_min.expand_as(X_sub_mean)
X_test_sub_mean = X_test_tsr-X_mean.expand_as(X_test_tsr)
X_test_norm_tsr = X_test_sub_mean/X_max_min.expand_as(X_test_sub_mean)
In [663]:
# A simple linear neural net: y = w_1*x + w_2*x^2 + w_3*x^3 + b
import math
from time import time
def RunLinearNNTraining(X, y, learning_rate=1e-5, epochs=5000, batch_size=None, X_test=None,
                        use_optimizer=None, adam_betas=(0.9, 0.999)):
    # Neural Net
    X_size = X.size()
    N = X_size[0]
    D_in = X_size[1]
    D_out = 1
    model = torch.nn.Linear(D_in, D_out)
    loss_fn = torch.nn.MSELoss(size_average=True)
    # Choose Optimizer
    optimizer = None
    if use_optimizer:
        if use_optimizer == 'SGD':
            optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
        elif use_optimizer == 'Adam':
            optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=adam_betas)
        elif use_optimizer == 'Adadelta':
            optimizer = torch.optim.Adadelta(model.parameters(), lr=learning_rate)
        elif use_optimizer == 'ASGD':
            optimizer = torch.optim.ASGD(model.parameters(), lr=learning_rate)
        elif use_optimizer == 'RMSprop':
            optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)
        elif use_optimizer == 'Adagrad':
            optimizer = torch.optim.Adagrad(model.parameters(), lr=learning_rate)
        else:
            print("Invalid Optimizer")
            use_optimizer = None
    losses = []
    loss = None
    start_time = time()
    for t in range(epochs):
        num_batches = 1
        X_batch = None
        y_batch = None
        if batch_size:
            num_batches = math.ceil(N/batch_size)
        else:
            batch_size = N
        shuffle = torch.randperm(N)
        for b in range(num_batches):
            lower_index = b*batch_size
            upper_index = min(lower_index+batch_size, N)
            indices = shuffle[lower_index:upper_index]
            X_batch = X[indices]
            y_batch = y[indices]
            y_pred = model(X_batch)
            loss = loss_fn(y_pred, y_batch)
            if use_optimizer:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            else:
                # Zero the gradients before running the backward pass.
                model.zero_grad()
                loss.backward()
                # Update the weights using gradient descent. Each parameter is a Variable, so
                # we can access its data and gradients like we did before.
                for param in model.parameters():
                    param.data -= learning_rate * param.grad.data
        losses.append(loss.data[0])
    end_time = time()
    time_taken = end_time - start_time
    print("Time Taken = %.2f seconds " % time_taken)
    print("Final Loss: ", loss.data[0])
    print("Parameters [w_1, w_2, w_3, b]: ")
    for param in model.parameters():
        print(param.data[0])
    # plot Loss vs Iterations
    plt.plot(losses)
    plt.title('Loss history')
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.show()
    # Predictions on Test set
    if X_test is not None:
        print("Test:")
        print("X_test: ", X_test.data)
        print("y_pred: ", model(X_test))
Training doesn't converge to the global optimum, and for some of the optimizers the learning rate has to be set very low, otherwise the gradients explode. Ouch!
It also needs a lot of epochs to get close to the global optimum, and the resulting solution and loss are not good enough.
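As a rough illustration of the learning-rate problem (this call is a sketch added here, not one of the original runs, and the value 1e-12 is only an illustrative guess): SGD on the raw, unnormalized features only stays stable with a tiny step size, and at that step size it makes very little progress per epoch.
# Hedged sketch: SGD on raw features. With x up to 100, the x^3 feature reaches ~1e6,
# so the MSE gradients are enormous; the learning rate has to be around 1e-12 or smaller
# to avoid blowing up, and then the w_1 (plain x) direction learns extremely slowly.
RunLinearNNTraining(X=X_tensor, y=y_tensor, batch_size=None, epochs=5000, learning_rate=1e-12,
                    X_test=X_test_tsr, use_optimizer='SGD')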
In [628]:
# use_optimizer can be Adam, RMSprop, Adadelta, ASGD, SGD, Adagrad
RunLinearNNTraining(X=X_tensor, y=y_tensor, batch_size=None, epochs=25000, learning_rate=1e-3,
X_test=X_test_tsr, use_optimizer='Adam')
In [612]:
# Using Adam Optimizer
RunLinearNNTraining(X=X_norm_tsr, y=y_tensor, batch_size=None, epochs=25000, learning_rate=1e-1,
X_test=X_test_norm_tsr, use_optimizer='Adam')
In [618]:
# Look, SGD is faster than Adam now. Its loss curve is steeper.
RunLinearNNTraining(X=X_norm_tsr, y=y_tensor, batch_size=None, epochs=25000, learning_rate=1e-1,
X_test=X_test_norm_tsr, use_optimizer='SGD')
In [623]:
# SGD doesn't work with unnormalized features, so we use Adam.
RunLinearNNTraining(X=X_tensor, y=y_tensor, batch_size=25, epochs=3000, learning_rate=1e-3,
X_test=X_test_tsr, use_optimizer='Adam')
In [629]:
# SGD is better than Adam in this case, so we use SGD.
RunLinearNNTraining(X=X_norm_tsr, y=y_tensor, batch_size=25, epochs=3000, learning_rate=1e-1,
X_test=X_test_norm_tsr, use_optimizer='SGD')
In [636]:
# SGD is better than Adam in this case, so we use SGD.
RunLinearNNTraining(X=X_norm_tsr, y=y_tensor, batch_size=1, epochs=25, learning_rate=1e-1,
X_test=X_test_norm_tsr, use_optimizer='SGD')
In [645]:
RunLinearNNTraining(X=X_tensor, y=y_tensor, batch_size=1, epochs=150, learning_rate=1e-3,
X_test=X_test_tsr, use_optimizer='Adam')
So far we have seen that normalized features, mini-batches of size 1, and the SGD optimizer gave us one of the fastest and best convergences. Wouldn't it be nice if we didn't have to normalize the features at all? After all, we are just trying to converge to the simple equation y = 2*x.
The reason the Adam and RMSprop optimizers work with unnormalized raw features is that they scale each parameter's update by 1/sqrt(running average of that parameter's squared gradients), which keeps the effective step size bounded so the gradients don't explode (a rough sketch of this update is below).
If we want to use unnormalized features, it's best to use the Adam optimizer with full batches for training.
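Here is a minimal sketch of that per-parameter scaling (plain NumPy written for this note, not the actual torch.optim implementation; it ignores Adam's momentum and bias-correction terms):
def rmsprop_style_step(w, grad, v, lr=1e-3, decay=0.9, eps=1e-8):
    # v is an exponentially decayed running average of squared gradients, per parameter.
    v = decay * v + (1 - decay) * grad**2
    # Dividing by sqrt(v) rescales huge gradients (e.g. from the x^3 feature)
    # down to roughly unit size, so the step stays bounded even for raw features.
    w = w - lr * grad / (np.sqrt(v) + eps)
    return w, v

# Gradients that differ by many orders of magnitude still give comparable step sizes:
w, v = np.zeros(3), np.zeros(3)
grad = np.array([1e-2, 1e2, 1e6])  # roughly like gradients for x, x^2, x^3 on raw features
w, v = rmsprop_style_step(w, grad, v)
print(w)  # all three updates have about the same magnitude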
In [697]:
RunLinearNNTraining(X=X_tensor, y=y_tensor, batch_size=None, epochs=25000, learning_rate=1e-3,
X_test=X_test_tsr, use_optimizer='Adam', adam_betas=(0.99, 0.999))