A Code Along of Kerem Turgutlu's Notebook
In [1]:
# Classical
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
# PyTorch
import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from torch import optim
# Misc
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = 15, 7
This notebook is inspired by Sebastian Ruder's awesome work from http://ruder.io/optimizing-gradient-descent/. The only thing that won't be implemented in this notebook is the gradient calculation, which is provided by PyTorch.
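Quick aside, a minimal sketch (not part of the original notebook, and using the newer tensor API rather than Variable) of what it means for PyTorch to provide the gradients: autograd records the operations performed on tensors, and backward() fills in .grad for us.

import torch

w = torch.randn(3, requires_grad=True)  # a parameter autograd will track
x = torch.ones(3)
loss = (w * x).sum()
loss.backward()    # computes d(loss)/dw through the recorded graph
print(w.grad)      # tensor([1., 1., 1.])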
In [2]:
# you can just wget the files from his github (use github download link)
data = pd.read_csv('./data/MNIST/train.csv')
data = np.array(data)
In [3]:
class MNIST(Dataset):
    def __init__(self, data):
        self.data = data
    def __getitem__(self, index):
        X = self.data[index][1:]   # pixel values
        y = self.data[index][0]    # digit label
        return torch.from_numpy(X).float() / 256, torch.LongTensor(np.array([y]))
    def __len__(self):
        return len(self.data)
In [4]:
class SimpleNet(nn.Module):
    def __init__(self, layers):
        super().__init__()
        self.linears = nn.ModuleList([nn.Linear(layers[i], layers[i + 1])
                                      for i in range(len(layers) - 1)])
    def forward(self, x):
        for lin in self.linears:
            lin_x = lin(x)
            x = F.relu(lin_x)
        # log-softmax over the final linear layer's pre-activation output
        return F.log_softmax(lin_x, dim=-1)
In [5]:
# Create dataset
mnist = MNIST(data)
data_dl = DataLoader(mnist, batch_size=256, shuffle=True, num_workers=0)
In [6]:
epochs = 3 # set epochs
criterion = nn.NLLLoss() # define loss function
#NOTE: http://pytorch.org/docs/master/nn.html?highlight=nllloss#torch.nn.NLLLoss
In [7]:
# init architecture
snet = SimpleNet([784, 100, 100, 10])
# get weight, bias objects
wbs = [(lin.weight, lin.bias) for lin in snet.linears]
# keep track of training loss
losses = []
# pars
lr = 1e-2
# Training
for epoch in range(epochs):
    print(f'epoch {epoch}')
    for i, batch in enumerate(data_dl):
        inputs, labels = batch
        inputs, labels = Variable(inputs), Variable(labels)
        outputs = snet(inputs)
        # compute loss and gradients
        loss = criterion(outputs, labels.squeeze(-1))
        losses.append(loss)
        loss.backward()
        # update weights
        for w, b in wbs:
            w.data -= lr * w.grad.data
            b.data -= lr * b.grad.data
            # zero the gradients
            w.grad.data.zero_()
            b.grad.data.zero_()
In [8]:
sgd_losses_ = [(l.data.numpy()[0]) for l in losses]
sgd_log_losses_ = [np.log(l) for l in sgd_losses_]
In [9]:
plt.plot(sgd_log_losses_)
title = plt.title("Vanilla SGD")
In [10]:
# Training PyTorch
# init architecture
snet = SimpleNet([784, 100, 100, 10])
optimizer = optim.Adam(lr = lr, params=snet.parameters())
losses = []
for epoch in range(epochs):
    print(f'epoch {epoch}')
    for i, batch in enumerate(data_dl):
        inputs, labels = batch
        inputs, labels = Variable(inputs), Variable(labels)
        outputs = snet(inputs)
        # compute loss and gradients
        loss = criterion(outputs, labels.squeeze(-1))
        losses.append(loss)
        loss.backward()
        # update weights
        optimizer.step()
        optimizer.zero_grad()
In [11]:
pyadam_losses_ = [(λ.data.numpy()[0]) for λ in losses]
pyadam_log_losses_ = [np.log(λ) for λ in pyadam_losses_]
In [19]:
plt.plot(pyadam_log_losses_)
# plt.plot(adam_log_losses_)
# title = plt.title("PyTorch Adam vs Custom Adam")
title = plt.title("PyTorch Adam")
plt.axis([-20, 520, -4, 1])
Out[19]: [log-loss plot: "PyTorch Adam"]
We'd like to get past saddle points with the help of momentum, which adds a fraction of the previous update vector to the current update:
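As in Ruder's post, with momentum term $\gamma$ and learning rate $\eta$:

$$v_t = \gamma \, v_{t-1} + \eta \, \nabla_\theta J(\theta)$$
$$\theta = \theta - v_t$$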
### Algorithm
v_prev = 0  # init update vector
rho = 0.9   # set ρ
for i in range(epochs):
    shuffled = np.random.shuffle(data)
    for batch in get.batch(shuffled, bs):
        v_new = rho * v_prev + lr * compute.grads(batch, params, loss_func)
        params -= v_new
        v_prev = v_new
In [23]:
### init architecture
snet = SimpleNet([784, 100, 100, 10])
### get weight, bias objects
wbs = [(lin.weight, lin.bias) for lin in snet.linears]
### keep track of training loss
losses = []
### pars
lr = 1e-3
rho = 0.9
weight_v_prev = [0 for i in range(len(wbs))] # initialize momentum term
bias_v_prev = [0 for i in range(len(wbs))] # initialize momentum term
epochs = 3
### Training
for epoch in range(epochs):
    print(f'epoch {epoch}')
    for n, batch in enumerate(data_dl):
        inputs, labels = batch
        inputs, labels = Variable(inputs), Variable(labels)
        outputs = snet(inputs)
        # compute loss and gradients
        loss = criterion(outputs, labels.squeeze(-1))
        losses.append(loss)
        loss.backward()
        # update weights
        for i, (w, b) in enumerate(wbs):
            weight_v_new = rho * weight_v_prev[i] + lr * w.grad.data
            bias_v_new = rho * bias_v_prev[i] + lr * b.grad.data
            weight_v_prev[i] = weight_v_new
            bias_v_prev[i] = bias_v_new
            w.data -= weight_v_new
            b.data -= bias_v_new
            # zero the gradients
            w.grad.data.zero_()
            b.grad.data.zero_()
In [24]:
sgdmom_losses_ = [(λ.data.numpy()[0]) for λ in losses]
sgdmom_log_losses_ = [np.log(λ) for λ in sgdmom_losses_]
In [25]:
plt.plot(sgd_log_losses_)
plt.plot(sgdmom_log_losses_)
title = plt.title("Evolution of Optimizers")
plt.legend(('SGD', 'SGD Momentum'))
Out[25]: [log-loss plot: SGD vs SGD Momentum]
NOTE: this may be wrong; see the Nesterov section: "not sure if this is giving correct results, as we expect it to be faster in terms of convergence".
We'd like to have a smarter ball, a ball that has a notion of where it's going, so that it knows to slow down before the hill slopes up again. By computing the gradients w.r.t. $\theta - \rho \, v_{prev}$ instead of $\theta$, we get a sense of what the next position of our ball will be, and we leverage this to make a better update:
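The resulting Nesterov accelerated gradient update, in the same notation as before:

$$v_t = \gamma \, v_{t-1} + \eta \, \nabla_\theta J(\theta - \gamma \, v_{t-1})$$
$$\theta = \theta - v_t$$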
### Algorithm
v_prev = 0  # init update vector
rho = 0.9   # set ρ
for i in range(epochs):
    shuffled = np.random.shuffle(data)
    for batch in get.batch(shuffled, bs):
        v_new = rho * v_prev + lr * compute.grads(batch, params - rho * v_prev, loss_func)
        params -= v_new
        v_prev = v_new
In [27]:
### init architecture
snet = SimpleNet([784, 100, 100, 10])
### get weight, bias objects
wbs = [(lin.weight, lin.bias) for lin in snet.linears]
### keep track of training loss
losses = []
### pars
lr = 1e-3
rho = 0.9
weight_v_prev = [0 for i in range(len(wbs))]  # initialize momentum term
bias_v_prev = [0 for i in range(len(wbs))]    # initialize momentum term
epochs = 3
### Training
for epoch in range(epochs):
    print(f'epoch {epoch}')
    for n, batch in enumerate(data_dl):
        inputs, labels = batch
        inputs, labels = Variable(inputs), Variable(labels)
        outputs = snet(inputs)
        # compute loss
        loss = criterion(outputs, labels.squeeze(-1))
        losses.append(loss)
        # update weights
        for i, wb in enumerate(wbs):
            w = wb[0]
            b = wb[1]
            ### WEIGHT UPDATE
            # take a step into the future, as if we had already applied the momentum step
            w_original = w.data
            w.data -= rho * weight_v_prev[i]
            # calculate loss and gradients at the look-ahead position
            future_outputs = snet(inputs)
            future_loss = criterion(future_outputs, labels.squeeze(-1))
            future_loss.backward(retain_graph=True)
            future_grad = w.grad.data
            weight_v_new = rho * weight_v_prev[i] + lr * future_grad  # future gradient
            weight_v_prev[i] = weight_v_new
            w.data = w_original       # restore the original weight data
            w.data -= weight_v_new    # update
            # zero all gradients
            snet.zero_grad()
            ### BIAS UPDATE
            # take a step into the future, as if we had already applied the momentum step
            b_original = b.data
            b.data -= rho * bias_v_prev[i]
            # calculate loss and gradients at the look-ahead position
            future_outputs = snet(inputs)
            future_loss = criterion(future_outputs, labels.squeeze(-1))
            future_loss.backward(retain_graph=True)
            future_grad = b.grad.data
            bias_v_new = rho * bias_v_prev[i] + lr * future_grad
            bias_v_prev[i] = bias_v_new
            b.data = b_original
            b.data -= bias_v_new
            # zero all gradients
            snet.zero_grad()
In [28]:
nesterov_losses_ = [(λ.data.numpy()[0]) for λ in losses]
nesterov_log_losses_ =[np.log(λ) for λ in nesterov_losses_]
In [29]:
plt.plot(sgd_log_losses_)
plt.plot(sgdmom_log_losses_)
plt.plot(nesterov_log_losses_)
title = plt.title("Loss Plot")
plt.legend(('SGD', 'SGD Momentum', 'Nesterov'))
Out[29]: [log-loss plot: SGD vs SGD Momentum vs Nesterov]
Adagrad adapts the learning rate to the parameters, performing larger updates for infrequent and smaller updates for frequent parameters. In its update rule, Adagrad modifies the general learning rate $\eta$ at each time step $t$ for every parameter $\theta_i$, based on the past gradients that have been computed for $\theta_i$:
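$$\theta_{t+1,i} = \theta_{t,i} - \frac{\eta}{\sqrt{G_{t,ii}} + \epsilon} \, g_{t,i}$$

where $g_{t,i}$ is the gradient w.r.t. $\theta_i$ at step $t$, $G_{t,ii}$ is the sum of the squares of the past gradients w.r.t. $\theta_i$, and $\epsilon$ is a smoothing term that avoids division by zero.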
One of Adagrad's main benefits is that it eliminates the need to manually tune the learning rate. Most implementations use a default value of 0.01 and leave it at that.
Adagrad's main weakness is its accumulation of the squared gradients in the denominator: since every added term is positive, the accumulated sum keeps growing during training. This in turn causes the learning rate to shrink and eventually become infinitesimally small, at which point the algorithm is no longer able to acquire additional knowledge (see the sketch below). The following algorithms aim to resolve this flaw.
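A minimal sketch (not from the original notebook) of this effect: with a constant gradient of 1.0, the accumulator grows linearly and the effective step decays like $1/\sqrt{t}$.

import numpy as np

lr, noise = 1e-2, 1e-8
grads_squared = 0.0
for t in range(1, 6):
    grad = 1.0                    # pretend the gradient is constant
    grads_squared += grad * grad  # the accumulator only ever grows
    step = lr * grad / (np.sqrt(grads_squared) + noise)
    print(f't={t}: effective step {step:.4f}')
# prints 0.0100, 0.0071, 0.0058, 0.0050, 0.0045: shrinking toward zero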
### Algorithm
grads_squared = 0
noise = 1e-8
for i in range(epochs):
    shuffled = np.random.shuffle(data)
    for batch in get.batch(shuffled, bs):
        grads = compute.grads(batch, params, loss_func)
        grads_squared += grads * grads
        params -= lr * grads / (np.sqrt(grads_squared) + noise)
In [33]:
### init architecture
snet = SimpleNet([784, 100, 100, 10])
### get weight, bias objects
wbs = [(lin.weight, lin.bias) for lin in snet.linears]
### keep track of training loss
losses = []
### pars
lr = 1e-3
grads_squared = [[torch.zeros(wb[0].size()), torch.zeros(wb[1].size())] for wb in wbs]
noise = 1e-8
epochs = 3
### Training
for epoch in range(epochs):
    print(f'epoch {epoch}')
    for n, batch in enumerate(data_dl):
        inputs, labels = batch
        inputs, labels = Variable(inputs), Variable(labels)
        outputs = snet(inputs)
        # compute loss and gradients
        loss = criterion(outputs, labels.squeeze(-1))
        losses.append(loss)
        loss.backward()
        # update weights
        for i, (w, b) in enumerate(wbs):
            # accumulate squared gradients, then scale the step by their root
            grads_squared[i][0] += w.grad.data * w.grad.data
            grads_squared[i][1] += b.grad.data * b.grad.data
            w.data -= lr * w.grad.data / (torch.sqrt(grads_squared[i][0]) + noise)
            b.data -= lr * b.grad.data / (torch.sqrt(grads_squared[i][1]) + noise)
            # zero the gradients
            w.grad.data.zero_()
            b.grad.data.zero_()
In [34]:
adagrad_losses_ = [(λ.data.numpy()[0]) for λ in losses]
adagrad_log_losses_ = [np.log(λ) for λ in adagrad_losses_]
In [35]:
plt.plot(sgd_log_losses_)
plt.plot(sgdmom_log_losses_)
plt.plot(nesterov_log_losses_)
plt.plot(adagrad_log_losses_)
title = plt.title("Loss Plot")
plt.legend(('SGD', 'SGD Momentum', 'Nesterov', 'Adagrad'))
Out[35]: [log-loss plot: SGD, SGD Momentum, Nesterov, Adagrad]
Adadelta restricts the window of accumulated past gradients to some fixed size $w$. Instead of inefficiently storing $w$ previous squared gradients, the sum of gradients is recursively defined as a decaying average of all past squared gradients. The running average $E[g^2]_t$ at time step $t$ then depends (as a fraction $\gamma$, similarly to the momentum term) only on the previous average and the current gradient:
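$$E[g^2]_t = \gamma \, E[g^2]_{t-1} + (1 - \gamma) \, g_t^2$$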
RMSProp and Adadelta were both developed independently around the same time, stemming from the need to resolve Adagrad's radically diminishing learning rates. RMSProp is in fact identical to the first update vector of Adadelta above:
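$$\theta_{t+1} = \theta_t - \frac{\eta}{\sqrt{E[g^2]_t} + \epsilon} \, g_t$$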
### Algorithm
grads_squared = 0
rho = 0.9      # parameter for exponential smoothing of the squared gradients
noise = 1e-8
for i in range(epochs):
    shuffled = np.random.shuffle(data)
    for batch in get.batch(shuffled, bs):
        grads = compute.grads(batch, params, loss_func)
        grads_squared = rho * grads_squared + (1 - rho) * grads * grads
        params -= lr * grads / (np.sqrt(grads_squared) + noise)
In [36]:
### init architecture
snet = SimpleNet([784, 100, 100, 10])
### get weight, bias objects
wbs = [(lin.weight, lin.bias) for lin in snet.linears]
### keep track of training loss
losses = []
### pars
lr = 1e-3
grads_squared = [[torch.zeros(wb[0].size()), torch.zeros(wb[1].size())] for wb in wbs]
noise = 1e-8
rho = 0.9
epochs = 3
### Training
for epoch in range(epochs):
    print(f'epoch {epoch}')
    for n, batch in enumerate(data_dl):
        inputs, labels = batch
        inputs, labels = Variable(inputs), Variable(labels)
        outputs = snet(inputs)
        # compute loss and gradients
        loss = criterion(outputs, labels.squeeze(-1))
        losses.append(loss)
        loss.backward()
        # update weights
        for i, (w, b) in enumerate(wbs):
            # update the decaying average of squared gradients, then scale the step
            grads_squared[i][0] = rho * grads_squared[i][0] + (1 - rho) * w.grad.data * w.grad.data
            grads_squared[i][1] = rho * grads_squared[i][1] + (1 - rho) * b.grad.data * b.grad.data
            w.data -= lr * w.grad.data / (torch.sqrt(grads_squared[i][0]) + noise)
            b.data -= lr * b.grad.data / (torch.sqrt(grads_squared[i][1]) + noise)
            # zero the gradients
            w.grad.data.zero_()
            b.grad.data.zero_()
In [38]:
rmsprop_losses_ = [(λ.data.numpy()[0]) for λ in losses]
rmsprop_log_losses_ = [np.log(λ) for λ in rmsprop_losses_]
In [39]:
plt.plot(sgd_log_losses_)
plt.plot(sgdmom_log_losses_)
plt.plot(nesterov_log_losses_)
plt.plot(adagrad_log_losses_)
plt.plot(rmsprop_log_losses_)
title = plt.title("Evolution of Optimizers")
plt.legend(('SGD', 'SGD Momentum', 'Nesterov', 'Adagrad', 'RMSProp'))
Out[39]: [log-loss plot: SGD, SGD Momentum, Nesterov, Adagrad, RMSProp]
In addition to storing an exponentially decaying average of past squared gradients $v_t$ like Adadelta and RMSProp, Adam keeps an exponentially decaying average of past gradients $m_t$, similar to momentum. As $m_t$ and $v_t$ are initialized as vectors of zeros, the authors of Adam observed that they are biased towards zero, especially during the initial time steps, and especially when the decay rates are small (i.e. $\beta_1$ and $\beta_2$ are close to 1).
They counteract these biases by computing bias-corrected first and second moment estimates:
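$$m_t = \beta_1 \, m_{t-1} + (1 - \beta_1) \, g_t \qquad v_t = \beta_2 \, v_{t-1} + (1 - \beta_2) \, g_t^2$$
$$\hat{m}_t = \frac{m_t}{1 - \beta_1^t} \qquad \hat{v}_t = \frac{v_t}{1 - \beta_2^t}$$
$$\theta_{t+1} = \theta_t - \frac{\eta}{\sqrt{\hat{v}_t} + \epsilon} \, \hat{m}_t$$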
### Algorithm
m = 0
v = 0
beta1 = 0.9    # decay rate for the first moment
beta2 = 0.999  # decay rate for the second moment
noise = 1e-8
t = 0
for i in range(epochs):
    shuffled = np.random.shuffle(data)
    for batch in get.batch(shuffled, bs):
        t += 1
        grads = compute.grads(batch, params, loss_func)
        m = beta1*m + (1 - beta1)*grads
        v = beta2*v + (1 - beta2)*grads*grads
        m_hat = m / (1 - beta1**t)  # bias correction for first moment
        v_hat = v / (1 - beta2**t)  # bias correction for second moment
        params -= lr*m_hat / (np.sqrt(v_hat) + noise)
In [40]:
### init architecture
snet = SimpleNet([784, 100, 100, 10])
### get weight, bias objects
wbs = [(lin.weight, lin.bias) for lin in snet.linears]
### keep track of training loss
losses = []
### params
lr = 1e-3
### per-layer state [[weight m, weight v], [bias m, bias v]]
m_v_prevs = [[[0, 0], [0, 0]] for i in range(len(wbs))]
noise = 1e-8
beta1 = 0.9
beta2 = 0.999
epochs = 3
t = 0
### Training
for epoch in range(epochs):
    print(f'epoch {epoch}')
    for n, batch in enumerate(data_dl):
        # keep track of time
        t += 1
        inputs, labels = batch
        inputs, labels = Variable(inputs), Variable(labels)
        outputs = snet(inputs)
        # compute loss and gradients
        loss = criterion(outputs, labels.squeeze(-1))
        losses.append(loss)
        loss.backward()
        # update weights
        for i, (w, b) in enumerate(wbs):
            # update weight component
            w_m_t_prev, w_v_t_prev = m_v_prevs[i][0]
            w_m_t_new = beta1 * w_m_t_prev + (1 - beta1) * w.grad.data
            w_v_t_new = beta2 * w_v_t_prev + (1 - beta2) * (w.grad.data * w.grad.data)
            w_m_t_new_hat = w_m_t_new / (1 - beta1 ** t)  # bias-corrected first moment
            w_v_t_new_hat = w_v_t_new / (1 - beta2 ** t)  # bias-corrected second moment
            m_v_prevs[i][0][0] = w_m_t_new
            m_v_prevs[i][0][1] = w_v_t_new
            w.data -= lr * w_m_t_new_hat / (torch.sqrt(w_v_t_new_hat) + noise)
            # update bias component
            b_m_t_prev, b_v_t_prev = m_v_prevs[i][1]
            b_m_t_new = beta1 * b_m_t_prev + (1 - beta1) * b.grad.data
            b_v_t_new = beta2 * b_v_t_prev + (1 - beta2) * (b.grad.data * b.grad.data)
            b_m_t_new_hat = b_m_t_new / (1 - beta1 ** t)
            b_v_t_new_hat = b_v_t_new / (1 - beta2 ** t)
            m_v_prevs[i][1][0] = b_m_t_new
            m_v_prevs[i][1][1] = b_v_t_new
            b.data -= lr * b_m_t_new_hat / (torch.sqrt(b_v_t_new_hat) + noise)
            # zero the gradients
            w.grad.data.zero_()
            b.grad.data.zero_()
In [41]:
adam_losses_ = [(λ.data.numpy()[0]) for λ in losses]
adam_log_losses_ = [np.log(λ) for λ in adam_losses_]
In [42]:
plt.plot(sgd_log_losses_)
plt.plot(sgdmom_log_losses_)
plt.plot(nesterov_log_losses_)
plt.plot(adagrad_log_losses_)
plt.plot(rmsprop_log_losses_)
plt.plot(adam_log_losses_)
title = plt.title("Evolution of Optimizers")
plt.legend(('SGD', 'SGD Momentum', 'Nesterov', 'Adagrad', 'RMSProp', 'Adam'))
Out[42]: [log-loss plot: SGD, SGD Momentum, Nesterov, Adagrad, RMSProp, Adam]
In [43]:
### AdaMax and NAdam may be next