In [ ]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
In [ ]:
#export
from exp.nb_02 import *
import torch.nn.functional as F
In [ ]:
mpl.rcParams['image.cmap'] = 'gray'
In [ ]:
x_train,y_train,x_valid,y_valid = get_data()
In [ ]:
n,m = x_train.shape
c = y_train.max()+1
nh = 50
In [ ]:
class Model(nn.Module):
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        self.layers = [nn.Linear(n_in,nh), nn.ReLU(), nn.Linear(nh,n_out)]
    def __call__(self, x):
        for l in self.layers: x = l(x)
        return x
In [ ]:
model = Model(m, nh, 10)
In [ ]:
pred = model(x_train)
First, we will need to compute the softmax of our activations. This is defined by:
$$\hbox{softmax(x)}_{i} = \frac{e^{x_{i}}}{e^{x_{0}} + e^{x_{1}} + \cdots + e^{x_{n-1}}}$$
or more concisely:
$$\hbox{softmax(x)}_{i} = \frac{e^{x_{i}}}{\sum_{0 \leq j \leq n-1} e^{x_{j}}}$$
In practice, we will need the log of the softmax when we calculate the loss.
In [ ]:
def log_softmax(x): return (x.exp()/(x.exp().sum(-1,keepdim=True))).log()
In [ ]:
sm_pred = log_softmax(pred)
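As a quick sanity check (a minimal sketch using the sm_pred computed above), exponentiating the log-softmax should give a probability distribution over the 10 classes, so each row should sum to 1:
In [ ]:
# sketch: exp(log_softmax) is a probability distribution, so every row sums to 1
test_near(sm_pred.exp().sum(-1), torch.ones(sm_pred.shape[0]))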
The cross entropy loss for some target $x$ and some prediction $p(x)$ is given by:
$$ -\sum x\, \log p(x) $$
But since our $x$s are 1-hot encoded, this can be rewritten as $-\log(p_{i})$, where $i$ is the index of the desired target.
This can be done using numpy-style integer array indexing. Note that PyTorch supports all the tricks of NumPy's advanced indexing.
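To see why the indexing trick gives the same answer as the full sum, here is a minimal sketch (the one_hot tensor is just for illustration) comparing the 1-hot dot product with direct integer indexing for a single sample:
In [ ]:
# sketch: -sum(one_hot * log p) equals -log p[target] when the target is 1-hot encoded
logp = sm_pred[0]                                    # log-probabilities for the first sample
one_hot = torch.zeros(10); one_hot[y_train[0]] = 1.  # 1-hot encoding of its label
test_near(-(one_hot*logp).sum(), -logp[y_train[0]])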
In [ ]:
y_train[:3]
Out[ ]:
In [ ]:
sm_pred[[0,1,2], [5,0,4]]
Out[ ]:
In [ ]:
y_train.shape[0]
Out[ ]:
In [ ]:
def nll(input, target): return -input[range(target.shape[0]), target].mean()
In [ ]:
loss = nll(sm_pred, y_train)
In [ ]:
loss
Out[ ]:
Note that the formula
$$\log \left ( \frac{a}{b} \right ) = \log(a) - \log(b)$$
gives a simplification when we compute the log softmax, which was previously defined as (x.exp()/(x.exp().sum(-1,keepdim=True))).log()
In [ ]:
def log_softmax(x): return x - x.exp().sum(-1,keepdim=True).log()
In [ ]:
test_near(nll(log_softmax(pred), y_train), loss)
Then, there is a way to compute the log of the sum of exponentials in a more stable way, called the LogSumExp trick. The idea is to use the following formula:
$$\log \left ( \sum_{j=1}^{n} e^{x_{j}} \right ) = \log \left ( e^{a} \sum_{j=1}^{n} e^{x_{j}-a} \right ) = a + \log \left ( \sum_{j=1}^{n} e^{x_{j}-a} \right )$$
where $a$ is the maximum of the $x_{j}$.
In [ ]:
def logsumexp(x):
    m = x.max(-1)[0]
    return m + (x-m[:,None]).exp().sum(-1).log()
This way, we will avoid an overflow when taking the exponential of a big activation. In PyTorch, this is already implemented for us.
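For example, a minimal sketch (with a made-up activation value) of the overflow the trick avoids:
In [ ]:
# sketch: a very large activation overflows the naive exp().sum().log(), but not logsumexp
big = torch.tensor([[1000., -1000.]])
big.exp().sum(-1).log(), logsumexp(big)  # naive gives inf, the stable version gives 1000.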
In [ ]:
test_near(logsumexp(pred), pred.logsumexp(-1))
So we can use it for our log_softmax function.
In [ ]:
def log_softmax(x): return x - x.logsumexp(-1,keepdim=True)
In [ ]:
test_near(nll(log_softmax(pred), y_train), loss)
Then use PyTorch's implementation.
In [ ]:
test_near(F.nll_loss(F.log_softmax(pred, -1), y_train), loss)
In PyTorch, F.log_softmax and F.nll_loss are combined in one optimized function, F.cross_entropy.
In [ ]:
test_near(F.cross_entropy(pred, y_train), loss)
Basically the training loop repeats over the following steps: get the output of the model on a batch of inputs, compare that output to the labels to compute a loss, calculate the gradients of the loss with respect to every parameter of the model, and update those parameters using the gradients to make them a little bit better.
In [ ]:
loss_func = F.cross_entropy
In [ ]:
#export
def accuracy(out, yb): return (torch.argmax(out, dim=1)==yb).float().mean()
In [ ]:
bs=64 # batch size
xb = x_train[0:bs] # a mini-batch from x
preds = model(xb) # predictions
preds[0], preds.shape
Out[ ]:
In [ ]:
yb = y_train[0:bs]
loss_func(preds, yb)
Out[ ]:
In [ ]:
accuracy(preds, yb)
Out[ ]:
In [ ]:
lr = 0.5 # learning rate
epochs = 1 # how many epochs to train for
In [ ]:
for epoch in range(epochs):
    for i in range((n-1)//bs + 1):
        # set_trace()
        start_i = i*bs
        end_i = start_i+bs
        xb = x_train[start_i:end_i]
        yb = y_train[start_i:end_i]
        loss = loss_func(model(xb), yb)
        loss.backward()
        with torch.no_grad():
            for l in model.layers:
                if hasattr(l, 'weight'):
                    l.weight -= l.weight.grad * lr
                    l.bias   -= l.bias.grad   * lr
                    l.weight.grad.zero_()
                    l.bias.grad.zero_()
In [ ]:
loss_func(model(xb), yb), accuracy(model(xb), yb)
Out[ ]:
Use nn.Module.__setattr__ and move relu to functional:
In [ ]:
class Model(nn.Module):
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        self.l1 = nn.Linear(n_in,nh)
        self.l2 = nn.Linear(nh,n_out)
    def __call__(self, x): return self.l2(F.relu(self.l1(x)))
In [ ]:
model = Model(m, nh, 10)
In [ ]:
for name,l in model.named_children(): print(f"{name}: {l}")
In [ ]:
model
Out[ ]:
In [ ]:
model.l1
Out[ ]:
In [ ]:
def fit():
    for epoch in range(epochs):
        for i in range((n-1)//bs + 1):
            start_i = i*bs
            end_i = start_i+bs
            xb = x_train[start_i:end_i]
            yb = y_train[start_i:end_i]
            loss = loss_func(model(xb), yb)
            loss.backward()
            with torch.no_grad():
                for p in model.parameters(): p -= p.grad * lr
                model.zero_grad()
In [ ]:
fit()
loss_func(model(xb), yb), accuracy(model(xb), yb)
Out[ ]:
Behind the scenes, PyTorch overrides the __setattr__ function in nn.Module so that the submodules you define are properly registered as parameters of the model.
In [ ]:
class DummyModule():
    def __init__(self, n_in, nh, n_out):
        self._modules = {}
        self.l1 = nn.Linear(n_in,nh)
        self.l2 = nn.Linear(nh,n_out)
    def __setattr__(self,k,v):
        if not k.startswith("_"): self._modules[k] = v
        super().__setattr__(k,v)
    def __repr__(self): return f'{self._modules}'
    def parameters(self):
        for l in self._modules.values():
            for p in l.parameters(): yield p
In [ ]:
mdl = DummyModule(m,nh,10)
mdl
Out[ ]:
In [ ]:
[o.shape for o in mdl.parameters()]
Out[ ]:
We can use the original layers approach, but we have to register the modules.
In [ ]:
layers = [nn.Linear(m,nh), nn.ReLU(), nn.Linear(nh,10)]
In [ ]:
class Model(nn.Module):
    def __init__(self, layers):
        super().__init__()
        self.layers = layers
        for i,l in enumerate(self.layers): self.add_module(f'layer_{i}', l)
    def __call__(self, x):
        for l in self.layers: x = l(x)
        return x
In [ ]:
model = Model(layers)
In [ ]:
model
Out[ ]:
nn.ModuleList does this for us.
In [ ]:
class SequentialModel(nn.Module):
    def __init__(self, layers):
        super().__init__()
        self.layers = nn.ModuleList(layers)
    def __call__(self, x):
        for l in self.layers: x = l(x)
        return x
In [ ]:
model = SequentialModel(layers)
In [ ]:
model
Out[ ]:
In [ ]:
fit()
loss_func(model(xb), yb), accuracy(model(xb), yb)
Out[ ]:
nn.Sequential is a convenient class which does the same as the above:
In [ ]:
model = nn.Sequential(nn.Linear(m,nh), nn.ReLU(), nn.Linear(nh,10))
In [ ]:
fit()
loss_func(model(xb), yb), accuracy(model(xb), yb)
Out[ ]:
In [ ]:
nn.Sequential??
In [ ]:
model
Out[ ]:
Let's replace our previous manually coded optimization step:
with torch.no_grad():
    for p in model.parameters(): p -= p.grad * lr
    model.zero_grad()
and instead use just:
opt.step()
opt.zero_grad()
In [ ]:
class Optimizer():
    def __init__(self, params, lr=0.5): self.params,self.lr=list(params),lr
    def step(self):
        with torch.no_grad():
            for p in self.params: p -= p.grad * self.lr
    def zero_grad(self):
        for p in self.params: p.grad.data.zero_()
In [ ]:
model = nn.Sequential(nn.Linear(m,nh), nn.ReLU(), nn.Linear(nh,10))
In [ ]:
opt = Optimizer(model.parameters())
In [ ]:
for epoch in range(epochs):
    for i in range((n-1)//bs + 1):
        start_i = i*bs
        end_i = start_i+bs
        xb = x_train[start_i:end_i]
        yb = y_train[start_i:end_i]
        pred = model(xb)
        loss = loss_func(pred, yb)
        loss.backward()
        opt.step()
        opt.zero_grad()
In [ ]:
loss,acc = loss_func(model(xb), yb), accuracy(model(xb), yb)
loss,acc
Out[ ]:
PyTorch already provides this exact functionality in optim.SGD (it also handles stuff like momentum, which we'll look at later - except we'll be doing it in a more flexible way!)
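For example, a minimal sketch (the momentum value and the opt_with_momentum name are just for illustration) of how momentum would be requested from optim.SGD:
In [ ]:
# sketch: optim.SGD takes a momentum argument alongside the learning rate
from torch import optim
opt_with_momentum = optim.SGD(model.parameters(), lr=0.5, momentum=0.9)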
In [ ]:
#export
from torch import optim
In [ ]:
optim.SGD.step??
In [ ]:
def get_model():
    model = nn.Sequential(nn.Linear(m,nh), nn.ReLU(), nn.Linear(nh,10))
    return model, optim.SGD(model.parameters(), lr=lr)
In [ ]:
model,opt = get_model()
loss_func(model(xb), yb)
Out[ ]:
In [ ]:
for epoch in range(epochs):
    for i in range((n-1)//bs + 1):
        start_i = i*bs
        end_i = start_i+bs
        xb = x_train[start_i:end_i]
        yb = y_train[start_i:end_i]
        pred = model(xb)
        loss = loss_func(pred, yb)
        loss.backward()
        opt.step()
        opt.zero_grad()
In [ ]:
loss,acc = loss_func(model(xb), yb), accuracy(model(xb), yb)
loss,acc
Out[ ]:
Randomized tests can be very useful: since the exact accuracy depends on the random initialization, we just assert that it clears a reasonable threshold.
In [ ]:
assert acc>0.7
It's clunky to iterate through minibatches of x and y values separately:
xb = x_train[start_i:end_i]
yb = y_train[start_i:end_i]
Instead, let's do these two steps together, by introducing a Dataset class:
xb,yb = train_ds[i*bs : i*bs+bs]
In [ ]:
#export
class Dataset():
    def __init__(self, x, y): self.x,self.y = x,y
    def __len__(self): return len(self.x)
    def __getitem__(self, i): return self.x[i],self.y[i]
In [ ]:
train_ds,valid_ds = Dataset(x_train, y_train),Dataset(x_valid, y_valid)
assert len(train_ds)==len(x_train)
assert len(valid_ds)==len(x_valid)
In [ ]:
xb,yb = train_ds[0:5]
assert xb.shape==(5,28*28)
assert yb.shape==(5,)
xb,yb
Out[ ]:
In [ ]:
model,opt = get_model()
In [ ]:
for epoch in range(epochs):
    for i in range((n-1)//bs + 1):
        xb,yb = train_ds[i*bs : i*bs+bs]
        pred = model(xb)
        loss = loss_func(pred, yb)
        loss.backward()
        opt.step()
        opt.zero_grad()
In [ ]:
loss,acc = loss_func(model(xb), yb), accuracy(model(xb), yb)
assert acc>0.7
loss,acc
Out[ ]:
Previously, our loop iterated over batches (xb, yb) like this:
for i in range((n-1)//bs + 1):
    xb,yb = train_ds[i*bs : i*bs+bs]
    ...
Let's make our loop much cleaner, using a data loader:
for xb,yb in train_dl:
    ...
In [ ]:
class DataLoader():
    def __init__(self, ds, bs): self.ds,self.bs = ds,bs
    def __iter__(self):
        for i in range(0, len(self.ds), self.bs): yield self.ds[i:i+self.bs]
In [ ]:
train_dl = DataLoader(train_ds, bs)
valid_dl = DataLoader(valid_ds, bs)
In [ ]:
xb,yb = next(iter(valid_dl))
assert xb.shape==(bs,28*28)
assert yb.shape==(bs,)
In [ ]:
plt.imshow(xb[0].view(28,28))
yb[0]
Out[ ]:
In [ ]:
model,opt = get_model()
In [ ]:
def fit():
    for epoch in range(epochs):
        for xb,yb in train_dl:
            pred = model(xb)
            loss = loss_func(pred, yb)
            loss.backward()
            opt.step()
            opt.zero_grad()
In [ ]:
fit()
In [ ]:
loss,acc = loss_func(model(xb), yb), accuracy(model(xb), yb)
assert acc>0.7
loss,acc
Out[ ]:
We want our training set to be in a random order, and that order should differ on each epoch. But the validation set shouldn't be randomized.
In [ ]:
class Sampler():
    def __init__(self, ds, bs, shuffle=False):
        self.n,self.bs,self.shuffle = len(ds),bs,shuffle
    def __iter__(self):
        self.idxs = torch.randperm(self.n) if self.shuffle else torch.arange(self.n)
        for i in range(0, self.n, self.bs): yield self.idxs[i:i+self.bs]
In [ ]:
small_ds = Dataset(*train_ds[:10])
In [ ]:
s = Sampler(small_ds,3,False)
[o for o in s]
Out[ ]:
In [ ]:
s = Sampler(small_ds,3,True)
[o for o in s]
Out[ ]:
In [ ]:
def collate(b):
    xs,ys = zip(*b)
    return torch.stack(xs),torch.stack(ys)

class DataLoader():
    def __init__(self, ds, sampler, collate_fn=collate):
        self.ds,self.sampler,self.collate_fn = ds,sampler,collate_fn
    def __iter__(self):
        for s in self.sampler: yield self.collate_fn([self.ds[i] for i in s])
In [ ]:
train_samp = Sampler(train_ds, bs, shuffle=True)
valid_samp = Sampler(valid_ds, bs, shuffle=False)
In [ ]:
train_dl = DataLoader(train_ds, sampler=train_samp, collate_fn=collate)
valid_dl = DataLoader(valid_ds, sampler=valid_samp, collate_fn=collate)
In [ ]:
xb,yb = next(iter(valid_dl))
plt.imshow(xb[0].view(28,28))
yb[0]
Out[ ]:
In [ ]:
xb,yb = next(iter(train_dl))
plt.imshow(xb[0].view(28,28))
yb[0]
Out[ ]:
In [ ]:
xb,yb = next(iter(train_dl))
plt.imshow(xb[0].view(28,28))
yb[0]
Out[ ]:
In [ ]:
model,opt = get_model()
fit()
loss,acc = loss_func(model(xb), yb), accuracy(model(xb), yb)
assert acc>0.7
loss,acc
Out[ ]:
In [ ]:
#export
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler
In [ ]:
train_dl = DataLoader(train_ds, bs, sampler=RandomSampler(train_ds), collate_fn=collate)
valid_dl = DataLoader(valid_ds, bs, sampler=SequentialSampler(valid_ds), collate_fn=collate)
In [ ]:
model,opt = get_model()
fit()
loss_func(model(xb), yb), accuracy(model(xb), yb)
Out[ ]:
However, PyTorch's default sampler and collate function work fine for most purposes:
In [ ]:
train_dl = DataLoader(train_ds, bs, shuffle=True, drop_last=True)
valid_dl = DataLoader(valid_ds, bs, shuffle=False)
In [ ]:
model,opt = get_model()
fit()
loss,acc = loss_func(model(xb), yb), accuracy(model(xb), yb)
assert acc>0.7
loss,acc
Out[ ]:
Note that PyTorch's DataLoader, if you pass num_workers, will use multiple worker processes to call your Dataset.
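For example, a minimal sketch (the num_workers value and the train_dl_workers name are just for illustration) of asking PyTorch's DataLoader to load batches in background worker processes:
In [ ]:
# sketch: num_workers>0 makes the DataLoader fetch items from the Dataset in worker processes
train_dl_workers = DataLoader(train_ds, batch_size=bs, shuffle=True, num_workers=2)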
You should always also have a validation set, in order to identify whether you are overfitting.
We will calculate and print the validation loss at the end of each epoch.
(Note that we always call model.train() before training, and model.eval() before inference, because these are used by layers such as nn.BatchNorm2d and nn.Dropout to ensure appropriate behaviour for these different phases.)
In [ ]:
def fit(epochs, model, loss_func, opt, train_dl, valid_dl):
    for epoch in range(epochs):
        # Handle batchnorm / dropout
        model.train()
        # print(model.training)
        for xb,yb in train_dl:
            loss = loss_func(model(xb), yb)
            loss.backward()
            opt.step()
            opt.zero_grad()

        model.eval()
        # print(model.training)
        with torch.no_grad():
            tot_loss,tot_acc = 0.,0.
            for xb,yb in valid_dl:
                pred = model(xb)
                tot_loss += loss_func(pred, yb)
                tot_acc  += accuracy (pred,yb)
        nv = len(valid_dl)
        print(epoch, tot_loss/nv, tot_acc/nv)
    return tot_loss/nv, tot_acc/nv
Question: Are these validation results correct if batch size varies?
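One way to see the issue (a minimal sketch; the evaluate helper is hypothetical): the loop above averages per-batch averages, which only equals the true mean when every batch has the same size. Weighting each batch by its size gives the exact result:
In [ ]:
# sketch: weight each batch's loss/accuracy by its size so the average is exact
def evaluate(model, valid_dl):
    model.eval()
    with torch.no_grad():
        tot_loss,tot_acc,count = 0.,0.,0
        for xb,yb in valid_dl:
            pred = model(xb)
            n = len(xb)
            tot_loss += loss_func(pred, yb)*n
            tot_acc  += accuracy (pred, yb)*n
            count    += n
    return tot_loss/count, tot_acc/count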
get_dls returns dataloaders for the training and validation sets:
In [ ]:
#export
def get_dls(train_ds, valid_ds, bs, **kwargs):
    return (DataLoader(train_ds, batch_size=bs, shuffle=True, **kwargs),
            DataLoader(valid_ds, batch_size=bs*2, **kwargs))
Now, our whole process of obtaining the data loaders and fitting the model can be run in 3 lines of code:
In [ ]:
train_dl,valid_dl = get_dls(train_ds, valid_ds, bs)
model,opt = get_model()
loss,acc = fit(5, model, loss_func, opt, train_dl, valid_dl)
In [ ]:
assert acc>0.9
In [ ]:
!python notebook2script.py 03_minibatch_training.ipynb
In [ ]: