In [ ]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
In [ ]:
#export
from exp.nb_05b import *
torch.set_num_threads(2)
In [ ]:
x_train,y_train,x_valid,y_valid = get_data()
Helper function to quickly normalize with the mean and standard deviation from our training set:
In [ ]:
#export
def normalize_to(train, valid):
    m,s = train.mean(),train.std()
    return normalize(train, m, s), normalize(valid, m, s)
In [ ]:
x_train,x_valid = normalize_to(x_train,x_valid)
train_ds,valid_ds = Dataset(x_train, y_train),Dataset(x_valid, y_valid)
Let's check it behaved properly.
In [ ]:
x_train.mean(),x_train.std()
Out[ ]:
In [ ]:
nh,bs = 50,512
c = y_train.max().item()+1
loss_func = F.cross_entropy
data = DataBunch(*get_dls(train_ds, valid_ds, bs), c)
To refactor layers, it's useful to have a Lambda layer that can take a basic function and convert it to a layer you can put in nn.Sequential.
NB: if you use a Lambda layer with an anonymous lambda function, your model won't pickle, so you won't be able to save it with PyTorch. It's best to give a name to the function you're using inside your Lambda (like flatten below); there's a quick check of this right after the next cell.
In [ ]:
#export
class Lambda(nn.Module):
    def __init__(self, func):
        super().__init__()
        self.func = func

    def forward(self, x): return self.func(x)

def flatten(x): return x.view(x.shape[0], -1)
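A quick check of the pickling point above (a minimal sketch, not part of the original notebook, using only the standard-library pickle module): a Lambda wrapping the named flatten serializes fine, while one wrapping an anonymous lambda fails.
In [ ]:
import pickle

len(pickle.dumps(Lambda(flatten)))   # works: flatten is a named, module-level function
try: pickle.dumps(Lambda(lambda x: x.view(x.shape[0], -1)))
except (pickle.PicklingError, AttributeError) as e: print('cannot pickle:', e)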
This one takes the flat vector of size bs x 784 and puts it back as a batch of images of 28 by 28 pixels:
In [ ]:
def mnist_resize(x): return x.view(-1, 1, 28, 28)
We can now define a simple CNN.
In [ ]:
def get_cnn_model(data):
    return nn.Sequential(
        Lambda(mnist_resize),
        nn.Conv2d( 1, 8, 5, padding=2,stride=2), nn.ReLU(), #14
        nn.Conv2d( 8,16, 3, padding=1,stride=2), nn.ReLU(), # 7
        nn.Conv2d(16,32, 3, padding=1,stride=2), nn.ReLU(), # 4
        nn.Conv2d(32,32, 3, padding=1,stride=2), nn.ReLU(), # 2
        nn.AdaptiveAvgPool2d(1),
        Lambda(flatten),
        nn.Linear(32,data.c)
    )
In [ ]:
model = get_cnn_model(data)
Basic callbacks from the previous notebook:
In [ ]:
cbfs = [Recorder, partial(AvgStatsCallback,accuracy)]
In [ ]:
opt = optim.SGD(model.parameters(), lr=0.4)
learn = Learner(model, opt, loss_func, data)
run = Runner(cb_funcs=cbfs)
In [ ]:
%time run.fit(1, learn)
This took a long time to run, so it's time to use a GPU. A simple Callback can make sure the model, inputs and targets are all on the same device.
In [ ]:
# Somewhat more flexible way
device = torch.device('cuda',0)
In [ ]:
class CudaCallback(Callback):
    def __init__(self,device): self.device=device
    def begin_fit(self): self.model.to(self.device)
    def begin_batch(self): self.run.xb,self.run.yb = self.xb.to(self.device),self.yb.to(self.device)
In [ ]:
# Somewhat less flexible, but quite convenient
torch.cuda.set_device(device)
In [ ]:
#export
class CudaCallback(Callback):
    def begin_fit(self): self.model.cuda()
    def begin_batch(self): self.run.xb,self.run.yb = self.xb.cuda(),self.yb.cuda()
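If you sometimes run on a machine without a GPU, a device-agnostic variant is easy to sketch (a hypothetical callback, not exported, using the same Runner attributes as above):
In [ ]:
class DeviceCallback(Callback):
    # Hypothetical variant: falls back to the CPU when no GPU is available.
    def __init__(self, device=None):
        self.device = device or torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    def begin_fit(self): self.model.to(self.device)
    def begin_batch(self): self.run.xb,self.run.yb = self.xb.to(self.device),self.yb.to(self.device)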
In [ ]:
cbfs.append(CudaCallback)
In [ ]:
model = get_cnn_model(data)
In [ ]:
opt = optim.SGD(model.parameters(), lr=0.4)
learn = Learner(model, opt, loss_func, data)
run = Runner(cb_funcs=cbfs)
In [ ]:
%time run.fit(3, learn)
Now, that's definitely faster!
First, we can group each conv/ReLU pair into a single function:
In [ ]:
def conv2d(ni, nf, ks=3, stride=2):
    return nn.Sequential(
        nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride), nn.ReLU())
We can also move the MNIST resize into a batch transform, applied with a Callback.
In [ ]:
#export
class BatchTransformXCallback(Callback):
    _order=2
    def __init__(self, tfm): self.tfm = tfm
    def begin_batch(self): self.run.xb = self.tfm(self.xb)

def view_tfm(*size):
    def _inner(x): return x.view(*((-1,)+size))
    return _inner
In [ ]:
mnist_view = view_tfm(1,28,28)
cbfs.append(partial(BatchTransformXCallback, mnist_view))
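As a quick check (a small sketch, not in the original notebook), the closure returned by view_tfm simply reshapes a flat batch to the stored size:
In [ ]:
mnist_view(torch.randn(16, 784)).shape   # -> torch.Size([16, 1, 28, 28])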
With the AdaptiveAvgPool, this model can now work on any size input:
In [ ]:
nfs = [8,16,32,32]
In [ ]:
def get_cnn_layers(data, nfs):
    nfs = [1] + nfs
    return [
        conv2d(nfs[i], nfs[i+1], 5 if i==0 else 3)
        for i in range(len(nfs)-1)
    ] + [nn.AdaptiveAvgPool2d(1), Lambda(flatten), nn.Linear(nfs[-1], data.c)]

def get_cnn_model(data, nfs): return nn.Sequential(*get_cnn_layers(data, nfs))
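As a quick sanity check of the claim above (a small sketch, not in the original notebook): the same stack of layers handles different spatial sizes, because AdaptiveAvgPool2d always reduces to 1x1 before the linear layer.
In [ ]:
m = get_cnn_model(data, nfs)
m(torch.randn(2, 1, 28, 28)).shape, m(torch.randn(2, 1, 64, 64)).shape   # both torch.Size([2, 10])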
And this helper function will quickly give us everything needed to run the training.
In [ ]:
#export
def get_runner(model, data, lr=0.6, cbs=None, opt_func=None, loss_func = F.cross_entropy):
    if opt_func is None: opt_func = optim.SGD
    opt = opt_func(model.parameters(), lr=lr)
    learn = Learner(model, opt, loss_func, data)
    return learn, Runner(cb_funcs=listify(cbs))
In [ ]:
model = get_cnn_model(data, nfs)
learn,run = get_runner(model, data, lr=0.4, cbs=cbfs)
In [ ]:
model
Out[ ]:
In [ ]:
run.fit(3, learn)
Let's say we want to do some telemetry, and want the mean and standard deviation of the activations of each layer in the model. First we can do it manually like this:
In [ ]:
class SequentialModel(nn.Module):
    def __init__(self, *layers):
        super().__init__()
        self.layers = nn.ModuleList(layers)
        self.act_means = [[] for _ in layers]
        self.act_stds  = [[] for _ in layers]

    def __call__(self, x):
        for i,l in enumerate(self.layers):
            x = l(x)
            self.act_means[i].append(x.data.mean())
            self.act_stds [i].append(x.data.std ())
        return x

    def __iter__(self): return iter(self.layers)
In [ ]:
model = SequentialModel(*get_cnn_layers(data, nfs))
learn,run = get_runner(model, data, lr=0.9, cbs=cbfs)
In [ ]:
run.fit(2, learn)
Now we can have a look at the means and stds of the activations at the beginning of training.
In [ ]:
for l in model.act_means: plt.plot(l)
plt.legend(range(6));
In [ ]:
for l in model.act_stds: plt.plot(l)
plt.legend(range(6));
In [ ]:
for l in model.act_means: plt.plot(l[:10])
plt.legend(range(6));
In [ ]:
for l in model.act_stds: plt.plot(l[:10])
plt.legend(range(6));
Hooks are PyTorch objects you can add to any nn.Module. A hook is called when the layer it is registered to is executed during the forward pass (forward hook) or the backward pass (backward hook).
Hooks don't require us to rewrite the model.
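Here is a minimal standalone sketch (the names are illustrative) of registering a forward hook on a single layer, letting it fire once, and removing it:
In [ ]:
lin = nn.Linear(4, 2)
def print_shape(module, inp, outp): print(module.__class__.__name__, outp.shape)
handle = lin.register_forward_hook(print_shape)   # returns a handle we can use to remove the hook
_ = lin(torch.randn(3, 4))                        # prints: Linear torch.Size([3, 2])
handle.remove()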
In [ ]:
model = get_cnn_model(data, nfs)
learn,run = get_runner(model, data, lr=0.5, cbs=cbfs)
In [ ]:
act_means = [[] for _ in model]
act_stds = [[] for _ in model]
A hook is attached to a layer and needs a function that takes three arguments: module, input, output. Here we store the mean and std of the output in the correct position of our lists.
In [ ]:
def append_stats(i, mod, inp, outp):
    act_means[i].append(outp.data.mean())
    act_stds [i].append(outp.data.std())
In [ ]:
for i,m in enumerate(model): m.register_forward_hook(partial(append_stats, i))
In [ ]:
run.fit(1, learn)
In [ ]:
for o in act_means: plt.plot(o)
plt.legend(range(5));
We can refactor this into a Hook class. It's very important to remove the hooks when they are deleted, otherwise references will be kept around and the memory won't be properly released when your model is deleted.
In [ ]:
#export
def children(m): return list(m.children())

class Hook():
    def __init__(self, m, f): self.hook = m.register_forward_hook(partial(f, self))
    def remove(self): self.hook.remove()
    def __del__(self): self.remove()

def append_stats(hook, mod, inp, outp):
    if not hasattr(hook,'stats'): hook.stats = ([],[])
    means,stds = hook.stats
    means.append(outp.data.mean())
    stds .append(outp.data.std())
NB: In fastai we use a bool param to choose whether to make it a forward or backward hook. In the above version we're only supporting forward hooks.
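A minimal sketch of what such a flag could look like (a hypothetical variant, not the actual fastai implementation, and not used below; it assumes a PyTorch version that provides register_full_backward_hook):
In [ ]:
class HookFB():
    # Hypothetical Hook variant with a forward/backward switch. Note that a backward hook
    # function receives (module, grad_input, grad_output), where the gradients are tuples of tensors.
    def __init__(self, m, f, is_forward=True):
        register = m.register_forward_hook if is_forward else m.register_full_backward_hook
        self.hook = register(partial(f, self))
    def remove(self): self.hook.remove()
    def __del__(self): self.remove()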
In [ ]:
model = get_cnn_model(data, nfs)
learn,run = get_runner(model, data, lr=0.5, cbs=cbfs)
In [ ]:
hooks = [Hook(l, append_stats) for l in children(model[:4])]
In [ ]:
run.fit(1, learn)
In [ ]:
for h in hooks:
    plt.plot(h.stats[0])
    h.remove()
plt.legend(range(4));
Let's design our own class that can contain a list of objects. It will behave a bit like a numpy array in the sense that we can index into it with a single index, a slice, a list of indices, or a boolean mask ([True,False,False,True,...]). The __iter__ method is there to be able to do things like for x in ....
In [ ]:
#export
class ListContainer():
    def __init__(self, items): self.items = listify(items)
    def __getitem__(self, idx):
        if isinstance(idx, (int,slice)): return self.items[idx]
        if isinstance(idx[0],bool):
            assert len(idx)==len(self) # bool mask
            return [o for m,o in zip(idx,self.items) if m]
        return [self.items[i] for i in idx]
    def __len__(self): return len(self.items)
    def __iter__(self): return iter(self.items)
    def __setitem__(self, i, o): self.items[i] = o
    def __delitem__(self, i): del(self.items[i])
    def __repr__(self):
        res = f'{self.__class__.__name__} ({len(self)} items)\n{self.items[:10]}'
        if len(self)>10: res = res[:-1]+ '...]'
        return res
In [ ]:
ListContainer(range(10))
Out[ ]:
In [ ]:
ListContainer(range(100))
Out[ ]:
In [ ]:
t = ListContainer(range(10))
t[[1,2]], t[[False]*8 + [True,False]]
Out[ ]:
We can use it to write a Hooks class that contains several hooks. We will also use it in the next notebook as a container for our objects in the data block API.
In [ ]:
#export
from torch.nn import init
class Hooks(ListContainer):
    def __init__(self, ms, f): super().__init__([Hook(m, f) for m in ms])
    def __enter__(self, *args): return self
    def __exit__ (self, *args): self.remove()
    def __del__(self): self.remove()

    def __delitem__(self, i):
        self[i].remove()
        super().__delitem__(i)

    def remove(self):
        for h in self: h.remove()
In [ ]:
model = get_cnn_model(data, nfs).cuda()
learn,run = get_runner(model, data, lr=0.9, cbs=cbfs)
In [ ]:
hooks = Hooks(model, append_stats)
hooks
Out[ ]:
In [ ]:
hooks.remove()
In [ ]:
x,y = next(iter(data.train_dl))
x = mnist_resize(x).cuda()
In [ ]:
x.mean(),x.std()
Out[ ]:
In [ ]:
p = model[0](x)
p.mean(),p.std()
Out[ ]:
In [ ]:
for l in model:
    if isinstance(l, nn.Sequential):
        init.kaiming_normal_(l[0].weight)
        l[0].bias.data.zero_()
In [ ]:
p = model[0](x)
p.mean(),p.std()
Out[ ]:
Having given an __enter__ and __exit__ method to our Hooks class, we can use it as a context manager. This makes sure that once we are out of the with block, all the hooks have been removed and aren't there to pollute our memory.
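A quick way to see this (a small sketch, not in the original notebook): statistics are recorded inside the with block, but a forward pass after it no longer appends anything.
In [ ]:
with Hooks(model, append_stats) as hooks:
    _ = model(x)                      # hooks fire: one entry per hooked layer
    n = len(hooks[0].stats[0])
_ = model(x)                          # hooks have been removed by __exit__
len(hooks[0].stats[0]) == n           # -> True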
In [ ]:
with Hooks(model, append_stats) as hooks:
    run.fit(2, learn)

    fig,(ax0,ax1) = plt.subplots(1,2, figsize=(10,4))
    for h in hooks:
        ms,ss = h.stats
        ax0.plot(ms[:10])
        ax1.plot(ss[:10])
    plt.legend(range(6));

    fig,(ax0,ax1) = plt.subplots(1,2, figsize=(10,4))
    for h in hooks:
        ms,ss = h.stats
        ax0.plot(ms)
        ax1.plot(ss)
    plt.legend(range(6));
Let's store more than the means and stds and plot histograms of our activations now.
In [ ]:
def append_stats(hook, mod, inp, outp):
    if not hasattr(hook,'stats'): hook.stats = ([],[],[])
    means,stds,hists = hook.stats
    means.append(outp.data.mean().cpu())
    stds .append(outp.data.std().cpu())
    hists.append(outp.data.cpu().histc(40,0,10)) #histc isn't implemented on the GPU
In [ ]:
model = get_cnn_model(data, nfs).cuda()
learn,run = get_runner(model, data, lr=0.9, cbs=cbfs)
In [ ]:
for l in model:
    if isinstance(l, nn.Sequential):
        init.kaiming_normal_(l[0].weight)
        l[0].bias.data.zero_()
In [ ]:
with Hooks(model, append_stats) as hooks: run.fit(1, learn)
In [ ]:
# Thanks to @ste for initial version of histogram plotting code
# Stack the per-iteration histograms into a (bins x iterations) image; log1p makes small counts visible.
def get_hist(h): return torch.stack(h.stats[2]).t().float().log1p()
In [ ]:
fig,axes = plt.subplots(2,2, figsize=(15,6))
for ax,h in zip(axes.flatten(), hooks[:4]):
    ax.imshow(get_hist(h), origin='lower')
    ax.axis('off')
plt.tight_layout()
From the histograms, we can easily extract more information, such as the fraction of activations that are close to zero (the bottom bins of each histogram):
In [ ]:
def get_min(h):
    h1 = torch.stack(h.stats[2]).t().float()
    return h1[:2].sum(0)/h1.sum(0)    # fraction of activations in the two lowest bins (near zero)
In [ ]:
fig,axes = plt.subplots(2,2, figsize=(15,6))
for ax,h in zip(axes.flatten(), hooks[:4]):
    ax.plot(get_min(h))
    ax.set_ylim(0,1)
plt.tight_layout()
Now let's use our model with a generalized ReLU that can be shifted and capped at a maximum value.
In [ ]:
#export
def get_cnn_layers(data, nfs, layer, **kwargs):
    nfs = [1] + nfs
    return [layer(nfs[i], nfs[i+1], 5 if i==0 else 3, **kwargs)
            for i in range(len(nfs)-1)] + [
        nn.AdaptiveAvgPool2d(1), Lambda(flatten), nn.Linear(nfs[-1], data.c)]

def conv_layer(ni, nf, ks=3, stride=2, **kwargs):
    return nn.Sequential(
        nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride), GeneralRelu(**kwargs))

class GeneralRelu(nn.Module):
    def __init__(self, leak=None, sub=None, maxv=None):
        super().__init__()
        self.leak,self.sub,self.maxv = leak,sub,maxv

    def forward(self, x):
        x = F.leaky_relu(x,self.leak) if self.leak is not None else F.relu(x)
        if self.sub is not None: x.sub_(self.sub)
        if self.maxv is not None: x.clamp_max_(self.maxv)
        return x

def init_cnn(m, uniform=False):
    f = init.kaiming_uniform_ if uniform else init.kaiming_normal_
    for l in m:
        if isinstance(l, nn.Sequential):
            f(l[0].weight, a=0.1)
            l[0].bias.data.zero_()

def get_cnn_model(data, nfs, layer, **kwargs):
    return nn.Sequential(*get_cnn_layers(data, nfs, layer, **kwargs))
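To see what the extra parameters do (a small sketch, not in the original notebook), here is GeneralRelu applied to a few values with leak=0.1, sub=0.4 and maxv=6.: negatives are multiplied by the leak, then the shift is subtracted, then values are clamped at the maximum.
In [ ]:
act = GeneralRelu(leak=0.1, sub=0.4, maxv=6.)
act(torch.tensor([-10., -1., 0., 1., 100.]))   # -> tensor([-1.4000, -0.5000, -0.4000, 0.6000, 6.0000])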
In [ ]:
def append_stats(hook, mod, inp, outp):
    if not hasattr(hook,'stats'): hook.stats = ([],[],[])
    means,stds,hists = hook.stats
    means.append(outp.data.mean().cpu())
    stds .append(outp.data.std().cpu())
    hists.append(outp.data.cpu().histc(40,-7,7))
In [ ]:
model = get_cnn_model(data, nfs, conv_layer, leak=0.1, sub=0.4, maxv=6.)
init_cnn(model)
learn,run = get_runner(model, data, lr=0.9, cbs=cbfs)
In [ ]:
with Hooks(model, append_stats) as hooks:
    run.fit(1, learn)

    fig,(ax0,ax1) = plt.subplots(1,2, figsize=(10,4))
    for h in hooks:
        ms,ss,hi = h.stats
        ax0.plot(ms[:10])
        ax1.plot(ss[:10])
        h.remove()
    plt.legend(range(5));

    fig,(ax0,ax1) = plt.subplots(1,2, figsize=(10,4))
    for h in hooks:
        ms,ss,hi = h.stats
        ax0.plot(ms)
        ax1.plot(ss)
    plt.legend(range(5));
In [ ]:
fig,axes = plt.subplots(2,2, figsize=(15,6))
for ax,h in zip(axes.flatten(), hooks[:4]):
    ax.imshow(get_hist(h), origin='lower')
    ax.axis('off')
plt.tight_layout()
In [ ]:
def get_min(h):
    h1 = torch.stack(h.stats[2]).t().float()
    return h1[19:22].sum(0)/h1.sum(0)    # fraction of activations in the bins around zero
In [ ]:
fig,axes = plt.subplots(2,2, figsize=(15,6))
for ax,h in zip(axes.flatten(), hooks[:4]):
    ax.plot(get_min(h))
    ax.set_ylim(0,1)
plt.tight_layout()
In [ ]:
#export
def get_learn_run(nfs, data, lr, layer, cbs=None, opt_func=None, uniform=False, **kwargs):
    model = get_cnn_model(data, nfs, layer, **kwargs)
    init_cnn(model, uniform=uniform)
    return get_runner(model, data, lr=lr, cbs=cbs, opt_func=opt_func)
In [ ]:
sched = combine_scheds([0.5, 0.5], [sched_cos(0.2, 1.), sched_cos(1., 0.1)])
In [ ]:
learn,run = get_learn_run(nfs, data, 1., conv_layer, cbs=cbfs+[partial(ParamScheduler,'lr', sched)])
In [ ]:
run.fit(8, learn)
Uniform init may provide more useful initial weights (a normal distribution puts a lot of them close to zero).
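One way to see the difference (a small sketch, not from the original notebook; the 0.1 threshold and the fan_in of 25 are arbitrary choices) is to compare the fraction of near-zero weights under the two schemes:
In [ ]:
wn = init.kaiming_normal_ (torch.empty(1024, 25), a=0.1)
wu = init.kaiming_uniform_(torch.empty(1024, 25), a=0.1)
# fraction of weights with |w| < 0.1 under each init (normal concentrates more mass near zero)
(wn.abs() < 0.1).float().mean(), (wu.abs() < 0.1).float().mean()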
In [ ]:
learn,run = get_learn_run(nfs, data, 1., conv_layer, uniform=True,
                          cbs=cbfs+[partial(ParamScheduler,'lr', sched)])
In [ ]:
run.fit(8, learn)
Here's a handy way to export our module without needing to update the file name - after we define this, we can just use nb_auto_export() in the future (h/t Stas Bekman):
In [ ]:
#export
from IPython.display import display, Javascript
def nb_auto_export():
    display(Javascript("""{
const ip = IPython.notebook
if (ip) {
    ip.save_notebook()
    console.log('a')
    const s = `!python notebook2script.py ${ip.notebook_name}`
    if (ip.kernel) { ip.kernel.execute(s) }
}
}"""))
In [ ]:
nb_auto_export()
In [ ]: