This is a reproduction of the experiments described and shown in Section 4.3 of the BiternionNets paper, including the results shown in Table 3, as well as Table 4 in Section 5.
In [1]:
import numpy as np
import pickle, gzip
from collections import Counter
In [2]:
%matplotlib inline
# Font which got unicode math stuff.
import matplotlib as mpl
mpl.rcParams['font.family'] = 'DejaVu Sans'
# Much more readable plots
import matplotlib.pyplot as plt
plt.style.use('ggplot')
In [3]:
import DeepFried2 as df
In [4]:
from lbtoolbox.thutil import count_params
from lbtoolbox.augmentation import AugmentationPipeline, Cropper
Functions for training and evaluating that are shared across notebooks.
In [5]:
from training_utils import dotrain, dostats, dopred
These functions can be used for "cyclic" filtering of a 1D array, such as the histogram of angles that will be used later.
In [6]:
def boxfilter(n):
return np.ones(n)/n
def gaussfilter(n, sigma=0.3, retx=False, norm=np.sum):
x = np.arange(-(n-1)/2, (n+1)/2)
x /= np.max(x)
y = 1/(sigma * np.sqrt(2*np.pi)) * np.exp(-x**2/(2*sigma**2))
if norm is not None:
y /= norm(y)
return (x, y) if retx else y
def cyclic_filter(a, f):
    # Pad in a cyclic way so that a 'valid' correlation yields a 'same'-sized cyclic result.
a = np.pad(a, pad_width=len(f)//2, mode='wrap')
# While it doesn't make a difference here because `f` is symmetric,
# we use `correlate` because `convolve` flips the filter.
return np.correlate(a, f, mode='valid')
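As a quick sanity check (not part of the original notebook), the filtering really is cyclic: a spike at the very first bin leaks into the last bins as well.
In [ ]:
# Illustrative check: a spike at bin 0 should spread into the last bins too.
h = np.zeros(12)
h[0] = 1.0
print(np.round(cyclic_filter(h, gaussfilter(5)), 3))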
The following function generates a histogram-like heatmap over the cyclic range [0,360) from an array of predictions preds.
In [7]:
def mkheatmap_deg(preds, nbins=360):
"""
- `preds`: The array of angle predictions.
    - `nbins`: Obvious.

    Returns raw (unnormalized) counts; normalize externally, e.g. divide by the
    sum for a "frequency" histogram or by the max to put the peak at 1.
"""
# Make sure to be in [0,360) range for Counter.
preds = (preds + 3600) % 360
hm = np.zeros(nbins)
# Discretize the predictions into the bins we have.
ipreds = (preds/(360/nbins)).astype(int)
counter = Counter(ipreds)
# Fill heatmap.
for a, n in counter.items():
hm[a] += n
return hm
Here's an example of how to use this, together with filtering/smoothing:
In [8]:
fig, axes = plt.subplots(1,2, figsize=(12,4))
hm = mkheatmap_deg(np.array([15, 15, 45.]), nbins=3600)
axes[0].plot(np.linspace(0, 360, len(hm)), hm);
hm = cyclic_filter(hm, gaussfilter(n=501))
axes[1].plot(np.linspace(0, 360, len(hm)), hm);
The following creates a "donut heatmap" which is the donut-shaped histogram introduced in the paper.
In [9]:
def donut_heatmap(hm, bg, R, zero_rad=np.deg2rad(-90), colormap=mpl.cm.Spectral_r, aapow=None):
"""
    - `hm`: The heatmap distribution, e.g. as returned by `mkheatmap_deg` (possibly filtered and normalized).
    - `bg`: Either a tuple specifying the size of the image: `(h, w)` or `(h, w, depth)`.
Or an image (as a (h,w,d) numpy array) on top of which to draw the heatmap.
- `R`: the width, in pixels, of the donut.
- `zero_rad`: The angle at which to place zero, relative to mathematical zero (on the right).
- `colormap`: Obviously, which colormap to use.
- `aapow`: A fiddling parameter for the fake anti-aliasing effect. The higher the sharper, or `None` for no aa.
"""
if isinstance(bg, tuple):
if len(bg) == 2:
bg += (4,)
bg = np.zeros(bg)
else:
bg = np.copy(bg)
assert len(bg.shape) == 3 and bg.shape[-1] in (3,4), "Can only put a donut onto a color image!"
h, w = bg.shape[:2]
assert w == h, "Currently, we only work for squares, because no time to test ellipses!"
# First, take the distribution as-is and draw it into a donut.
cx, cy = w/2, h/2
for y in range(h):
for x in range(w):
l = np.hypot((x-cx), (y-cy))
lc = (l - (w/2-R/2))/(R/2) # Center it at the center of the donut band and put into [-1,1].
if -1 < lc < 1:
angle = (np.rad2deg(np.arctan2(-(y-cy), x-cx) - zero_rad) + 360) % 360
bg[y,x] = colormap(hm[int(angle*len(hm)/360)])
if aapow is not None:
bg[y,x,3] = 1 - (np.exp(lc**aapow)-1)/(np.exp(1)-1)
return bg
And this combines these functions in order to compute and draw a beautiful donut from given predictions.
In [10]:
def donut(ax, hm, **kw):
im = ax.imshow(donut_heatmap(hm, **kw))
ax.grid(False)
ax.axis('off')
ax.patch.set_visible(False)
return im
def donut_default(ax, angles, **kw):
return donut(ax, cyclic_filter(mkheatmap_deg(angles, nbins=3600), gaussfilter(41)), **kw)
In [11]:
fig, ax = plt.subplots(figsize=(4,4))
donut(ax, hm/np.max(hm), bg=(201,201), R=50, aapow=40);
In [12]:
X, y, n = pickle.load(gzip.open('data/TownCentre.pkl.gz', 'rb'))
Now we need to split the dataset into train/test sets. But we need to be careful when splitting: we don't want the exact same person to appear in both sets, as that would make the task extremely easy and close to cheating.
Unfortunately, I didn't save the split which I used for the paper, and hence there's no "official" split. Sorry for that; it's actually a big fuckup of mine. For this reason, the numbers in here can be quite different from the ones in the paper, although the relative improvements of the various methods do stay the same.
In [13]:
def split(X, y, n, split=0.9):
itr, ite, trs, tes = [], [], set(), set()
for i, name in enumerate(n):
# Extract the person's ID.
pid = int(name.split('_')[1])
# Decide where to put that person.
if pid in trs:
itr.append(i)
elif pid in tes:
ite.append(i)
else:
if np.random.rand() < split:
itr.append(i)
trs.add(pid)
else:
ite.append(i)
tes.add(pid)
return (X[itr], y[itr], [n[i] for i in itr]), (X[ite], y[ite], [n[i] for i in ite])
In [14]:
(Xtr, ytr, ntr), (Xte, yte, nte) = split(X, y, n, split=0.9)
Xtr, ytr = Xtr.astype(df.floatX)/255, ytr.astype(df.floatX)
Xte, yte = Xte.astype(df.floatX)/255, yte.astype(df.floatX)
print("Trainset: {}".format(len(Xtr)))
print("Testset: {}".format(len(Xte)))
Some examples of usage. Tune the various parameters until the heatmap looks good in your case. Note that the norm here is just a rescaling that influences which range of the colormap is used. The size of the filter should not be too large or it will wash out the actual distribution, which could be seen as cheating!
In [15]:
fig, axes = plt.subplots(1, 2, figsize=(8,4))
hm_ytr = cyclic_filter(mkheatmap_deg(ytr, nbins=3600), gaussfilter(41))
hm_yte = cyclic_filter(mkheatmap_deg(yte, nbins=3600), gaussfilter(41))
donut(axes[0], hm_ytr/(len(ytr)/400), bg=(201, 201), R=50, aapow=40);
donut(axes[1], hm_yte/(len(yte)/400), bg=(201, 201), R=50, aapow=40);
Here are the "raw", i.e. not re-scaled heatmaps just for reference. Notice the colors are exactly the same as above, but uglier because of higher frequency.
In [16]:
fig, axes = plt.subplots(1, 2, figsize=(8,4))
hm_ytr = cyclic_filter(mkheatmap_deg(ytr, nbins=3600), gaussfilter(41))
hm_yte = cyclic_filter(mkheatmap_deg(yte, nbins=3600), gaussfilter(41))
donut(axes[0], hm_ytr/np.max(hm_ytr), bg=(201, 201), R=50, aapow=40);
donut(axes[1], hm_yte/np.max(hm_yte), bg=(201, 201), R=50, aapow=40);
In [17]:
aug = AugmentationPipeline(Xtr, ytr, Cropper((46,46)))
In [18]:
class Flatten(df.Module):
def symb_forward(self, symb_in):
return symb_in.flatten(2)
In [19]:
def mknet(*outlayers):
return df.Sequential( # 3@46
df.SpatialConvolutionCUDNN( 3, 24, 3, 3), # -> 24@44
df.BatchNormalization(24),
df.ReLU(),
df.SpatialConvolutionCUDNN(24, 24, 3, 3), # -> 24@42
df.BatchNormalization(24),
df.SpatialMaxPoolingCUDNN(2, 2), # -> 24@21
df.ReLU(),
df.SpatialConvolutionCUDNN(24, 48, 3, 3), # -> 48@19
df.BatchNormalization(48),
df.ReLU(),
df.SpatialConvolutionCUDNN(48, 48, 3, 3), # -> 48@17
df.BatchNormalization(48),
df.SpatialMaxPooling(2, 2), # -> 48@9
df.ReLU(),
        df.SpatialConvolutionCUDNN(48, 64, 3, 3), # -> 64@7
        df.BatchNormalization(64),
        df.ReLU(),
        df.SpatialConvolutionCUDNN(64, 64, 3, 3), # -> 64@5
df.BatchNormalization(64),
df.ReLU(),
df.Dropout(0.2),
Flatten(),
df.Linear(64*5*5, 512),
df.ReLU(),
df.Dropout(0.5),
*outlayers
)
In [20]:
def ensemble_degrees(angles):
return np.arctan2(np.mean(np.sin(np.deg2rad(angles)), axis=0), np.mean(np.cos(np.deg2rad(angles)), axis=0))
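To see why the ensembling goes through sine/cosine instead of a plain arithmetic mean, here is a tiny illustrative check (made-up angles, not part of the original notebook). Note that ensemble_degrees returns radians; the conversion back to degrees happens in dopred_deg below.
In [ ]:
# The cyclic mean of 350° and 10° is 0°; a naive arithmetic mean would say 180°.
demo = np.array([[350.], [10.]])
print(np.rad2deg(ensemble_degrees(demo)))  # ~[0.]
print(np.mean(demo, axis=0))               # [180.]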
In [21]:
def dopred_deg(model, aug, X, batchsize=100):
return np.rad2deg(dopred(model, aug, X, ensembling=ensemble_degrees, output2preds=lambda x: x, batchsize=batchsize))
In [22]:
def maad_from_deg(preds, reals):
return np.rad2deg(np.abs(np.arctan2(np.sin(np.deg2rad(reals-preds)), np.cos(np.deg2rad(reals-preds)))))
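A small illustrative check (not from the paper) that this error measure respects the wrap-around:
In [ ]:
# The angular error between 359° and 1° is 2°, not 358°.
print(maad_from_deg(np.array([359.]), np.array([1.])))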
In [23]:
def show_errs_deg(preds, reals, epoch=-1):
errs = maad_from_deg(preds, reals)
mean_errs = np.mean(errs, axis=1)
std_errs = np.std(errs, axis=1)
print("Error: {:5.2f}°±{:5.2f}°".format(np.mean(mean_errs), np.mean(std_errs)))
print("Stdev: {:5.2f}°±{:5.2f}°".format(np.std(mean_errs), np.std(std_errs)))
In [24]:
nets_shallow_linreg = [df.Sequential(
Flatten(),
df.Linear(3*46*46, 1, initW=df.init.const(0)),
) for _ in range(5)]
print('{:.3f}M params'.format(count_params(nets_shallow_linreg[0])/1000000))
In [25]:
trains_shallow_linreg = [dotrain(net, df.MADCriterion(), aug, Xtr, ytr[:,None]) for net in nets_shallow_linreg]
In [26]:
y_preds_shallow_linreg = [dopred_deg(net, aug, Xte) for net in nets_shallow_linreg]
In [27]:
show_errs_deg(y_preds_shallow_linreg, yte[:,None])
In [28]:
fig, axes = plt.subplots(1, 2, figsize=(8,4))
hm_pred = cyclic_filter(mkheatmap_deg(np.concatenate(y_preds_shallow_linreg)[:,0], nbins=3600)/5, gaussfilter(41))
hm_real = cyclic_filter(mkheatmap_deg(yte, nbins=3600), gaussfilter(41))
donut(axes[0], hm_pred/(len(yte)/400), bg=(201, 201), R=50, aapow=40);
donut(axes[1], hm_real/(len(yte)/400), bg=(201, 201), R=50, aapow=40);
In [29]:
nets_linreg = [mknet(df.Linear(512, 1, initW=df.init.const(0))) for _ in range(5)]
print('{:.3f}M params'.format(count_params(nets_linreg[0])/1000000))
In [30]:
trains_linreg = [dotrain(net, df.MADCriterion(), aug, Xtr, ytr[:,None]) for net in nets_linreg]
In [31]:
for model in nets_linreg:
dostats(model, aug, Xtr, batchsize=1000)
In [32]:
y_preds_linreg = [dopred_deg(net, aug, Xte) for net in nets_linreg]
In [33]:
show_errs_deg(y_preds_linreg, yte[:,None])
In [34]:
fig, axes = plt.subplots(1, 2, figsize=(8,4))
hm_pred = cyclic_filter(mkheatmap_deg(np.concatenate(y_preds_linreg)[:,0], nbins=3600)/5, gaussfilter(41))
hm_real = cyclic_filter(mkheatmap_deg(yte, nbins=3600), gaussfilter(41))
donut(axes[0], hm_pred/(len(yte)/400), bg=(201, 201), R=50, aapow=40);
donut(axes[1], hm_real/(len(yte)/400), bg=(201, 201), R=50, aapow=40);
This experiment is not present in the paper; it is only here to show that the improvements do not come from the smaller output range of biternions. Here, the angles are regressed in radians, i.e. in a small range. Spoiler: the results are the same as for degrees.
In [35]:
nets_linreg_rad = [mknet(df.Linear(512, 1, initW=df.init.const(0))) for _ in range(5)]
print('{:.3f}M params'.format(count_params(nets_linreg_rad[0])/1000000))
In [36]:
trains_linreg_rad = [dotrain(net, df.MADCriterion(), aug, Xtr, np.deg2rad(ytr[:,None])) for net in nets_linreg_rad]
In [37]:
for model in nets_linreg_rad:
dostats(model, aug, Xtr, batchsize=1000)
In [38]:
def ensemble_radians(angles):
return np.arctan2(np.mean(np.sin(angles), axis=0), np.mean(np.cos(angles), axis=0))
In [39]:
def dopred_rad(model, aug, X, batchsize=100):
return dopred(model, aug, X, ensembling=ensemble_radians, output2preds=lambda x: x, batchsize=batchsize)
In [40]:
y_preds_linreg_rad = [dopred_rad(net, aug, Xte) for net in nets_linreg_rad]
In [41]:
show_errs_deg(np.rad2deg(y_preds_linreg_rad), yte[:,None])
In [42]:
fig, axes = plt.subplots(1, 2, figsize=(8,4))
hm_pred = cyclic_filter(mkheatmap_deg(np.concatenate(np.rad2deg(y_preds_linreg_rad))[:,0], nbins=3600)/5, gaussfilter(41))
hm_real = cyclic_filter(mkheatmap_deg(yte, nbins=3600), gaussfilter(41))
donut(axes[0], hm_pred/(len(yte)/400), bg=(201, 201), R=50, aapow=40);
donut(axes[1], hm_real/(len(yte)/400), bg=(201, 201), R=50, aapow=40);
In [43]:
nets_linreg_mod = [mknet(df.Linear(512, 1, initW=df.init.const(0))) for _ in range(5)]
print('{:.3f}M params'.format(count_params(nets_linreg_mod[0])/1000000))
In [44]:
class ModuloMADCriterion(df.Criterion):
def symb_forward(self, symb_in, symb_tgt):
self._assert_same_dim(symb_in, symb_tgt)
return df.T.mean(abs(symb_in - symb_tgt) % 360)
In [45]:
trains_linreg_mod = [dotrain(net, ModuloMADCriterion(), aug, Xtr, ytr[:,None]) for net in nets_linreg_mod]
In [46]:
for model in nets_linreg_mod:
dostats(model, aug, Xtr, batchsize=1000)
In [47]:
y_preds_linreg_mod = [dopred_deg(net, aug, Xte) for net in nets_linreg_mod]
In [48]:
show_errs_deg(y_preds_linreg_mod, yte[:,None])
In [49]:
fig, axes = plt.subplots(1, 2, figsize=(8,4))
hm_pred = cyclic_filter(mkheatmap_deg(np.concatenate(y_preds_linreg_mod)[:,0], nbins=3600)/5, gaussfilter(41))
hm_real = cyclic_filter(mkheatmap_deg(yte, nbins=3600), gaussfilter(41))
donut(axes[0], hm_pred/(len(yte)/400), bg=(201, 201), R=50, aapow=40);
donut(axes[1], hm_real/(len(yte)/400), bg=(201, 201), R=50, aapow=40);
As you can see, the mod in the criterion doesn't seem to change anything at all compared to the plain linear case. That's because our initialization of the last layer makes it output all-zeros at first, and from there on it will be "pulled upwards" so as to output only values within [0,360] almost all the time. Within this range, mod is exactly a no-op, so nothing bad happens.
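A quick numeric illustration of that no-op claim (made-up values):
In [ ]:
# As long as predictions and targets both lie in [0, 360), their absolute
# difference is already below 360, so the modulo changes nothing.
pred, tgt = np.array([10., 350.]), np.array([200., 5.])
print(np.abs(pred - tgt) % 360)
print(np.abs(pred - tgt))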
But actually, even when forcing the network outside this "nice" range initially (using a large initialization of the output layer), mod behaves the same as no-mod. Here is such a large initialization as a reference; it's left as an exercise to the reader (as if there was one, heh) to do the same experiment without the mod.
In [50]:
net_linreg_mod2 = mknet(df.Linear(512, 1, initW=df.init.normal(20)))
net_linreg_mod2.forward(aug.augbatch_train(Xtr[:10])[0]).T
In [51]:
train_linreg_mod2 = dotrain(net_linreg_mod2, ModuloMADCriterion(), aug, Xtr, ytr[:,None])
In [52]:
class VonMisesCriterion(df.Criterion):
def __init__(self, kappa, radians=True):
df.Criterion.__init__(self)
self.kappa = kappa
        self.torad = 1 if radians else np.pi/180  # conversion factor from degrees to radians
def symb_forward(self, symb_in, symb_tgt):
delta_rad = self.torad * (symb_in - symb_tgt)
C = np.exp(2*self.kappa)
return df.T.mean(C - df.T.exp(self.kappa * (1+df.T.cos(delta_rad))))
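To get a feel for the shape of this cost, here is a small numpy sketch (made-up deltas and kappas, not part of the original experiments): small errors are penalized roughly quadratically while large errors saturate at C - 1, and a larger kappa makes the minimum sharper.
In [ ]:
# Evaluate the von-Mises cost of the criterion above for a few angular errors.
deltas_deg = np.array([0., 45., 90., 180.])
for kappa in (0.5, 1.0):
    C = np.exp(2*kappa)
    cost = C - np.exp(kappa * (1 + np.cos(np.deg2rad(deltas_deg))))
    print("kappa =", kappa, "->", np.round(cost, 3))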
In [53]:
nets_linreg_vm = [mknet(df.Linear(512, 1, initW=df.init.const(0))) for _ in range(5)]
print('{:.3f}M params'.format(count_params(nets_linreg_vm[0])/1000000))
In [54]:
trains_linreg_vm = [dotrain(net, VonMisesCriterion(0.5, radians=False), aug, Xtr, ytr[:,None]) for net in nets_linreg_vm]
In [55]:
for model in nets_linreg_vm:
dostats(model, aug, Xtr, batchsize=1000)
In [56]:
y_preds_linreg_vm = [dopred_deg(net, aug, Xte) for net in nets_linreg_vm]
In [57]:
show_errs_deg(y_preds_linreg_vm, yte[:,None])
In [58]:
fig, axes = plt.subplots(1, 2, figsize=(8,4))
hm_pred = cyclic_filter(mkheatmap_deg(np.concatenate(y_preds_linreg_vm)[:,0], nbins=3600)/5, gaussfilter(41))
hm_real = cyclic_filter(mkheatmap_deg(yte, nbins=3600), gaussfilter(41))
donut(axes[0], hm_pred/(len(yte)/400), bg=(201, 201), R=50, aapow=40);
donut(axes[1], hm_real/(len(yte)/400), bg=(201, 201), R=50, aapow=40);
For the biternion output, no matter the criterion, we need a few common functions: first, the biternion layer itself (simply a normalization), and then utilities to convert angles to 2D vectors ("biternions") and vice versa.
In [59]:
class Biternion(df.Module):
def symb_forward(self, symb_in):
return symb_in / df.T.sqrt((symb_in**2).sum(axis=1, keepdims=True))
In [60]:
def deg2bit(angles_deg):
angles_rad = np.deg2rad(angles_deg)
return np.array([np.cos(angles_rad), np.sin(angles_rad)]).T
In [61]:
def bit2deg(angles_bit):
return (np.rad2deg(np.arctan2(angles_bit[:,1], angles_bit[:,0])) + 360) % 360
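A quick round-trip check of the two conversions (made-up angles, not part of the original notebook):
In [ ]:
# Angles -> biternions (unit 2D vectors) -> angles should be the identity.
demo_deg = np.array([0., 90., 225., 359.])
print(deg2bit(demo_deg).shape)                   # (4, 2)
print(np.round(bit2deg(deg2bit(demo_deg)), 3))   # recovers the original angles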
In [62]:
class CosineCriterion(df.Criterion):
def symb_forward(self, symb_in, symb_tgt):
# For normalized `p_t_given_x` and `t`, dot-product (batched)
# outputs a cosine value, i.e. between -1 (worst) and 1 (best)
cos_angles = df.T.batched_dot(symb_in, symb_tgt)
# Rescale to a cost going from 2 (worst) to 0 (best) each, then take mean.
return df.T.mean(1 - cos_angles)
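The batched dot-product of two unit biternions is exactly the cosine of their angular difference, which is what makes this criterion cyclic. A quick numpy check with made-up angles (not part of the original notebook):
In [ ]:
# Both pairs differ by 30°, so both dot-products equal cos(30°) ≈ 0.866.
a, b = np.array([10., 200.]), np.array([40., 170.])
print(np.round(np.sum(deg2bit(a) * deg2bit(b), axis=1), 4))
print(np.round(np.cos(np.deg2rad(a - b)), 4))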
In [63]:
nets_linreg_bt_cos = [mknet(df.Linear(512, 2, initW=df.init.normal(0.01)), Biternion()) for _ in range(5)]
print('{:.3f}M params'.format(count_params(nets_linreg_bt_cos[0])/1000000))
In [64]:
trains_linreg_bt_cos = [dotrain(net, CosineCriterion(), aug, Xtr, deg2bit(ytr)) for net in nets_linreg_bt_cos]
In [65]:
for model in nets_linreg_bt_cos:
dostats(model, aug, Xtr, batchsize=1000)
In [66]:
y_preds_linreg_bt_cos = [bit2deg(dopred_deg(net, aug, Xte)) for net in nets_linreg_bt_cos]
In [67]:
show_errs_deg(y_preds_linreg_bt_cos, yte)
In [68]:
fig, axes = plt.subplots(1, 2, figsize=(8,4))
hm_pred = cyclic_filter(mkheatmap_deg(np.concatenate(y_preds_linreg_bt_cos), nbins=3600)/5, gaussfilter(41))
hm_real = cyclic_filter(mkheatmap_deg(yte, nbins=3600), gaussfilter(41))
donut(axes[0], hm_pred/(len(yte)/400), bg=(201, 201), R=50, aapow=40);
donut(axes[1], hm_real/(len(yte)/400), bg=(201, 201), R=50, aapow=40);
In [69]:
class VonMisesBiternionCriterion(df.Criterion):
def __init__(self, kappa):
df.Criterion.__init__(self)
self.kappa = kappa
def symb_forward(self, symb_in, symb_tgt):
cos_angles = df.T.batched_dot(symb_in, symb_tgt)
# This is the only difference to the pure `CosineCriterion`.
# Obviously, they could be in the same class, but I separate them here for narration.
cos_angles = df.T.exp(self.kappa * (cos_angles - 1))
return df.T.mean(1 - cos_angles)
In [70]:
nets_linreg_bt_vm = [mknet(df.Linear(512, 2, initW=df.init.normal(0.01)), Biternion()) for _ in range(5)]
print('{:.3f}M params'.format(count_params(nets_linreg_bt_vm[0])/1000000))
In [71]:
trains_linreg_bt_vm = [dotrain(net, VonMisesBiternionCriterion(1), aug, Xtr, deg2bit(ytr)) for net in nets_linreg_bt_vm]
In [72]:
for model in nets_linreg_bt_vm:
dostats(model, aug, Xtr, batchsize=1000)
In [73]:
y_preds_linreg_bt_vm = [bit2deg(dopred_deg(net, aug, Xte)) for net in nets_linreg_bt_vm]
In [74]:
show_errs_deg(y_preds_linreg_bt_vm, yte)
In [75]:
fig, axes = plt.subplots(1, 2, figsize=(8,4))
hm_pred = cyclic_filter(mkheatmap_deg(np.concatenate(y_preds_linreg_bt_vm), nbins=3600)/5, gaussfilter(41))
hm_real = cyclic_filter(mkheatmap_deg(yte, nbins=3600), gaussfilter(41))
donut(axes[0], hm_pred/(len(yte)/400), bg=(201, 201), R=50, aapow=40);
donut(axes[1], hm_real/(len(yte)/400), bg=(201, 201), R=50, aapow=40);
This second half of the notebook is about reproducing Table 4 of Section 5 of the paper, i.e. investigating how well-distributed an output we can learn from discrete ("weak") class labels. For this, we will quantize the TownCentre labels into various numbers of weak classes.
In [76]:
def quantize_labels(y, ranges):
q = np.empty(y.shape, dtype=np.int32)
for i in range(len(ranges)-1):
        # Handle the bin that wraps around the 0°/360° boundary.
if ranges[i] < ranges[i+1]:
q[(ranges[i] <= y) & (y < ranges[i+1])] = i
else:
q[(ranges[i] <= y) | (y < ranges[i+1])] = i
return q
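A small illustrative example (borders made up for this demo; the real ones are defined further down): four 90° bins, where the first bin wraps around the 0°/360° boundary and therefore exercises the else branch.
In [ ]:
demo_borders = np.array([315., 45., 135., 225., 315.])
print(quantize_labels(np.array([0., 10., 350., 90., 180., 300.]), demo_borders))
# -> [0 0 0 1 2 3]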
The following function simply does softmax prediction, i.e. returns class-probabilities for each sample.
In [77]:
def dopred_sm(model, aug, X, batchsize=100):
return dopred(model, aug, X, ensembling=lambda p_ys: np.mean(p_ys, axis=0), output2preds=lambda x: x, batchsize=batchsize)
Convert class-probabilities as returned by dopred_sm into angle predictions in degrees by simply using the class-centre.
In [78]:
def probs2deg_centre(p_y, centres):
return centres[np.argmax(p_y, axis=-1)]
Convert class-probabilities as returned by dopred_sm into angle predictions in degrees by quadratically interpolating between the highest-probability class and its two neighbors, and then predicting the max of that parabola.
In [79]:
# http://stackoverflow.com/questions/717762/how-to-calculate-the-vertex-of-a-parabola-given-three-points
def parabola_vertex(x, y):
x1, x2, x3 = x
y1, y2, y3 = y
denom = (x1 - x2) * (x1 - x3) * (x2 - x3)
A = (x3 * (y2 - y1) + x2 * (y1 - y3) + x1 * (y3 - y2)) / denom
B = (x3 * x3 * (y1 - y2) + x2 * x2 * (y3 - y1) + x1 * x1 * (y2 - y3)) / denom
C = (x2 * x3 * (x2 - x3) * y1 + x3 * x1 * (x3 - x1) * y2 + x1 * x2 * (x1 - x2) * y3) / denom
return -B / (2*A), C - B*B / (4*A)
#parabola_vertex([1, 0, -1], [1, 2, 1])  # -> (0.0, 2.0)
In [80]:
def prob2deg_quadint(p_y, centres):
i = np.argmax(p_y)
if i == 0:
return parabola_vertex([centres[-1] - 360, centres[0], centres[1]], p_y[[-1, 0, 1]])
elif i == len(centres)-1:
return parabola_vertex([centres[-2], centres[-1], centres[0] + 360], p_y[[-2, -1, 0]])
else:
return parabola_vertex(centres[[i-1, i, i+1]], p_y[[i-1, i, i+1]])
def probs2deg_quadint(preds, centres):
return np.array([prob2deg_quadint(p, centres)[0] for p in preds])
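A small illustrative example (made-up centres and probabilities): with probabilities [0.2, 0.5, 0.3] over bin centres 0°, 120° and 240°, the interpolated prediction lands at 132°, i.e. it is pulled away from the winning centre towards the more probable neighbour.
In [ ]:
demo_centres = np.array([0., 120., 240.])
print(probs2deg_quadint(np.array([[0.2, 0.5, 0.3]]), demo_centres))  # -> ~[132.]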
In [81]:
q3_borders = np.array([0, 120, 240, 361], dtype=df.floatX)
q3_centres = np.array([ 60, 180, 300 ], dtype=df.floatX)
In [82]:
nets_q3_sm = [mknet(df.Linear(512, 3, initW=df.init.const(0)), df.SoftMax()) for _ in range(5)]
trains_q3_sm = [dotrain(net, df.ClassNLLCriterion(), aug, Xtr, quantize_labels(ytr, q3_borders)) for net in nets_q3_sm]
for model in nets_q3_sm:
dostats(model, aug, Xtr, batchsize=1000)
y_preds_q3_sm = [dopred_sm(net, aug, Xte) for net in nets_q3_sm]
In [83]:
show_errs_deg(probs2deg_centre(y_preds_q3_sm, q3_centres), yte)
In [84]:
show_errs_deg([probs2deg_quadint(p_y, q3_centres) for p_y in y_preds_q3_sm], yte)
In [85]:
nets_q3_lr = [mknet(df.Linear(512, 1, initW=df.init.const(0))) for _ in range(5)]
trains_q3_lr = [dotrain(net, df.MADCriterion(), aug, Xtr, q3_centres[quantize_labels(ytr, q3_borders),None]) for net in nets_q3_lr]
for model in nets_q3_lr:
dostats(model, aug, Xtr, batchsize=1000)
y_preds_q3_lr = [dopred_deg(net, aug, Xte) for net in nets_q3_lr]
In [86]:
show_errs_deg(y_preds_q3_lr, yte[:,None])
In [87]:
nets_q3_lr_vm = [mknet(df.Linear(512, 1, initW=df.init.const(0))) for _ in range(5)]
trains_q3_lr_vm = [dotrain(net, VonMisesCriterion(1, radians=False), aug, Xtr, q3_centres[quantize_labels(ytr, q3_borders),None]) for net in nets_q3_lr_vm]
for model in nets_q3_lr_vm:
dostats(model, aug, Xtr, batchsize=1000)
y_preds_q3_lr_vm = [dopred_deg(net, aug, Xte) for net in nets_q3_lr_vm]
In [88]:
show_errs_deg(y_preds_q3_lr_vm, yte[:,None])
In [89]:
nets_q3_bt = [mknet(df.Linear(512, 2, initW=df.init.normal(0.01)), Biternion()) for _ in range(5)]
trains_q3_bt = [dotrain(net, CosineCriterion(), aug, Xtr, deg2bit(q3_centres[quantize_labels(ytr, q3_borders)])) for net in nets_q3_bt]
for model in nets_q3_bt:
dostats(model, aug, Xtr, batchsize=1000)
y_preds_q3_bt = [bit2deg(dopred_deg(net, aug, Xte)) for net in nets_q3_bt]
In [90]:
show_errs_deg(y_preds_q3_bt, yte)
In [91]:
nets_q3_bt_vm = [mknet(df.Linear(512, 2, initW=df.init.normal(0.01)), Biternion()) for _ in range(5)]
trains_q3_bt_vm = [dotrain(net, VonMisesBiternionCriterion(1), aug, Xtr, deg2bit(q3_centres[quantize_labels(ytr, q3_borders)])) for net in nets_q3_bt_vm]
for model in nets_q3_bt_vm:
dostats(model, aug, Xtr, batchsize=1000)
y_preds_q3_bt_vm = [bit2deg(dopred_deg(net, aug, Xte)) for net in nets_q3_bt_vm]
In [92]:
show_errs_deg(y_preds_q3_bt_vm, yte)
I don't want to clutter the remainder of this notebook with near-identical cells and plots where only the number of bins changes, so here they are in a loop. Note that we can't keep all the networks in memory or we'd run out of GPU memory relatively fast, so I'll only keep track of the training curves and the predictions.
In [ ]:
def train_stats(net, crit, aug, Xtr, ytr, title):
r = dotrain(net, crit, aug, Xtr, ytr, title=title)
dostats(net, aug, Xtr, batchsize=100)
return r
In [ ]:
trains = {}
preds = {}
centres = {}
borders = {}
for name, bord, cent in (
('4x', [315, 45, 135, 225, 315],
[ 0, 90, 180, 270 ]),
('4p', [0, 90, 180, 270, 361],
[ 45, 135, 225, 315 ]),
('6x', [330, 30, 90, 150, 210, 270, 330],
[ 0, 60, 120, 180, 240, 300 ]),
('8x', [337.5, 22.5, 67.5, 112.5, 157.5, 202.5, 247.5, 292.5, 337.5],
[ 0, 45, 90, 135, 180, 225, 270, 315, ]),
('8p', [0, 45, 90, 135, 180, 225, 270, 315, 361],
[ 22.5, 67.5, 112.5, 157.5, 202.5, 247.5, 292.5, 337.5 ]),
('10x',[342, 18, 54, 90, 126, 162, 198, 234, 270, 306, 342],
[ 0, 36, 72, 108, 144, 180, 216, 252, 288, 324 ]),
('12x',[345, 15, 45, 75, 105, 135, 165, 195, 225, 255, 285, 315, 345],
[ 0, 30, 60, 90, 120, 150, 180, 210, 240, 270, 300, 330 ]),
):
print(name)
cent = centres[name] = np.array(cent, df.floatX)
bord = borders[name] = np.array(bord, df.floatX)
qtr = quantize_labels(ytr, bord)
trains[name] = {}
preds[name] = {}
# Softmax.
nets = [mknet(df.Linear(512, len(cent), initW=df.init.const(0)), df.SoftMax()) for _ in range(5)]
trains[name]['sm'] = [train_stats(net, df.ClassNLLCriterion(), aug, Xtr, qtr, "Softmax " + name) for net in nets]
preds[name]['sm'] = [dopred_sm(net, aug, Xte) for net in nets]
# Linear regression.
nets = [mknet(df.Linear(512, 1, initW=df.init.const(0))) for _ in range(5)]
trains[name]['lr'] = [train_stats(net, df.MADCriterion(), aug, Xtr, cent[qtr,None], "LinReg " + name) for net in nets]
preds[name]['lr'] = [dopred_deg(net, aug, Xte) for net in nets]
# Linear regression + von-Mises.
nets = [mknet(df.Linear(512, 1, initW=df.init.const(0))) for _ in range(5)]
trains[name]['lr_vm'] = [train_stats(net, VonMisesCriterion(1, radians=False), aug, Xtr, cent[qtr,None], "LinReg-vM " + name) for net in nets]
preds[name]['lr_vm'] = [dopred_deg(net, aug, Xte) for net in nets]
# Biternions.
nets = [mknet(df.Linear(512, 2, initW=df.init.normal(0.01)), Biternion()) for _ in range(5)]
trains[name]['bt'] = [train_stats(net, CosineCriterion(), aug, Xtr, deg2bit(cent[qtr]), "BitReg " + name) for net in nets]
preds[name]['bt'] = [bit2deg(dopred_deg(net, aug, Xte)) for net in nets]
# Biternions + von-Mises.
nets = [mknet(df.Linear(512, 2, initW=df.init.normal(0.01)), Biternion()) for _ in range(5)]
trains[name]['bt_vm'] = [train_stats(net, VonMisesBiternionCriterion(1), aug, Xtr, deg2bit(cent[qtr]), "BitReg-vM " + name) for net in nets]
preds[name]['bt_vm'] = [bit2deg(dopred_deg(net, aug, Xte)) for net in nets]
I don't have the time to make this into a pretty automatic table; the table is in the paper. Here are all the results, dumped in a not-so-nice-to-read form:
In [110]:
for name in ('4x', '4p', '6x', '8x', '8p', '10x', '12x'):
pre = preds[name]
print()
print(name)
print("=" * len(name))
print()
print("Softmax centre prediction")
show_errs_deg(probs2deg_centre(pre['sm'], centres[name]), yte)
print()
print("Softmax interpolated prediction")
show_errs_deg([probs2deg_quadint(p, centres[name]) for p in pre['sm']], yte)
print()
print("Deep linear regression")
show_errs_deg(pre['lr'], yte[:,None])
print()
print("Deep linear von-Mises regression")
show_errs_deg(pre['lr_vm'], yte[:,None])
print()
print("Deep biternion regression")
show_errs_deg(pre['bt'], yte)
print()
print("Deep biternion von-Mises regression")
show_errs_deg(pre['bt_vm'], yte)
Also, as reported in the paper, the numbers don't do justice to the prediction quality. The softmaxes have good scores, but their output is really "stuck" at the bin-centres. We can see this using the heatmaps:
In [158]:
mkpreds = {
'sm': lambda name: np.concatenate(probs2deg_centre(preds[name]['sm'], centres[name])),
'sm_int': lambda name: np.concatenate([probs2deg_quadint(p, centres[name]) for p in preds[name]['sm']]),
'lr': lambda name: np.concatenate(preds[name]['lr'])[:,0],
'lr_vm': lambda name: np.concatenate(preds[name]['lr_vm'])[:,0],
'bt': lambda name: np.concatenate(preds[name]['bt']),
'bt_vm': lambda name: np.concatenate(preds[name]['bt_vm']),
}
titles = {
'sm': 'Softmax centre ({})',
'sm_int': 'Softmax interpolation ({})',
'lr': 'Linear regression ({})',
'lr_vm': 'Linear von-Mises ({})',
'bt': 'Biternion regression ({})',
'bt_vm': 'Biternion von-Mises ({})',
}
In [159]:
from itertools import product

fig, axes = plt.subplots(6, 4, figsize=(4*4,6*4))
for (kind, name), ax in zip(product(('sm', 'sm_int', 'lr', 'lr_vm', 'bt', 'bt_vm'), ('4x', '6x', '8x', '12x')), axes.flat):
hm = cyclic_filter(mkheatmap_deg(mkpreds[kind](name), nbins=3600)/5, gaussfilter(41))
ax.set_title(titles[kind].format(name), fontsize=12)
donut(ax, hm/(len(yte)/400), bg=(201, 201), R=50, aapow=40);
This pretty clearly shows that both biternions and von-Mises help with cyclic predictions. It also shows that 4 bins are not enough, in any case!
Finally, the "linear regression" plots again show why linear regression of a circular value doesn't make sense!