Even though error rates are low, creating transition matrices from predicted labels gives very different results from the same matrices created from ground truth labels.
Why?
vak.core.predict currently does not use the same function that vak.core.learncurve.test uses to find segments from predicted timebin labels. The vak.core.predict function is more computationally expensive, because it also finds the times of segment onsets and offsets, while the vak.core.learncurve.test function just finds wherever the labels change and returns the first label after each change point (which will be the same for the rest of that segment).
So the worst-case scenario would be if those two functions give different results. There are already tests for this, but maybe they miss something that only emerges from bigger datasets.
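To make the difference concrete, here is a rough sketch of the two approaches on a toy labeled-timebin vector; this is only an illustration of the idea, not vak's actual implementation (the real functions also map integer labels back to characters and handle the "unlabeled" background class).
import numpy as np

lbl_tb = np.asarray([0, 0, 1, 1, 1, 0, 2, 2, 0, 0])  # toy vector of labeled timebins
timebin_dur = 0.002  # toy duration of one time bin, in seconds

# "test"-style approach: find change points, keep the first label after each one
changes = np.where(np.diff(lbl_tb) != 0)[0] + 1
segment_starts = np.concatenate(([0], changes))
segment_labels = lbl_tb[segment_starts]  # -> [0, 1, 0, 2, 0]

# "predict"-style approach: additionally convert change points to onset/offset times
onsets = segment_starts * timebin_dur
offsets = np.concatenate((changes, [lbl_tb.shape[0]])) * timebin_dur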
In [1]:
from configparser import ConfigParser
from glob import glob
import json
import os
from pathlib import Path
import shutil
import joblib
import numpy as np
import tensorflow as tf
import tqdm
import vak
In [2]:
VDS_PATH = Path(
'/home/nickledave/Documents/data/BFSongRepository/vak/gy6or6/'
)
In [3]:
train_vds_path = str(VDS_PATH.joinpath('_prep_190726_153000.train.vds.json'))
In [4]:
train_vds = vak.Dataset.load(json_fname=train_vds_path)
if train_vds.are_spects_loaded() is False:
train_vds = train_vds.load_spects()
X_train = train_vds.spects_list()
X_train = np.concatenate(X_train, axis=1)
Y_train = train_vds.lbl_tb_list()
Y_train = np.concatenate(Y_train)
# transpose so rows are time bins
X_train = X_train.T
n_classes = len(train_vds.labelmap)
print(n_classes)
In [5]:
TWEETYNET_VDS_PATH = Path('/home/nickledave/Documents/repos/tweetynet/data/BFSongRepository/gy6or6/vds')
In [6]:
test_vds_path = list(TWEETYNET_VDS_PATH.glob('*test.vds.json'))[0]
In [7]:
num_replicates = 4
train_set_durs = [60, 120, 480]
In [8]:
test_vds = vak.Dataset.load(json_fname=test_vds_path)
if test_vds.are_spects_loaded() is False:
test_vds = test_vds.load_spects()
if test_vds.labelmap != train_vds.labelmap:
raise ValueError(
f'labelmap of test set, {test_vds.labelmap}, does not match labelmap of training set, '
f'{train_vds.labelmap}'
)
def unpack_test():
"""helper function because we want to get back test set unmodified every time we go through
main loop below, without copying giant arrays"""
X_test = test_vds.spects_list()
X_test = np.concatenate(X_test, axis=1)
# transpose so rows are time bins
X_test = X_test.T
Y_test = test_vds.lbl_tb_list()
Y_test = np.concatenate(Y_test)
return X_test, Y_test
# just get X_test to make sure it has the right shape
X_test, _ = unpack_test()
if X_train.shape[-1] != X_test.shape[-1]:
raise ValueError(f'Number of frequency bins in training set spectrograms, {X_train.shape[-1]}, '
f'does not equal number in test set spectrograms, {X_test.shape[-1]}.')
freq_bins = X_test.shape[-1] # number of columns
# concatenate labels into one big string
# used for Levenshtein distance + syllable error rate
Y_train_labels = [voc.annot.labels.tolist() for voc in train_vds.voc_list]
Y_train_labels_for_lev = ''.join([chr(lbl) if type(lbl) is int else lbl
for labels in Y_train_labels for lbl in labels])
Y_test_labels = [voc.annot.labels.tolist() for voc in test_vds.voc_list]
Y_test_labels_for_lev = ''.join([chr(lbl) if type(lbl) is int else lbl
for labels in Y_test_labels for lbl in labels])
replicates = range(1, num_replicates + 1)
NETWORKS = vak.network._load()
In [10]:
config_path = str(
'/home/nickledave/Documents/repos/tweetynet/src/configs/config_BFSongRepository_gy6or6_.ini'
)
In [11]:
a_config = vak.config.parse_config(config_path)
In [12]:
train_set_dur = 60
replicate = 1
In [13]:
training_records_path = '/home/nickledave/Documents/data/BFSongRepository/vak/gy6or6/results_190726_153021'
In [16]:
spect_scaler = joblib.load(
os.path.join(training_records_path, 'spect_scaler'))
In [17]:
(net_name, net_config) = tuple(a_config.networks.items())[0]
In [18]:
X_test, Y_test = unpack_test()
# Normalize before reshaping to avoid even more convoluted array reshaping.
X_test = spect_scaler.transform(X_test)
In [19]:
# Notice we don't reshape Y_test
(X_test,
_,
num_batches_test) = vak.utils.data.reshape_data_for_batching(
X_test,
net_config.batch_size,
net_config.time_bins,
Y_test)
In [20]:
net_config_dict = net_config._asdict()
net_config_dict['n_syllables'] = n_classes
if 'freq_bins' in net_config_dict:
net_config_dict['freq_bins'] = freq_bins
In [21]:
results_dirname_this_net = os.path.join(training_records_path, net_name)
In [22]:
net = NETWORKS[net_name](**net_config_dict)
# we use latest checkpoint when doing summary for learncurve, assume that's "best trained"
checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir=results_dirname_this_net)
meta_file = glob(checkpoint_file + '*meta')
if len(meta_file) != 1:
raise ValueError('Incorrect number of meta files for last saved checkpoint.\n'
'For checkpoint {}, found these files:\n'
'{}'
.format(checkpoint_file, meta_file))
else:
meta_file = meta_file[0]
data_file = glob(checkpoint_file + '*data*')
if len(data_file) != 1:
raise ValueError('Incorrect number of data files for last saved checkpoint.\n'
'For checkpoint {}, found these files:\n'
'{}'
.format(checkpoint_file, data_file))
else:
data_file = data_file[0]
with tf.Session(graph=net.graph) as sess:
tf.logging.set_verbosity(tf.logging.ERROR)
net.restore(sess=sess,
meta_file=meta_file,
data_file=data_file)
for b in range(num_batches_test): # "b" is "batch number"
d = {
net.X: X_test[:, b * net_config_dict['time_bins']: (b + 1) * net_config_dict['time_bins'], :],
net.lng: [net_config_dict['time_bins']] * net_config_dict['batch_size']}
if 'Y_pred_test' in locals():
preds = sess.run(net.predict, feed_dict=d)
preds = preds.reshape(net_config_dict['batch_size'], -1)
Y_pred_test = np.concatenate((Y_pred_test, preds),
axis=1)
else:
Y_pred_test = sess.run(net.predict, feed_dict=d)
Y_pred_test = Y_pred_test.reshape(net_config_dict['batch_size'], -1)
# again get rid of zero padding predictions
Y_pred_test = Y_pred_test.ravel()[:Y_test.shape[0], np.newaxis]
test_err = np.sum(Y_pred_test != Y_test) / Y_test.shape[0]
In [23]:
Y_pred_test
Out[23]:
In [24]:
Y_test_lbl_tb_list = test_vds.lbl_tb_list()
Get the lengths of each of the individual labeled timebin vectors for each spectrogram, so we can split Y_pred_test up into vectors of the same sizes below.
In [25]:
Y_test_lens = [arr.shape for arr in Y_test_lbl_tb_list]
But before we split them up, let's answer the question we asked above: how different is the output of lbl_tb2segments (used by vak.core.predict) compared to the output of lbl_tb2labels (used by vak.core.learncurve.test)?
First of all: do they return vectors of the same length?
In [26]:
Y_pred_test_seg = vak.utils.labels.lbl_tb2labels(Y_pred_test, train_vds.labelmap)
In [27]:
len(Y_pred_test_seg)
Out[27]:
In [28]:
timebin_dur = set([voc.metaspect.timebin_dur for voc in train_vds.voc_list])
timebin_dur = timebin_dur.pop()
In [29]:
Y_pred_test_lbl, onsets, offsets = vak.utils.labels.lbl_tb2segments(Y_pred_test,
train_vds.labelmap,
timebin_dur)
In [30]:
Y_pred_test_lbl.shape
Out[30]:
Yes, the vectors returned by each function are the same length.
Okay, what is the edit distance between them?
If it's 0, they're identical.
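As a reminder, edit (Levenshtein) distance is the minimum number of single-character insertions, deletions, and substitutions needed to turn one string into the other. Here is the textbook dynamic-programming version, shown only as a reference point, not necessarily how vak.metrics.levenshtein is implemented:
def edit_distance(a, b):
    """textbook Levenshtein distance, for reference only"""
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, start=1):
        curr = [i]
        for j, cb in enumerate(b, start=1):
            curr.append(min(prev[j] + 1,                 # deletion
                            curr[j - 1] + 1,             # insertion
                            prev[j - 1] + (ca != cb)))   # substitution
        prev = curr
    return prev[-1]

assert edit_distance('abc', 'abc') == 0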
In [31]:
Y_pred_test_lbl_str = ''.join(Y_pred_test_lbl.tolist())
In [32]:
vak.metrics.levenshtein(Y_pred_test_seg, Y_pred_test_lbl_str)
Out[32]:
To be extra sure:
In [33]:
Y_pred_test_seg == Y_pred_test_lbl_str
Out[33]:
Okay, so that's not the problem -- we're getting the same result, for all intents and purposes, from test and predict.
So even though error is low, maybe we're not recovering the same segments from predict that we have in the test set?
To figure that out, we need to go ahead and split up Y_pred_test into labeled timebin vectors of the same size as those in the original test set, segment each vector, and then look at the segments we get out.
In [34]:
starts = [0]
stops = []
current_start = 0
for a_len in Y_test_lens:
a_len = a_len[0]
stops.append(current_start + a_len)
current_start += a_len
if current_start < Y_test.shape[0]:
starts.append(current_start)
In [35]:
Y_pred_lbl_tb_list = []
for start, stop in zip(starts, stops):
Y_pred_lbl_tb_list.append(Y_pred_test[start:stop])
In [36]:
Y_pred_lens = [arr.shape for arr in Y_pred_lbl_tb_list]
In [37]:
all([pred_len == test_len for pred_len, test_len in zip(Y_pred_lens, Y_test_lens)])
Out[37]:
In [38]:
Y_pred_labels = []
Y_pred_onsets = []
Y_pred_offsets = []
for a_pred_lbl_tb in Y_pred_lbl_tb_list:
lbl, on, off = vak.utils.labels.lbl_tb2segments(a_pred_lbl_tb, train_vds.labelmap, timebin_dur)
Y_pred_labels.append(lbl)
Y_pred_onsets.append(on)
Y_pred_offsets.append(off)
In [39]:
Y_pred_labels[0]
Out[39]:
In [40]:
Y_pred_labels[0].shape
Out[40]:
In [41]:
Y_test_labels_from_seg = []
Y_test_onsets = []
Y_test_offsets = []
for a_test_lbl_tb in Y_test_lbl_tb_list:
lbl, on, off = vak.utils.labels.lbl_tb2segments(a_test_lbl_tb, train_vds.labelmap, timebin_dur)
Y_test_labels_from_seg.append(lbl)
Y_test_onsets.append(on)
Y_test_offsets.append(off)
In [42]:
Y_test_labels_from_seg[0]
Out[42]:
In [43]:
Y_test_labels_from_seg[0].shape
Out[43]:
In [44]:
len(Y_test_labels[0])
Out[44]:
At least for the first vector, there are more segments in the predicted labels.
These could be segments that are not in the ground-truth labels because the person annotating the song removed them.
As a sanity check, do we recover the ground truth labels if we apply vak.utils.labels.lbl_tb2segments to the ground truth label vector?
In [46]:
np.array_equal(Y_test_labels[0], Y_test_labels_from_seg[0])
Out[46]:
Yes, we do.
So, yes, we're getting extra segments in our predictions somewhere.
How frequent is this?
In [48]:
same_lengths = [Y_pred_seg.shape == Y_test_seg.shape for Y_pred_seg, Y_test_seg in zip(Y_pred_labels, Y_test_labels_from_seg)]
In [49]:
len_acc = sum(same_lengths) / len(same_lengths)
print(f'% with accurate length: {len_acc: 0.4f}')
Only about 3% of them are the right length.
So what if we subtract the number of segments in the predicted labels from the number in the ground truth labels?
If the number is negative, there are more segments in the predicted labels.
In [51]:
length_diffs = [Y_test_seg.shape[0] - Y_pred_seg.shape[0] for Y_pred_seg, Y_test_seg in zip(Y_pred_labels, Y_test_labels_from_seg)]
In [52]:
print(length_diffs[:5])
In [53]:
np.mean(length_diffs)
Out[53]:
Yes, there are more segments in the predicted labels.
Two approaches to cleaning up:
(1) remove segments shorter than a certain duration (sketched below)
(2) remove segments based on syntax
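Here is a minimal sketch of approach (1), assuming numpy arrays of labels, onsets, and offsets (in seconds) like the ones returned by lbl_tb2segments, and a minimum duration in milliseconds like the min_dur value loaded from the .not.mat annotation files below. filter_by_min_dur is a hypothetical helper written for illustration, not a vak function.
def filter_by_min_dur(labels, onsets_s, offsets_s, min_dur_ms):
    """drop segments shorter than a minimum duration (hypothetical helper, not part of vak)"""
    durs_ms = (offsets_s - onsets_s) * 1000  # convert seconds to milliseconds
    keep = durs_ms >= min_dur_ms
    return labels[keep], onsets_s[keep], offsets_s[keep]

# e.g., once min_dur is loaded below:
# filter_by_min_dur(Y_pred_labels[0], Y_pred_onsets[0], Y_pred_offsets[0], min_dur)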
In [54]:
from scipy.io import loadmat
from glob import glob
In [55]:
cd ~/Documents/data/BFSongRepository/gy6or6/032212/
In [56]:
notmats = glob('*.not.mat')
In [57]:
notmat0 = loadmat(notmats[0], squeeze_me=True)
In [58]:
min_dur = notmat0['min_dur']
Visually inspect onsets from the first song in the test set to compare with the predicted onsets.
In [59]:
Y_test_onsets[0]
Out[59]:
In [60]:
Y_pred_onsets[0]
Out[60]:
Okay, there are a couple of extra predicted onsets.
How many of the predicted segments are shorter than the minimum syllable duration we used when segmenting?
In [75]:
durs_test_0 = (Y_test_offsets[0] - Y_test_onsets[0]) * 1000  # durations in s converted to ms, to compare with min_dur
print(durs_test_0)
print("number of segments with duration less than minimum syllable duration used to segment: ", np.sum(durs_test_0 < min_dur))
In [77]:
durs_pred_0 = (Y_pred_offsets[0] - Y_pred_onsets[0]) * 1000  # durations in s converted to ms, to compare with min_dur
print(durs_pred_0)
print("number of segments with duration less than minimum syllable duration used to segment: ", np.sum(durs_pred_0 < min_dur))
More than a couple among the predicted segments. What about across all of the predicted arrays?
In [66]:
durs_pred = []
lt_min_dur = []
for off, on in zip(Y_pred_offsets, Y_pred_onsets):
durs = (off - on) * 1000
durs_pred.append(durs)
lt_min_dur.append(np.sum(durs < min_dur))
In [67]:
print(lt_min_dur)
Okay and how does that compare to the number of extra segments in each predicted labels array (regardless of whether the segments are less than the minimum duration)?
In [68]:
num_extra = []
for Y_pred_seg, Y_test_seg in zip(Y_pred_labels, Y_test_labels_from_seg):
num_extra.append(Y_pred_seg.shape[0]-Y_test_seg.shape[0])
In [70]:
print(num_extra)
Hmm, looks similar.
So what if we filtered out all the segments less than the minimum duration?
In [71]:
num_extra_minus_num_lt_min = [extra - lt_dur for extra, lt_dur in zip(num_extra, lt_min_dur)]
In [72]:
print(num_extra_minus_num_lt_min)
In [79]:
np.asarray(num_extra_minus_num_lt_min).mean()
Out[79]:
Looks like we'd do a lot better overall, although in a couple of cases we'd end up with fewer segments than there are syllables in the test set (?)