the problem

Even though error rates are low, creating transition matrices from predicted labels gives very different results from the same matrices created from ground truth labels.

Why?

vak.core.predict currently does not use the same function that vak.core.learncurve.test uses to find segments from predicted timebin labels. The vak.core.predict function is more computationally expensive because it finds times of onsets and offsets, while the vak.core.learncurve.test function just finds wherever labels change and returning the first label after each change point (which will be the same for the rest of the segment).

So worst case scenario would be if those functions give different results. There are tests for this already but maybe they are missing something that only emerges from bigger datasets.

load a network and get predictions

you can ignore most of this code and scroll to comments below


In [1]:
from configparser import ConfigParser
from glob import glob
import json
import os
from pathlib import Path
import shutil

import joblib
import numpy as np
import tensorflow as tf
import tqdm

import vak

In [2]:
VDS_PATH = Path(
    '/home/nickledave/Documents/data/BFSongRepository/vak/gy6or6/'
)

In [3]:
train_vds_path = str(VDS_PATH.joinpath('_prep_190726_153000.train.vds.json'))

In [4]:
train_vds = vak.Dataset.load(json_fname=train_vds_path)

if train_vds.are_spects_loaded() is False:
    train_vds = train_vds.load_spects()

X_train = train_vds.spects_list()
X_train = np.concatenate(X_train, axis=1)
Y_train = train_vds.lbl_tb_list()
Y_train = np.concatenate(Y_train)
# transpose so rows are time bins
X_train = X_train.T

n_classes = len(train_vds.labelmap)
print(n_classes)


[########################################] | 100% Completed |  0.5s
12

In [5]:
TWEETYNET_VDS_PATH = Path('/home/nickledave/Documents/repos/tweetynet/data/BFSongRepository/gy6or6/vds')

In [6]:
test_vds_path = list(TWEETYNET_VDS_PATH.glob('*test.vds.json'))[0]

In [7]:
num_replicates = 4
train_set_durs = [60, 120, 480]

In [8]:
test_vds = vak.Dataset.load(json_fname=test_vds_path)

if test_vds.are_spects_loaded() is False:
    test_vds = test_vds.load_spects()

if test_vds.labelmap != train_vds.labelmap:
    raise ValueError(
        f'labelmap of test set, {test_vds.labelmap}, does not match labelmap of training set, '
        f'{train_vds.labelmap}'
    )

def unpack_test():
    """helper function because we want to get back test set unmodified every time we go through
    main loop below, without copying giant arrays"""
    X_test = test_vds.spects_list()
    X_test = np.concatenate(X_test, axis=1)
    # transpose so rows are time bins
    X_test = X_test.T
    Y_test = test_vds.lbl_tb_list()
    Y_test = np.concatenate(Y_test)
    return X_test, Y_test

# just get X_test to make sure it has the right shape
X_test, _ = unpack_test()
if X_train.shape[-1] != X_test.shape[-1]:
    raise ValueError(f'Number of frequency bins in training set spectrograms, {X_train.shape[-1]}, '
                     f'does not equal number in test set spectrograms, {X_test.shape[-1]}.')
freq_bins = X_test.shape[-1]  # number of columns

# concatenate labels into one big string
# used for Levenshtein distance + syllable error rate
Y_train_labels = [voc.annot.labels.tolist() for voc in train_vds.voc_list]
Y_train_labels_for_lev = ''.join([chr(lbl) if type(lbl) is int else lbl
                                  for labels in Y_train_labels for lbl in labels])
Y_test_labels = [voc.annot.labels.tolist() for voc in test_vds.voc_list]
Y_test_labels_for_lev = ''.join([chr(lbl) if type(lbl) is int else lbl
                                 for labels in Y_test_labels for lbl in labels])

replicates = range(1, num_replicates + 1)

NETWORKS = vak.network._load()


[########################################] | 100% Completed |  6.1s

In [9]:
# concatenate labels into one big string
# used for Levenshtein distance + syllable error rate
Y_train_labels = [voc.annot.labels.tolist() for voc in train_vds.voc_list]
Y_train_labels_for_lev = ''.join([chr(lbl) if type(lbl) is int else lbl
                                  for labels in Y_train_labels for lbl in labels])
Y_test_labels = [voc.annot.labels.tolist() for voc in test_vds.voc_list]
Y_test_labels_for_lev = ''.join([chr(lbl) if type(lbl) is int else lbl
                                 for labels in Y_test_labels for lbl in labels])

In [10]:
config_path = str(
    '/home/nickledave/Documents/repos/tweetynet/src/configs/config_BFSongRepository_gy6or6_.ini'
)

In [11]:
a_config = vak.config.parse_config(config_path)

In [12]:
train_set_dur = 60
replicate = 1

In [13]:
training_records_path = '/home/nickledave/Documents/data/BFSongRepository/vak/gy6or6/results_190726_153021'

In [16]:
spect_scaler = joblib.load(
    os.path.join(training_records_path, 'spect_scaler'))

In [17]:
(net_name, net_config) = tuple(a_config.networks.items())[0]

In [18]:
X_test, Y_test = unpack_test()
# Normalize before reshaping to avoid even more convoluted array reshaping.
X_test = spect_scaler.transform(X_test)

In [19]:
# Notice we don't reshape Y_test
(X_test,
 _,
 num_batches_test) = vak.utils.data.reshape_data_for_batching(
    X_test,
    net_config.batch_size,
    net_config.time_bins,
    Y_test)

In [20]:
net_config_dict = net_config._asdict()
net_config_dict['n_syllables'] = n_classes
if 'freq_bins' in net_config_dict:
    net_config_dict['freq_bins'] = freq_bins

In [21]:
results_dirname_this_net = os.path.join(training_records_path, net_name)

In [22]:
net = NETWORKS[net_name](**net_config_dict)

# we use latest checkpoint when doing summary for learncurve, assume that's "best trained"
checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir=results_dirname_this_net)

meta_file = glob(checkpoint_file + '*meta')
if len(meta_file) != 1:
    raise ValueError('Incorrect number of meta files for last saved checkpoint.\n'
                     'For checkpoint {}, found these files:\n'
                     '{}'
                     .format(checkpoint_file, meta_file))
else:
    meta_file = meta_file[0]

data_file = glob(checkpoint_file + '*data*')
if len(data_file) != 1:
    raise ValueError('Incorrect number of data files for last saved checkpoint.\n'
                     'For checkpoint {}, found these files:\n'
                     '{}'
                     .format(checkpoint_file, data_file))
else:
    data_file = data_file[0]

with tf.Session(graph=net.graph) as sess:
    tf.logging.set_verbosity(tf.logging.ERROR)

    net.restore(sess=sess,
                meta_file=meta_file,
                data_file=data_file)

    for b in range(num_batches_test):  # "b" is "batch number"
        d = {
            net.X: X_test[:, b * net_config_dict['time_bins']: (b + 1) * net_config_dict['time_bins'], :],
            net.lng: [net_config_dict['time_bins']] * net_config_dict['batch_size']}

        if 'Y_pred_test' in locals():
            preds = sess.run(net.predict, feed_dict=d)
            preds = preds.reshape(net_config_dict['batch_size'], -1)
            Y_pred_test = np.concatenate((Y_pred_test, preds),
                                         axis=1)
        else:
            Y_pred_test = sess.run(net.predict, feed_dict=d)
            Y_pred_test = Y_pred_test.reshape(net_config_dict['batch_size'], -1)

    # again get rid of zero padding predictions
    Y_pred_test = Y_pred_test.ravel()[:Y_test.shape[0], np.newaxis]
    test_err = np.sum(Y_pred_test != Y_test) / Y_test.shape[0]


WARNING:tensorflow:From /home/nickledave/anaconda3/envs/vak-env/lib/python3.6/site-packages/tweetynet/model.py:227: conv2d (from tensorflow.python.layers.convolutional) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.conv2d instead.
WARNING:tensorflow:From /home/nickledave/anaconda3/envs/vak-env/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.
WARNING:tensorflow:From /home/nickledave/anaconda3/envs/vak-env/lib/python3.6/site-packages/tweetynet/model.py:232: max_pooling2d (from tensorflow.python.layers.pooling) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.max_pooling2d instead.

WARNING: The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

WARNING:tensorflow:From /home/nickledave/anaconda3/envs/vak-env/lib/python3.6/site-packages/tweetynet/model.py:260: BasicLSTMCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
WARNING:tensorflow:From /home/nickledave/anaconda3/envs/vak-env/lib/python3.6/site-packages/tweetynet/model.py:270: bidirectional_dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
WARNING:tensorflow:From /home/nickledave/anaconda3/envs/vak-env/lib/python3.6/site-packages/tensorflow/python/ops/rnn.py:443: dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
WARNING:tensorflow:From /home/nickledave/anaconda3/envs/vak-env/lib/python3.6/site-packages/tensorflow/python/ops/rnn.py:626: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.

okay, now look at predictions -- does vak.test output match vak.predict?

We make sure Y_pred_test is an array.


In [23]:
Y_pred_test


Out[23]:
array([[9],
       [9],
       [9],
       ...,
       [0],
       [0],
       [0]], dtype=int32)

In [24]:
Y_test_lbl_tb_list = test_vds.lbl_tb_list()

Get the lengths of each of the individual labeled timebins vectors for each spectrogram, so we can split Y_pred_test up into vectors of the same sizes below.


In [25]:
Y_test_lens = [arr.shape for arr in Y_test_lbl_tb_list]

But before we split them up, answer the question we asked above:
how different is output of lbl_tb2segments (used by vak.core.predict) compared to output of lbl_tb2label (used by vak.core.learncurve.test)?

First of all:
do they return vectors of the same length?


In [26]:
Y_pred_test_seg = vak.utils.labels.lbl_tb2labels(Y_pred_test, train_vds.labelmap)

In [27]:
len(Y_pred_test_seg)


Out[27]:
12419

In [28]:
timebin_dur = set([voc.metaspect.timebin_dur for voc in train_vds.voc_list])
timebin_dur = timebin_dur.pop()

In [29]:
Y_pred_test_lbl, onsets, offsets = vak.utils.labels.lbl_tb2segments(Y_pred_test,
                                                                    train_vds.labelmap,
                                                                    timebin_dur)

In [30]:
Y_pred_test_lbl.shape


Out[30]:
(12419,)

Yes, vectors returned by each function are the same length.

Okay, what is the edit distance between them?
If 0, it's the same vector.


In [31]:
Y_pred_test_lbl_str = ''.join(Y_pred_test_lbl.tolist())

In [32]:
vak.metrics.levenshtein(Y_pred_test_seg, Y_pred_test_lbl_str)


Out[32]:
0

To be extra sure:


In [33]:
Y_pred_test_seg == Y_pred_test_lbl_str


Out[33]:
True

Okay, so that's not the problem -- we're getting the same result for all intents and purposes from test and predict.

if that's not the problem, what is?

So even though error is low, maybe we're not recovering the same segments from predict that we have in the test set?

To figure that out, we need to go ahead and split up Y_pred into labeled timebin vectors of the same size as those in the original test set, segment each vector, and then look at the segments we get out.


In [34]:
starts = [0]
stops = []
current_start = 0
for a_len in Y_test_lens:
    a_len = a_len[0]
    stops.append(current_start + a_len)
    current_start += a_len
    if current_start < Y_test.shape[0]:
        starts.append(current_start)

In [35]:
Y_pred_lbl_tb_list = []
for start, stop in zip(starts, stops):
    Y_pred_lbl_tb_list.append(Y_pred_test[start:stop])

In [36]:
Y_pred_lens = [arr.shape for arr in Y_pred_lbl_tb_list]

In [37]:
all([pred_len == test_len for pred_len, test_len in zip(Y_pred_lens, Y_test_lens)])


Out[37]:
True

In [38]:
Y_pred_labels = []
Y_pred_onsets = []
Y_pred_offsets = []
for a_pred_lbl_tb in Y_pred_lbl_tb_list:
    lbl, on, off = vak.utils.labels.lbl_tb2segments(a_pred_lbl_tb, train_vds.labelmap, timebin_dur)
    Y_pred_labels.append(lbl)
    Y_pred_onsets.append(on)
    Y_pred_offsets.append(off)

In [39]:
Y_pred_labels[0]


Out[39]:
array(['i', 'i', 'i', 'i', 'i', 'i', 'i', 'b', 'i', 'i', 'i', 'a', 'b',
       'c', 'd', 'e', 'e', 'e', 'f', 'g', 'h', 'j', 'k', 'i', 'a', 'b',
       'c', 'd', 'e', 'e', 'f', 'g', 'h', 'j', 'k', 'i', 'a', 'b', 'b',
       'c', 'd', 'e', 'e', 'f', 'g', 'h', 'j', 'k', 'i', 'a', 'b', 'c',
       'd', 'e', 'e', 'f'], dtype='<U1')

In [40]:
Y_pred_labels[0].shape


Out[40]:
(56,)

In [41]:
Y_test_labels_from_seg = []
Y_test_onsets = []
Y_test_offsets = []
for a_test_lbl_tb in Y_test_lbl_tb_list:
    lbl, on, off = vak.utils.labels.lbl_tb2segments(a_test_lbl_tb, train_vds.labelmap, timebin_dur)
    Y_test_labels_from_seg.append(lbl)
    Y_test_onsets.append(on)
    Y_test_offsets.append(off)

In [42]:
Y_test_labels_from_seg[0]


Out[42]:
array(['i', 'i', 'i', 'i', 'i', 'i', 'a', 'b', 'c', 'd', 'e', 'e', 'f',
       'g', 'h', 'j', 'k', 'i', 'a', 'b', 'c', 'd', 'e', 'e', 'f', 'g',
       'h', 'j', 'k', 'i', 'a', 'b', 'c', 'd', 'e', 'e', 'f', 'g', 'h',
       'j', 'k', 'i', 'a', 'b', 'c', 'd', 'e', 'e', 'f'], dtype='<U1')

In [43]:
Y_test_labels_from_seg[0].shape


Out[43]:
(49,)

In [44]:
len(Y_test_labels[0])


Out[44]:
49

At least for the first vector, there are more segments in the predicted labels.

These could be segments that are not in the ground-truth labels because the person annotating the song removed them.

As a sanity check, do we recover the ground truth labels if we apply vak.utils.lbl_tb2segments to the ground truth label vector?


In [46]:
np.array_equal(Y_test_labels[0], Y_test_labels_from_seg[0])


Out[46]:
True

Yes, we do.

So, yes, we're getting extra segments in our predictions somewhere.

How frequent is this?


In [48]:
same_lengths = [Y_pred_seg.shape == Y_test_seg.shape for Y_pred_seg, Y_test_seg in zip(Y_pred_labels, Y_test_labels_from_seg)]

In [49]:
len_acc = sum(same_lengths) / len(same_lengths)
print(f'% with accurate length: {len_acc: 0.4f}')


% with accurate length:  0.0357

Only about 3% of them are the right lengths

So what if we subtract the number of segments in the predicted labels from the number in the ground truth labels?
If the number is negative, there are more segments in the predicted labels.


In [51]:
length_diffs = [Y_test_seg.shape[0] - Y_pred_seg.shape[0] for Y_pred_seg, Y_test_seg in zip(Y_pred_labels, Y_test_labels_from_seg)]

In [52]:
print(length_diffs[:5])


[-7, -5, -5, -12, -4]

In [53]:
np.mean(length_diffs)


Out[53]:
-7.666666666666667

Yes, there are more segments in the predicted labels.

Two approaches to cleaning up:
(1) remove segments lower than a certain duration

  • this might help if all the spurious segments are shorter than typical syllables
  • it won't help though if e.g. calls are being labeled as syllables, and those calls would have been segments in the ground truth data, but the annotator removed those segments since they weren't syllables
  • problem: what label to give the segment to throw away? If silence on both sides (probably almost all cases) could just set to silence?

(2) remove segments based on syntax

  • throw away segments where label is below some threshold of ever occurring
  • this prevents us from doing an analysis where we ask if recovered original syntax, though
  • because of course we cover the original syntax if we use the original syntax to throw away things that don't match it
  • but I think this is a good way to show the work that actually needs to be done to get this to be useful in the lab, and highlights issues with previous work

In [54]:
from scipy.io import loadmat
from glob import glob

In [55]:
cd ~/Documents/data/BFSongRepository/gy6or6/032212/


/home/nickledave/Documents/data/BFSongRepository/gy6or6/032212

In [56]:
notmats = glob('*.not.mat')

In [57]:
notmat0 = loadmat(notmats[0], squeeze_me=True)

In [58]:
min_dur = notmat0['min_dur']

Visually inspecting onsets from first song in test set to compare with predicted onsets


In [59]:
Y_test_onsets[0]


Out[59]:
array([0.936, 1.1  , 1.292, 1.462, 1.636, 1.814, 1.972, 2.064, 2.166,
       2.236, 2.31 , 2.382, 2.458, 2.584, 2.704, 2.762, 2.834, 3.054,
       3.206, 3.3  , 3.408, 3.48 , 3.554, 3.628, 3.698, 3.83 , 3.968,
       4.022, 4.098, 4.322, 4.492, 4.584, 4.696, 4.772, 4.844, 4.92 ,
       4.994, 5.128, 5.27 , 5.324, 5.404, 5.624, 5.796, 5.892, 5.996,
       6.074, 6.148, 6.222, 6.298])

In [60]:
Y_pred_onsets[0]


Out[60]:
array([0.   , 0.348, 0.392, 0.924, 1.1  , 1.294, 1.462, 1.466, 1.468,
       1.636, 1.814, 1.97 , 2.064, 2.166, 2.236, 2.308, 2.382, 2.458,
       2.46 , 2.584, 2.704, 2.762, 2.836, 3.054, 3.208, 3.3  , 3.406,
       3.48 , 3.554, 3.628, 3.698, 3.83 , 3.968, 4.024, 4.096, 4.324,
       4.492, 4.584, 4.624, 4.696, 4.772, 4.844, 4.92 , 4.992, 5.128,
       5.268, 5.324, 5.402, 5.624, 5.796, 5.89 , 5.998, 6.074, 6.148,
       6.222, 6.298])

Okay there's a couple extra predicted onsets.

How many of them are less than the minimum duration for syllables we used when segmenting?


In [75]:
durs_test_0 = (Y_test_offsets[0] - Y_test_onsets[0]) * 1000
print(durs_test_0)
print("number of segments with duration less than minimum syllable duration used to segment: ", np.sum(durs_test_0 < min_dur))


[ 56.  78.  88. 108.  92.  84.  40.  36.  52.  62.  62.  58.  80.  42.
  42.  66.  88.  86.  40.  34.  54.  64.  62.  54.  80.  38.  36.  68.
  92.  90.  38.  34.  58.  62.  62.  56.  74.  42.  40.  68.  86.  88.
  38.  38.  60.  62.  64.  58.  78.]
number less than minimum syllable duration used to segment:  0

In [77]:
durs_pred_0 = (Y_pred_offsets[0] - Y_pred_onsets[0]) * 1000
print(durs_pred_0)
print("number of segments with duration less than minimum syllable duration used to segment: ", np.sum(durs_pred_0 < min_dur))


[ 36.   2.  38.  76.  78.  84.   2.   0. 102.  90.  84.  40.  38.  52.
  60.  62.  60.   0.  78.  42.  40.  68.  86.  90.  36.  32.  58.  62.
  62.  52.  78.  40.  36.  66.  94.  90.  38.  34.   6.  58.  64.  60.
  54.  78.  42.  40.  68.  88.  90.  38.  40.  58.  64.  62.  58.  80.]
number of segments with duration less than minimum syllable duration used to segment:  5

More than a couple in the predicted onsets array. What about across all the predicted onsets arrays?


In [66]:
durs_pred = []
lt_min_dur = []
for off, on in zip(Y_pred_offsets, Y_pred_onsets):
    durs = (off - on) * 1000
    durs_pred.append(durs)
    lt_min_dur.append(np.sum(durs < min_dur))

In [67]:
print(lt_min_dur)


[5, 3, 5, 7, 2, 5, 1, 4, 3, 1, 9, 8, 1, 2, 31, 4, 2, 2, 9, 6, 5, 24, 5, 8, 4, 3, 6, 2, 4, 2, 3, 0, 2, 7, 8, 9, 8, 1, 5, 5, 8, 8, 9, 1, 2, 4, 0, 5, 6, 1, 1, 11, 3, 2, 3, 3, 9, 8, 3, 2, 18, 3, 4, 2, 10, 24, 3, 7, 0, 7, 6, 4, 3, 8, 2, 7, 1, 5, 7, 5, 1, 9, 2, 3, 11, 10, 0, 2, 22, 7, 62, 8, 5, 1, 10, 6, 3, 13, 11, 3, 14, 14, 3, 4, 30, 5, 7, 1, 2, 8, 6, 4, 5, 19, 4, 8, 2, 3, 9, 2, 2, 7, 3, 16, 1, 89, 5, 4, 5, 10, 4, 15, 3, 5, 8, 0, 2, 3, 3, 4, 8, 5, 3, 7, 5, 3, 2, 6, 2, 1, 6, 6, 5, 2, 5, 3, 1, 9, 7, 5, 6, 2, 7, 6, 10, 6, 1, 2]

Okay and how does that compare to the number of extra segments in each predicted labels array (regardless of whether the segments are less than the minimum duration)?


In [68]:
num_extra = []
for Y_pred_seg, Y_test_seg in zip(Y_pred_labels, Y_test_labels_from_seg):
    num_extra.append(Y_pred_seg.shape[0]-Y_test_seg.shape[0])

In [70]:
print(num_extra)


[7, 5, 5, 12, 4, 5, 1, 4, 3, 4, 10, 10, 1, 3, 36, 5, 2, 2, 9, 7, 6, 32, 7, 10, 4, 3, 8, 2, 5, 2, 3, 0, 2, 12, 14, 10, 9, 1, 6, 5, 7, 9, 9, 1, 3, 4, 0, 5, 10, 1, 2, 11, 4, 2, 4, 3, 13, 8, 3, 2, 20, 4, 5, 2, 11, 39, 3, 7, 0, 8, 7, 4, 3, 9, 2, 7, 1, 10, 8, 5, 1, 9, 2, 3, 14, 11, 0, 2, 23, 7, 86, 9, 5, 1, 12, 6, 3, 14, 12, 3, 19, 16, 3, 4, 44, 6, 7, 1, 2, 9, 7, 5, 6, 20, 5, 8, 2, 3, 9, 4, 3, 8, 3, 19, 0, 102, 5, 4, 6, 10, 4, 16, 3, 5, 8, 0, 2, 3, 4, 4, 8, 5, 3, 7, 5, 4, 2, 6, 2, 1, 6, 6, 6, 2, 6, 3, 1, 9, 8, 5, 8, 3, 8, 8, 10, 8, 1, 3]

Hmm, looks similar.

So what if we filtered out all the segments less than the minimum duration?


In [71]:
num_extra_minus_num_lt_min = [extra - lt_dur for extra, lt_dur in zip(num_extra, lt_min_dur)]

In [72]:
print(num_extra_minus_num_lt_min)


[2, 2, 0, 5, 2, 0, 0, 0, 0, 3, 1, 2, 0, 1, 5, 1, 0, 0, 0, 1, 1, 8, 2, 2, 0, 0, 2, 0, 1, 0, 0, 0, 0, 5, 6, 1, 1, 0, 1, 0, -1, 1, 0, 0, 1, 0, 0, 0, 4, 0, 1, 0, 1, 0, 1, 0, 4, 0, 0, 0, 2, 1, 1, 0, 1, 15, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 5, 1, 0, 0, 0, 0, 0, 3, 1, 0, 0, 1, 0, 24, 1, 0, 0, 2, 0, 0, 1, 1, 0, 5, 2, 0, 0, 14, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 1, 1, 0, 3, -1, 13, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 2, 1, 1, 2, 0, 2, 0, 1]

In [79]:
np.asarray(num_extra_minus_num_lt_min).mean()


Out[79]:
1.1488095238095237

Looks like we'd do a lot better overall, although in a couple cases we get less than the number of syllables in the test set (?)