Cross-repetition DTW

  • Idea: we need strong positive examples for training repetition detection
  • Approach

    • Get labeled segmentations
    • For each track, and each pair of sections within a track, run dtw
    • See if we can threshold the dtw cost to determine whether two sections are repetitions of one another
    • If so, take the path as a set of ground-truth positives
    • Sample negatives iid at random
    • win everything
  • Extensions

    • In the downstream classifier, use windows of beat-sync features
    • That way we can get some smoothing and local context

In [8]:
import librosa
import dtw
import matplotlib.pyplot as plt
import seaborn
seaborn.set(style='dark')
%matplotlib inline

import numpy as np
import scipy
import mir_eval
import mpld3

import msaf

In [9]:
from IPython.html.widgets import interact

In [10]:
def lab_to_aud(fname):
    """Map a segment-label (.lab) file path to its corresponding audio (.flac) path.

    Assumes the dataset layout keeps annotations under a 'seglab_tut'
    directory mirrored by an 'audio' directory — TODO confirm for new datasets.
    """
    # Swap the annotation directory for the audio directory, then the extension.
    audio_name = fname.replace('seglab_tut', 'audio')
    return audio_name.replace('.lab', '.flac')

In [11]:
def make_cost_matrix(audio_file, intervals, labels, dist):
    """Compute pairwise DTW alignment costs between annotated sections of a track.

    Parameters
    ----------
    audio_file : str
        Path to the audio file to analyze.
    intervals : np.ndarray, shape=(n, 2)
        Section boundaries in seconds (start, end per row) — presumably
        aligned with `labels`; verify against the annotation loader.
    labels : list
        Section labels; only its length is used here (one row/column per section).
    dist : callable
        Point-wise distance function, passed through to ``dtw.dtw``.

    Returns
    -------
    D : np.ndarray, shape=(n, n), dtype=float32
        Symmetric matrix of DTW costs; 0 on the diagonal, NaN where either
        section was too short (< 2 sub-beat frames) to align.
    P : list of lists
        P[i][j] for i < j holds the DTW path as a pair of int32 index arrays;
        0 on the diagonal, NaN where no alignment was computed.
    """
    y, sr = librosa.load(audio_file)

    # Log-power CQT features
    cqgram = librosa.logamplitude(librosa.cqt(y, sr=sr)**2, ref_power=np.max)

    # Convert section boundaries from seconds to spectrogram frames
    intframes = librosa.time_to_frames(intervals)

    tempo, beats = librosa.beat.beat_track(y=y, sr=sr, trim=False)

    # Sub-divide beats for finer temporal resolution, padded to the full
    # frame range so every frame falls in some segment
    subseg = librosa.segment.subsegment(cqgram, beats)
    subseg = librosa.util.fix_frames(subseg, x_min=0, x_max=cqgram.shape[1])

    # Synchronize (median-aggregate) the CQT over the sub-beat segments
    cqgram = librosa.feature.sync(cqgram, subseg, aggregate=np.median)

    # Snap section boundaries onto the nearest sub-beat frame indices
    intframes = librosa.util.match_events(intframes, subseg)

    n = len(labels)

    # Score matrix: NaN = not computed (section too short), 0 on the diagonal
    D = np.full((n, n), np.nan, dtype=np.float32)
    np.fill_diagonal(D, 0)

    # Path matrix, same NaN/0 convention as D
    P = [[np.nan] * n for _ in range(n)]
    for i in range(n):
        P[i][i] = 0

    # Compute DTW scores and paths for each pair of sufficiently long sections
    for i in range(n):
        # Skip sections shorter than 2 frames before slicing: DTW needs at
        # least 2 points, and slicing a skipped section is wasted work
        if intframes[i, 1] - intframes[i, 0] < 2:
            continue
        x_slice = cqgram[:, intframes[i, 0]:intframes[i, 1]].T
        for j in range(i + 1, n):
            if intframes[j, 1] - intframes[j, 0] < 2:
                continue
            y_slice = cqgram[:, intframes[j, 0]:intframes[j, 1]].T

            dtw_cost, distance, path = dtw.dtw(x_slice, y_slice, dist=dist)
            # Costs are symmetric; mirror into the lower triangle
            D[i, j] = dtw_cost
            D[j, i] = D[i, j]
            # Store the warping path as a pair of int32 index arrays
            path = list(path)
            path[0] = np.asarray(path[0], dtype=np.int32)
            path[1] = np.asarray(path[1], dtype=np.int32)
            P[i][j] = path

    return D, P

In [ ]:
# Collect all .lab segment-annotation files and pick one track (index k) to study.
lab_files = librosa.util.find_files('/home/bmcfee/data/beatles_iso/seglab_tut/', ext='lab')
k = 39
lab_files[k]

In [ ]:
# Load the audio path and reference segmentation via mir_eval.
# NOTE(review): superseded by the msaf-based loading in the next cell,
# which rebinds audio_file/intervals/labels.
audio_file = lab_to_aud(lab_files[k])
intervals, labels = mir_eval.io.load_labeled_intervals(lab_files[k])

In [12]:
# Alternative loading path via msaf: same track index, JAMS section
# annotations at the "function" level from annotator 0.
ds_path = "/home/uri/datasets/BeatlesTUT/"
file_structs = msaf.io.get_dataset_files(ds_path)
k = 39
audio_file = file_structs[k].audio_file
intervals, labels = msaf.jams2.converters.load_jams_range(
            file_structs[k].ref_file, "sections", annotator=0, context="function")

In [13]:
# Pairwise DTW costs (_D) and warping paths (_P) between all labeled sections,
# using correlation distance between feature frames.
_D, _P = make_cost_matrix(audio_file, intervals, labels, scipy.spatial.distance.correlation)
boundaries = intervals

In [14]:
# Duration (in seconds) of each section
duration = np.diff(boundaries).ravel()

In [15]:
# douter[i, j] = min(duration_i, duration_j): the shorter of each section pair,
# a candidate normalizer for the raw DTW costs
douter = np.minimum.outer(duration, duration)

In [16]:
np.set_printoptions(precision=3)

In [17]:
# Experiment: normalize each cost by the shorter section's duration.
# NOTE(review): this assignment is immediately overwritten by the next cell,
# so the plots below use the *unnormalized* costs.
D = _D * douter**(-1.0)

In [18]:
D = _D

In [19]:
# Visualize the (symmetric) cost matrix; like-labeled sections should be cheap
seaborn.heatmap(D, xticklabels=labels, yticklabels=labels)


Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x7ff4bf7e7a10>

In [20]:
# Interactive thresholding: slide `threshold` over the observed cost range and
# see which section pairs would be declared repetitions (D <= threshold).
mpld3.disable_notebook()
# Largest finite cost sets the slider range (NaNs mark uncomputed pairs)
maxval = float(D[np.isfinite(D)].max())

@interact(threshold=(0, maxval, maxval/20.0))
def _plot(threshold=0):
    plt.figure(figsize=(5,4))
    # Boolean mask of "repetition" pairs at this threshold
    seaborn.heatmap((D<=threshold), xticklabels=labels, yticklabels=labels, vmin=0, vmax=1)
    plt.tight_layout()



In [21]:
# Self-contained reproduction of the interact widget on random data —
# presumably a minimal test case for a widget/rendering issue, independent
# of the audio pipeline above.
import mpld3
import numpy as np
import matplotlib.pyplot as plt
import seaborn
seaborn.set(style='dark')
from IPython.html.widgets import interact

%matplotlib inline

mpld3.disable_notebook()

dummy_data = np.random.random((10, 10))
@interact(threshold=(0, 1, 0.05))
def _plot(threshold=0):
    plt.figure(figsize=(5,4))
    # Same thresholded-heatmap pattern as the real cost-matrix plot
    seaborn.heatmap((dummy_data<=threshold), vmin=0, vmax=1)
    plt.tight_layout()




In [272]:
# Scratch cell: recompute audio and CQT features for the selected track
# outside of make_cost_matrix, for listening/inspection below.
lab_file = lab_files[k]

y, sr = librosa.load(lab_to_aud(lab_file))
# Harmonic separation was tried and disabled
#y = librosa.effects.harmonic(y)

intervals, labels = mir_eval.io.load_labeled_intervals(lab_file)

# Log-power CQT, same features as the cost-matrix pipeline
cqgram = librosa.logamplitude(librosa.cqt(y, sr=sr)**2, ref_power=np.max)
# Per-band z-scoring was tried and disabled
#cqgram = scipy.stats.zscore(cqgram, axis=1)
#cqgram = scipy.stats.zscore(cqgram, axis=1)

In [273]:
import IPython.display

In [274]:
# Inline audio player for the loaded track
IPython.display.Audio(data=y, rate=sr)


Out[274]:

In [275]:
import mpld3

In [276]:
# Re-enable interactive (mpld3) rendering for the spectrogram below
mpld3.enable_notebook()

In [277]:
# Display the CQT spectrogram of the track
plt.figure(figsize=(12,4))
librosa.display.specshow(cqgram)
plt.colorbar()
plt.tight_layout()