In [1]:
# Code source: Brian McFee
# License: ISC
import numpy as np
import scipy
import sklearn.cluster
import librosa
import librosa.display
import matplotlib.pyplot as plt
import matplotlib.style as ms
ms.use('seaborn-muted')
%matplotlib inline
In [2]:
file_path = "../data/songData/genres/disco/disco.00000.wav"
y, sr = librosa.load(file_path)
In [3]:
BINS_PER_OCTAVE = 12 * 3
N_OCTAVE = 7
C = librosa.amplitude_to_db(librosa.cqt(y=y, sr=sr,
bins_per_octave=BINS_PER_OCTAVE,
n_bins=N_OCTAVE * BINS_PER_OCTAVE),
ref=np.max)
plt.figure(figsize=(12, 4))
librosa.display.specshow(C, y_axis='cqt_hz', sr=sr,
bins_per_octave=BINS_PER_OCTAVE,
x_axis='time')
plt.title('log power cqt')
plt.tight_layout()
In [4]:
tempo, beats = librosa.beat.beat_track(y=y, sr=sr, trim=False)
Csync = librosa.util.sync(C, beats, aggregate=np.median)
beat_times = librosa.frames_to_time(librosa.util.fix_frames(beats,
x_min=0,
x_max=C.shape[1]),
sr=sr)
plt.figure(figsize=(12, 4))
librosa.display.specshow(Csync, bins_per_octave=12*3,
y_axis='cqt_hz', x_axis='time',
x_coords=beat_times)
plt.tight_layout()
In [5]:
R = librosa.segment.recurrence_matrix(Csync, width=3, mode='affinity', sym=True)
df = librosa.segment.timelag_filter(scipy.ndimage.median_filter)
Rf = df(R, size=(1, 7))
In [6]:
mfcc = librosa.feature.mfcc(y=y, sr=sr)
Msync = librosa.util.sync(mfcc, beats)
path_distance = np.sum(np.diff(Msync, axis=1)**2, axis=0)
sigma = np.median(path_distance)
path_sim = np.exp(-path_distance / sigma)
R_path = np.diag(path_sim, k=1) + np.diag(path_sim, k=-1)
In [7]:
deg_path =np.sum(R_path, axis=1)
deg_rec = np.sum(Rf, axis=1)
mu = deg_path.dot(deg_path + deg_rec) / np.sum((deg_path + deg_rec)**2)
A = mu * Rf + (1 - mu) * R_path
In [8]:
plt.figure(figsize=(8, 4))
plt.subplot(1, 3, 1)
librosa.display.specshow(Rf, cmap='inferno_r', y_axis='time', y_coords=beat_times)
plt.title('Reccurence Similarity')
plt.subplot(1, 3, 2)
librosa.display.specshow(R_path, cmap='inferno_r')
plt.title('Path similarity')
plt.subplot(1, 3, 3)
librosa.display.specshow(A, cmap='inferno_r')
plt.title('Conbined graph')
plt.tight_layout()
In [9]:
L = scipy.sparse.csgraph.laplacian(A, normed=True)
evals, evecs = scipy.linalg.eigh(L)
evecs = scipy.ndimage.median_filter(evecs, size=(9, 1))
Cnorm = np.cumsum(evecs**2, axis=1)**0.5
k =5
X = evecs[:, :k] / Cnorm[:, k-1:k]
plt.figure(figsize=(8,4))
plt.subplot(1, 2, 2)
librosa.display.specshow(Rf, cmap='inferno_r')
plt.title('Reccurence matrix')
plt.subplot(1, 2, 1)
librosa.display.specshow(X,
y_axis='time',
y_coords=beat_times)
plt.title('Structure components')
plt.tight_layout()
In [10]:
KM = sklearn.cluster.KMeans(n_clusters=k)
seg_ids = KM.fit_predict(X)
plt.figure(figsize=(12, 4))
colors = plt.get_cmap('Paired', k)
plt.subplot(1, 3, 2)
librosa.display.specshow(Rf, cmap='inferno_r')
plt.title('Reccurence matrix')
plt.subplot(1, 3, 1)
librosa.display.specshow(X,
y_axis='time',
y_coords=beat_times)
plt.title('Structure components')
plt.subplot(1, 3, 3)
librosa.display.specshow(np.atleast_2d(seg_ids).T, cmap=colors)
plt.title('Estimated segments')
plt.tight_layout()
In [11]:
bound_beats = 1 + np.flatnonzero(seg_ids[:-1] != seg_ids[1:])
bound_beats = librosa.util.fix_frames(bound_beats, x_min=0)
bound_segs = list(seg_ids[bound_beats])
bound_frames = beats[bound_beats]
bound_frames = librosa.util.fix_frames(bound_frames,
x_min=None,
x_max=C.shape[1]-1)
In [ ]:
import matplotlib.patches as patches
plt.figure(figsize=(12, 4))
bound_times = librosa.frames_to_time(bound_frames)
freqs = librosa.cqt_frequencies(n_bins=C.shape[0],
fmin=librosa.note_to_hz('C1'),
bins_per_octave=BINS_PER_OCTAVE)
librosa.display.specshow(C, x_axis='time', y_axis='cqt_hz',
sr=sr, bins_per_octave=BINS_PER_OCTAVE)
ax = plt.gca()
for interval, label in zip(zip(bound_times, bound_times[1:]), bound_segs):
ax.add_patch(patches.Rectangle((interval[0], freqs[0]),
interval[1] - interval[0],
freqs[-1],
facecolor=colors(label),
alpha=0.5))
plt.tight_layout()
plt.show()