NMF Source Separation

Each piano note consists of a very stable stack of harmonics with a smooth decay. This kind of sound event is well captured by separable time and frequency components, i.e., a single spectral profile modulated in time by the decaying envelope of each note event. NMF (Nonnegative Matrix Factorization) finds an approximate decomposition of a nonnegative matrix into a fixed number of spectrum/envelope pairs. Under the right circumstances, these components can end up being individual piano notes - although controlling this can be tricky. This practical uses the NMF implementation from sklearn (wrapped by librosa) to decompose a brief piano excerpt.
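To make the model concrete: NMF takes a nonnegative matrix V (here a magnitude spectrogram, frequency x time) and finds nonnegative factors W (frequency x components) and H (components x time) with V ≈ WH, so each component contributes a fixed spectrum W[:, n] scaled over time by its activation H[n, :]. Below is a minimal sketch of the classic Lee-Seung multiplicative updates, for intuition only; it is not the solver sklearn actually uses, and all the names are our own:

import numpy as np

def nmf_sketch(V, n_components, n_iter=200, eps=1e-10, seed=0):
    """Toy NMF via multiplicative updates (illustration only)."""
    rng = np.random.RandomState(seed)
    W = rng.rand(V.shape[0], n_components)
    H = rng.rand(n_components, V.shape[1])
    for _ in range(n_iter):
        # Each update keeps the factors nonnegative by construction and
        # does not increase the squared reconstruction error ||V - WH||^2.
        H *= np.dot(W.T, V) / (np.dot(np.dot(W.T, W), H) + eps)
        W *= np.dot(V, H.T) / (np.dot(W, np.dot(H, H.T)) + eps)
    return W, H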


In [1]:
%pylab inline
from __future__ import print_function
import os
import time
import IPython

import numpy as np

import librosa


Populating the interactive namespace from numpy and matplotlib

In [2]:
def my_specshow(S, sr, t_max=None, f_max=None, **kwargs):
    """Call librosa.specshow but limit the time and frequency ranges."""
    librosa.display.specshow(S, sr=sr, cmap='jet', x_axis='time', y_axis='linear', **kwargs)
    if t_max is None:
        t_max = librosa.frames_to_time(S.shape[1])
    tick_frames = librosa.time_to_frames(np.arange(round(t_max) + 1.0))
    plt.xlim([0, tick_frames[-1]])
    if f_max is None:
        f_max = S.shape[0] * sr / n_fft
    # Pass sr and the global n_fft (defined in a later cell) explicitly so the
    # bin frequencies match this analysis rather than librosa's defaults.
    freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)
    max_bin = np.flatnonzero(freqs >= f_max)[0]
    plt.ylim([0, max_bin])
    librosa.display.time_ticks(tick_frames, librosa.frames_to_time(tick_frames), n_ticks=len(tick_frames))
    plt.xlabel('Time')
    librosa.display.frequency_ticks(freqs[:max_bin], axis='y')
    plt.ylabel('Hz');

In [3]:
y, sr = librosa.load('gould-wtc1.wav', sr=None)
print(sr, y.shape)


11025 (330750,)

In [4]:
# STFT analysis of the piano excerpt.
n_fft = 1024
S = librosa.stft(y, n_fft=n_fft)
plt.figure(figsize=(10, 4))
# Zoom in on the first few notes, and the lowest few harmonics.
t_max = 5.0
f_max = 2000.0
my_specshow(librosa.logamplitude(S), sr=sr, t_max=t_max, f_max=f_max)
plt.colorbar()
IPython.display.Audio(y, rate=sr)


Out[4]:

In [5]:
# Filter out the first three notes by a low-pass filter, applied directly
# on the spectrogram.  It looks like cutting at 450 Hz should do it, so 
# zero out all the bins above that.
cutoff_bin = int(round(450.0 / sr * n_fft))
S_lpf = S.copy()
S_lpf[cutoff_bin:] = 0
# Convert spectrogram back to audio.
y_lpf = librosa.istft(S_lpf)
plt.figure(figsize=(10, 4))
# Re-analyze back into STFT
my_specshow(librosa.logamplitude(librosa.stft(y_lpf, n_fft=n_fft)), sr=sr, t_max=t_max, f_max=f_max)
plt.colorbar()
plt.clim([-56, 12])
IPython.display.Audio(y_lpf, rate=sr)


Out[5]:

In [6]:
# The LPF removes the higher notes, but an HPF doesn't work as well
# for removing the lower notes, because their upper harmonics
# still come through.
S_hpf = S.copy()
S_hpf[:cutoff_bin] = 0
y_hpf = librosa.istft(S_hpf)
plt.figure(figsize=(10, 4))
my_specshow(librosa.logamplitude(librosa.stft(y_hpf, n_fft=n_fft)), 
            sr=sr, t_max=t_max, f_max=f_max)
plt.xlim([0, librosa.time_to_frames(t_max)])
plt.ylim([0, f_max / float(sr) * n_fft])
plt.colorbar()
plt.clim([-56, 12])
IPython.display.Audio(y_hpf, rate=sr)


Out[6]:

NMF Decomposition

librosa.decompose.decompose uses sklearn.decomposition.NMF to approximate the spectrogram as a sum of rank-1 spectrum x activation components. We want one component per note; there are a few dozen distinct notes in the excerpt, so we set n_components in the hope of getting one component for each.
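For reference, the librosa call in the next cell is roughly equivalent to using sklearn directly, something like the sketch below (decompose() additionally handles the component sorting and parameter pass-through):

from sklearn.decomposition import NMF

model = NMF(n_components=40)
W = model.fit_transform(np.abs(S))  # spectral profiles, (n_freq, n_components)
H = model.components_               # activations, (n_components, n_frames)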


In [7]:
# Perform NMF decomposition.
n_components = 40
# alpha is the weight for the regularization.  With alpha set to zero, the components
# simply try to minimize the error in reproducing the spectrogram.  l1_ratio controls
# the balance between l2 (total energy) and l1 (sparsity) terms in the regularization.
W, H = librosa.decompose.decompose(np.abs(S), n_components=n_components, sort=True, 
                                   alpha=0.0, l1_ratio=0.0)
# W gives one spectral profile per component, and H gives the corresponding time activations.
print(W.shape, H.shape)


(513, 40) (40, 1292)
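If the learned components smear across several notes, the regularization is one knob to try: a nonzero alpha with l1_ratio near 1 penalizes the l1 norm of W and H, pushing both toward sparsity. The values below are hypothetical starting points, not tuned for this excerpt (and note that recent sklearn versions replace alpha with separate alpha_W/alpha_H parameters):

# Hypothetical settings: same call as above, with l1-dominated regularization.
W_sparse, H_sparse = librosa.decompose.decompose(np.abs(S), n_components=n_components,
                                                 sort=True, alpha=0.1, l1_ratio=1.0)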

In [8]:
# Look at the individual spectral slices in W.
# decompose() sorts the W slices by the frequency of their max-magnitude bin, 
# usually the fundamental.
plt.figure(figsize=(10, 8))
_ = plt.plot(0.5 * np.log10(np.maximum(1, 10*W)) + np.arange(n_components)[np.newaxis, :], np.arange(513))
plt.ylim([0, 200])
tick_bins = plt.yticks()[0]
plt.yticks(tick_bins, np.round(sr * tick_bins / n_fft))
plt.ylabel('Hz')
plt.xlabel('component')


Out[8]:
<matplotlib.text.Text at 0x1052df810>
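(For the curious, the sort=True ordering amounts to something like the one-liner below, ranking components by the bin index of their spectral peak; this is a sketch of the idea, not librosa's exact code.)

order = np.argsort(np.argmax(W, axis=0))  # ascending peak-frequency order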

In [9]:
# The dot-product of W and H gives the complete approximate reconstruction of the input
# spectrogram.  With enough components, it looks close; compare them side by side.
plt.figure(figsize=(10, 8))
plt.subplot(311)
my_specshow(librosa.logamplitude(S), sr=sr, t_max=t_max, f_max=f_max)
plt.subplot(312)
my_specshow(librosa.logamplitude(np.dot(W, H)), sr=sr, t_max=t_max, f_max=f_max)
# Print the actual mean-squared error
print("MSE=", np.mean((np.abs(S) - np.dot(W, H))**2))
# Plot the activations on the same scale, to see how they line up with note events.
plt.subplot(313)
my_specshow(np.abs(H), sr=sr, t_max=t_max)
plt.yticks(np.arange(0, n_components, 5), np.arange(0, n_components, 5))
plt.ylabel('component')


MSE= 0.00145519867136
Out[9]:
<matplotlib.text.Text at 0x1137db550>
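The raw MSE is hard to judge in isolation; one way to put it on a meaningful scale (our addition, not part of the original analysis) is relative to the spectrogram's total energy:

rel_err = np.sum((np.abs(S) - np.dot(W, H))**2) / np.sum(np.abs(S)**2)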

In [10]:
# Show the individual contributions of the first 8 components
# and resynthesize each one by masking the original spectrogram with its
# proportion of the total approximation.
masks = np.zeros((n_components, S.shape[0], S.shape[1]))
for n in range(n_components):
    masks[n] = np.dot(W[:,[n]], H[[n]])
mask_total = np.sum(masks, axis=0)
# Avoid division by zero in cells where the model is exactly zero.
mask_total += (mask_total == 0)
num_to_show = 8
plt.figure(figsize=(12, 6))
for n in range(num_to_show):
    plt.subplot(3, 3, n + 1)
    my_specshow(librosa.logamplitude(masks[n]), sr=sr, t_max=t_max, f_max=f_max)
    plt.xlim([0, librosa.time_to_frames(t_max)])
    plt.ylim([0, f_max / float(sr) * n_fft])
    title("component {}".format(n))
    clim([-30, 10])
    Y_R = masks[n] / mask_total * S
    y_r = librosa.istft(Y_R)
    IPython.display.display(IPython.display.Audio(y_r, rate=sr))
# In the final pane, plot the sum of all the preceding panes.
plt.subplot(3, 3, num_to_show + 1)
partial_sum = np.sum(masks[:num_to_show], axis=0)
my_specshow(librosa.logamplitude(partial_sum), sr=sr, t_max=t_max, f_max=f_max)
plt.title("partial sum")
plt.clim([-30, 10])
Y_R = partial_sum / mask_total * S
y_r = librosa.istft(Y_R)
IPython.display.display(IPython.display.Audio(y_r, rate=sr))



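The masks above form a soft (Wiener-style) filter: component n is resynthesized by scaling the complex STFT by its share of the total model magnitude, so the per-component pieces add back up to the full model. A quick sanity check of that identity (our addition):

# The rank-1 pieces should sum to the full reconstruction np.dot(W, H).
assert np.allclose(np.sum(masks, axis=0), np.dot(W, H))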
In [11]:
# Which components actually encode the first note?  Plot the activations in the
# first five frames against component index (one line per frame).
_ = plt.plot(H[:, :5])



In [12]:
# Reconstruct from only the lowest-fundamental components (plus a few higher ones that seem involved).
num_low_components = 10
extras = num_low_components + np.nonzero(np.max(H[num_low_components:, :5], axis=1) > 0.2)[0]
print(extras)
components_to_use = list(range(num_low_components))
components_to_use.extend(extras.tolist())
mask = np.dot(W[:, components_to_use], H[components_to_use])
Y_R = mask / mask_total * S
plt.figure(figsize=(15, 4))
my_specshow(librosa.logamplitude(Y_R), sr=sr, t_max=t_max, f_max=f_max)
y_r = librosa.istft(Y_R)
IPython.display.display(IPython.display.Audio(y_r, rate=sr))


[18 29 30 31]

In [13]:
# Reconstruct using all the remaining components - the other notes.
remaining_components = [n for n in range(n_components) if n not in components_to_use]
mask = np.dot(W[:, remaining_components], H[remaining_components])
Y_R = mask / mask_total * S
plt.figure(figsize=(15, 4))
my_specshow(librosa.logamplitude(Y_R), sr=sr, t_max=t_max, f_max=f_max)
y_r = librosa.istft(Y_R)
IPython.display.display(IPython.display.Audio(y_r, rate=sr))


