In [1]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

casa.py example notebook

This notebook runs through the basic functionality of the current casa.py implementation.

It computes the correlogram (a subband short-time autocorrelation), estimates pitch-related masks by sampling lags within the correlogram, and performs masked resynthesis to attempt to separate the sources in a mixture.

This package relies on calc_sbpca being installed in a sibling directory ../calc_sbpca/.

It also needs librosa installed.

Dan Ellis dpwe@ee.columbia.edu 2015-04-02
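
In outline, the processing chain demonstrated in the cells below is:

casobj = casa.casa(config)                   # build the filterbank / analysis engine
acg = casobj.correlogram(mix)                # subbands x lags x time frames
env = casobj.env_for_pitch(acg, pitch_hz)    # soft mask sampled at each frame's pitch lag
audio = casobj.apply_tf_mask(mix, mask, [])  # masked resynthesis back to audio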


In [2]:
import numpy as np
import IPython

In [3]:
import sys
sys.path.append('../calc_sbpca/python')

In [4]:
import librosa
import sbpca
import SAcC


autoco_ext = False

In [5]:
import casa

In [6]:
# These are two CRM utterances - one male, one female.  They are exactly the same length.
input_wav_file_1 = 'crm-11737.wav'
input_wav_file_2 = 'crm-16515.wav'
d1, sr = SAcC.readwav(input_wav_file_1)
d2, sr = SAcC.readwav(input_wav_file_2)
dmix = d1 + d2
print sr, np.shape(d1), np.shape(d2)


16000 (28476,) (28476,)

In [7]:
# This should provide a playback widget
IPython.lib.display.Audio(dmix, rate=sr)


Out[7]:

In [8]:
# To create "oracle" pitch tracks of the two voices in the mixture, 
# we'll create & run an SAcC pitch tracker.
sacc_extractor = SAcC.SAcC(SAcC.default_config())
pitches_1 = sacc_extractor(input_wav_file_1)
pitches_2 = sacc_extractor(input_wav_file_2)
# Each `pitches` array is time_frames x 3: (time_sec, pitch_hz, voicing_probability).
figure(figsize=(20,4))
plot(pitches_1[:,0], pitches_1[:,1], pitches_2[:,0], pitches_2[:,1])
# The female voice (green line) has a pitch roughly double that of the male.
legend(['crm-11737 (M)', 'crm-16515 (F)'])


Out[8]:
<matplotlib.legend.Legend at 0x10bba7990>
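
For a numeric check of the pitch difference, the voicing-probability column can restrict the comparison to confidently voiced frames (the 0.5 threshold here is an arbitrary choice for illustration):

voiced_1 = pitches_1[pitches_1[:, 2] > 0.5, 1]
voiced_2 = pitches_2[pitches_2[:, 2] > 0.5, 1]
print np.mean(voiced_1), np.mean(voiced_2)  # expect the female mean to be roughly double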

In [9]:
# We can look at the conventional spectrograms of the components and their mix.
n_fft = 512 # 32 ms at SR=16 kHz
hop_length = 160 # 10 ms at SR=16 kHz
D1 = np.abs(librosa.stft(d1, n_fft=n_fft, hop_length=hop_length))
D2 = np.abs(librosa.stft(d2, n_fft=n_fft, hop_length=hop_length))
DMIX = np.abs(librosa.stft(dmix, n_fft=n_fft, hop_length=hop_length))
figure(figsize=(20,6))
maxbin = 200 # only display bins up to 200 * (16000/512) = 6.25 kHz
# Mixture appears in the top pane.
# log10 of exactly-zero STFT cells triggers the divide-by-zero warning below; vmin keeps the display usable.
imshow(20.*np.log10(np.vstack([D1[:maxbin, :], D2[:maxbin, :], DMIX[:maxbin, :]])), 
       origin="bottom", interpolation="nearest", aspect="auto", vmin=-80)
colorbar()


-c:10: RuntimeWarning: divide by zero encountered in log10
Out[9]:
<matplotlib.colorbar.Colorbar instance at 0x10c2bb638>

In [10]:
# Set up the CASA analysis engine.
# Configure filter bank to have narrower bands than the default - for improved separation.
config = casa.test_config
config["fbank_num_bands"] = 72
config["fbank_bpo"] = 12
config["fbank_q"] = 32
print config


{'fbank_num_bands': 72, 'sample_rate': 16000, 'ac_win_sec': 0.025, 'fbank_q': 32, 'ac_hop_sec': 0.01, 'fbank_order': 2, 'fbank_min_freq': 125, 'fbank_bpo': 12}
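
If the filterbank is log-spaced at fbank_bpo bands per octave upward from fbank_min_freq (an assumption about the sbpca filterbank design, not something verified here), the nominal center frequencies come out as:

band_idx = np.arange(config["fbank_num_bands"])
cfs = config["fbank_min_freq"] * 2.0 ** (band_idx / float(config["fbank_bpo"]))
print cfs[0], cfs[-1]  # 125 Hz up to about 7.5 kHz, just under the 8 kHz Nyquist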

In [11]:
# The CASA "object" provides functions using the filterbank configuration etc.
casobj = casa.casa(config)

In [12]:
# Create the autocorrelogram of the mixture
acg = casobj.correlogram(dmix)
# Shape is subbands x lags x time frames (10ms step)
print acg.shape


(72, 400, 173)
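
These dimensions follow directly from the config: 72 subbands, 0.025 s * 16000 Hz = 400 autocorrelation lags, and one frame per 0.01 s hop (the frame count comes out a little below len(dmix)/hop because of windowing at the signal edges):

print config["fbank_num_bands"]                           # 72 subbands
print int(config["ac_win_sec"] * config["sample_rate"])   # 400 lags
print len(dmix) / int(config["ac_hop_sec"] * config["sample_rate"])  # 177 hops vs. 173 frames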

In [13]:
# The 0th lag is a special case: it shows the overall energy in each (t, f) cell,
# i.e. an auditory-scale spectrogram.
figure(figsize=(20,4))
imshow(10*np.log10(acg[:,0,:]), origin="lower", interpolation="nearest", aspect="auto")
colorbar()


Out[13]:
<matplotlib.colorbar.Colorbar instance at 0x10cf04dd0>
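
As a reminder of why the zero-lag slice measures energy: the autocorrelation of a windowed signal at lag 0 is just its sum of squares. A standalone check:

x = np.random.randn(400)
print np.allclose(np.correlate(x, x, mode='full')[len(x) - 1], np.sum(x ** 2))  # True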

In [14]:
# The correlogram is a 3D volume.  
# If we slice at a single time, we can see the short-time autocorrelations.  
# Frame 40 (t=0.4 s) has a strong mixture of pitches.
frame = 40
# Find the lags corresponding to the two ground-truth pitches at this frame.
# The autocorrelation is forward-looking, so delay the pitch estimates by a couple of 10ms frames.
pitch_delay = 2
expected_lag_1 = np.round(sr/pitches_1[frame + pitch_delay, 1])
expected_lag_2 = np.round(sr/pitches_2[frame + pitch_delay, 1])
figure(figsize=(20,4))
imshow((acg[:, :, frame]), origin="lower", interpolation="nearest", aspect="auto")
# White vertical stripes indicate the lags of the two ground-truth pitches, which is where
# we'll sample when building the separation masks.  They roughly line up with correlation peaks.
plot([[expected_lag_1, expected_lag_2], [expected_lag_1, expected_lag_2]], [[0,0],[72, 72]], 'w')
axis([0, 400, 0, 72])
colorbar()


Out[14]:
<matplotlib.colorbar.Colorbar instance at 0x10dbcacf8>
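
The pitch-to-lag conversion is simple arithmetic: a pitch of f Hz has a period of sr/f samples, so at sr = 16000 a 100 Hz (male-range) pitch maps to lag 160 and a 200 Hz pitch to lag 80:

for f in [100.0, 200.0]:
    print f, np.round(sr / f)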

In [15]:
# Now we can construct masks by sampling at the pitches.
# The autocorrelogram is forward looking, so we delay the pitch by a couple of 10ms steps.
num_acg_frames = acg.shape[-1]
env_1 = casobj.env_for_pitch(acg, pitches_1[pitch_delay:pitch_delay + num_acg_frames, 1])
env_2 = casobj.env_for_pitch(acg, pitches_2[pitch_delay:pitch_delay + num_acg_frames, 1])
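
A minimal sketch of the lag-sampling idea (an illustration of the principle only, not the actual casa.env_for_pitch implementation, which may interpolate between lags or normalize differently):

def env_for_pitch_sketch(acg, pitch_hz, sr=16000):
    # For each frame, read out the correlogram at the lag nearest the pitch
    # period sr/pitch, giving one mask value per subband.
    n_bands, n_lags, n_frames = acg.shape
    env = np.zeros((n_bands, n_frames))
    for t in range(n_frames):
        if pitch_hz[t] > 0:  # pitch 0 marks unvoiced frames; leave the mask at zero
            lag = min(int(round(sr / pitch_hz[t])), n_lags - 1)
            env[:, t] = acg[:, lag, t]
    return env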

In [16]:
# Look at the actual masks.  
# The top pane is the mask for the female voice; you can see it picks out a higher fundamental.
figure(figsize=(20,4))
imshow(np.vstack([env_1, env_2]), 
       origin="bottom", interpolation="nearest", aspect="auto")
colorbar()


Out[16]:
<matplotlib.colorbar.Colorbar instance at 0x111d56320>

In [17]:
# We can build "hard" masks by choosing, in each time-frequency cell, whichever soft mask
# is larger, and we can restrict them to higher-confidence cells by requiring one mask
# to exceed the other by a factor.
overfactor = 2.0
m_1 = env_1 > overfactor*env_2
m_2 = env_2 > overfactor*env_1
# All the cells not included in either mask.
m_0 = 1 - np.maximum(m_1, m_2)
# `silent_bands` can be used to suppress some channels in resynthesis
# to check the contribution of individual bands.
silent_bands = [] # e.g. [x for x in range(72) if x != 50] to keep only band 50
raudio_1 = casobj.apply_tf_mask(d1+d2, m_1, silent_bands)
raudio_2 = casobj.apply_tf_mask(d1+d2, m_2, silent_bands)


n_audio/win_samps= 177.975 n_frame= 173 win_samps= 160
n_audio/win_samps= 177.975 n_frame= 173 win_samps= 160
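
A quick summary of how the time-frequency cells were divided up (fractions claimed by each hard mask and left unassigned; the three numbers sum to 1.0 because the overfactor makes m_1 and m_2 mutually exclusive):

print m_1.mean(), m_2.mean(), m_0.mean()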

In [18]:
# Listen to the masked resynthesis - Male.
# Note: because this is pitch-based separation, unvoiced sounds such as fricatives are not recovered at all.
IPython.lib.display.Audio(raudio_1, rate=sr)


Out[18]:

In [19]:
# ... and the female.
IPython.lib.display.Audio(raudio_2, rate=sr)


Out[19]:

In [21]:
# Look at the hard masks.
figure(figsize=(20,4))
imshow(np.vstack([m_1, m_2]), 
       origin="bottom", interpolation="nearest", aspect="auto")
colorbar()


Out[21]:
<matplotlib.colorbar.Colorbar instance at 0x10e1c77e8>

In [22]:
# Look at the conventional spectrograms of the resyntheses.  
# There's a lot of spread from each subband, so the hard masks end up not being very selective.
DM = np.abs(librosa.stft(d1 + d2, n_fft=n_fft, hop_length=hop_length))
DR1 = np.abs(librosa.stft(raudio_1, n_fft=n_fft, hop_length=hop_length))
DR2 = np.abs(librosa.stft(raudio_2, n_fft=n_fft, hop_length=hop_length))
figure(figsize=(20,8))
imshow(20.*np.log10(np.vstack([DM[:200, :], DR1[:200, :], DR2[:200, :]])), 
       origin="bottom", interpolation="nearest", aspect="auto", vmin=-80)
colorbar()


-c:7: RuntimeWarning: divide by zero encountered in log10
Out[22]:
<matplotlib.colorbar.Colorbar instance at 0x10e5e7518>
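
A rough check of how much of the mixture energy each resynthesis retains (a coarse proxy for mask selectivity, not a proper separation metric):

print np.sum(raudio_1 ** 2) / np.sum(dmix ** 2)
print np.sum(raudio_2 ** 2) / np.sum(dmix ** 2)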