In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from librosa import cqt

# Move to the repository root so the `music_transcription` package and the relative
# data/model paths resolve. Guarded so re-running this cell does not climb further up.
if not os.path.isdir('music_transcription'):
    os.chdir('..')
from music_transcription.onset_detection.read_data import read_X
from music_transcription.onset_detection.cnn_onset_detection import CnnOnsetDetector


Using Theano backend.
WARNING (theano.sandbox.cuda): The cuda backend is deprecated and will be removed in the next release (v0.10).  Please switch to the gpuarray backend. You can get more information about how to switch at this URL:
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX 980M (CNMeM is enabled with initial size: 80.0% of memory, cuDNN 5005)

In [2]:
# Build the path from its components so it resolves on every OS; the previous
# backslash-separated raw string was only a valid path on Windows.
wavpath = os.path.join('data', 'recordings', 'audio', 'mim-riff1-short-slow.wav')

Current Spectrogram: FFT Sizes (4096, 2048, 1024)


In [3]:
# Restore the onset detector from its zip archive.
model_zip = 'models/onset_detection/20170601-3-channels_ds1-4_80-perc_adjusted-labels_with_config.zip'
onset_detector = CnnOnsetDetector.from_zip(model_zip)

# Run the detector's own feature extraction on the single recording.
# NOTE(review): `_read_and_extract` is a private method of the feature extractor.
sg = onset_detector.feature_extractor._read_and_extract([wavpath])

# Display the shape of the extracted features (recorded run: (3, 1993, 80)).
np.asarray(sg).shape


Out[3]:
(3, 1993, 80)

In [5]:
# One figure per extracted spectrogram; each is transposed so that its second
# axis (80 wide in the recorded run) runs vertically.
# `plt.cm.spectral` was removed in matplotlib 2.2; `nipy_spectral` is the same
# colormap under its current name and also exists in older releases.
for spectrogram in sg:
    plt.figure(figsize=(12,6))
    plt.imshow(np.array(spectrogram).T, aspect=8, origin='lower', cmap=plt.cm.nipy_spectral)


Constant Q Transform (CQT)


In [4]:
# Reuse the detector's own audio settings so the CQT sees the same signal.
extractor = onset_detector.feature_extractor
fr_hz, sr, subs = extractor.frame_rate_hz, extractor.sample_rate, extractor.subsampling_step

# read_X returns a pair; only the first element (the framed samples) is needed here.
samples, _ = read_X(wavpath, fr_hz, sr, subs)
print(samples.shape)

# Flatten the frames into one continuous 1-D signal for librosa.
samples = samples.reshape(-1)
print(samples.shape)


(2001, 441)
(882441,)

Using all default values


In [62]:
# CQT with librosa's default parameters; only the sample rate is passed explicitly.
cqt_spect = np.abs(cqt(samples, sr=sr))  # magnitude of the complex CQT coefficients

plt.figure(figsize=(12,6))
# `plt.cm.spectral` was removed in matplotlib 2.2; `nipy_spectral` is its current name.
plt.imshow(cqt_spect, aspect=8, origin='lower', cmap=plt.cm.nipy_spectral)
yt = np.arange(0, cqt_spect.shape[0], 5)
_ = plt.yticks(yt, yt+24) # range starts at midi pitch 24


Refined approach

midipitch 33-93, using 1,2 and 3 bins per note
no scaling (seems to amplify harmonic notes (octaves))

Default values used:

  • filter_scale=1, # Filter scale factor. Small values (<1) use shorter windows for improved time resolution. (same as using different hop-sizes?)
  • norm=1, # Type of norm to use for basis function normalization.
  • sparsity=0.01, # Sparsify the CQT basis by discarding up to sparsity fraction of the energy in each basis. 0 = off
  • window='hann'
  • pad_mode='reflect'

In [5]:
hopl = 512  # must be 2^n, cannot be 441 (= subs)
n_bins = 60  # number of semitones analysed, starting at lowfreq
lowfreq = 55.0  # A1 (midi 33); the low guitar E would be 82.4068892282 Hz
# The highest analysed frequency would be lowfreq * 2**(n_bins/12).

# (bins per note, [(figsize, imshow aspect, title, y-tick step), ...])
# Each CQT is computed once and drawn in every view listed for it.
cqt_views = [
    (1, [((24, 6), 8, '1 bin per note, aspect=8', 5)]),
    (2, [((24, 6), 4, '2 bins per note, aspect=4 (scaled down  y axis to match the size of the 1 bin plot)', 10),
         ((24, 12), 8, '2 bins per note, aspect=8', 4)]),
    (3, [((24, 9), 4, '3 bins per note, aspect=4 (scaled down y axis)', 9),
         ((24, 18), 8, '3 bins per note, aspect=8', 3)]),
]

for bins_per_note, views in cqt_views:
    cqt_spect = cqt(samples, sr=sr, hop_length=hopl, fmin=lowfreq,
                    n_bins=n_bins * bins_per_note, bins_per_octave=12 * bins_per_note, scale=False)
    cqt_spect = np.abs(cqt_spect)  # magnitude of the complex CQT coefficients

    for figsize, aspect, title, tick_step in views:
        plt.figure(figsize=figsize)
        # `plt.cm.spectral` was removed in matplotlib 2.2; `nipy_spectral` is its current name.
        plt.imshow(cqt_spect, aspect=aspect, origin='lower', cmap=plt.cm.nipy_spectral)
        plt.title(title)
        yt = np.arange(0, cqt_spect.shape[0], tick_step)
        # Convert bin index to midi pitch; the range starts at midi pitch 33.
        _ = plt.yticks(yt, (yt / bins_per_note + 33).astype('int'))



In [25]:
# Recompute the 3-bins-per-note CQT and zoom into the first second of audio.
cqt_spect = cqt(samples, sr=sr, hop_length=hopl, fmin=lowfreq, n_bins=n_bins*3, bins_per_octave=36, scale=False)
cqt_spect = np.abs(cqt_spect)  # magnitude of the complex CQT coefficients

print(cqt_spect.shape)

# Number of CQT frames per second of audio. Deriving the slice from it keeps the
# "first second" correct for any hop length (it is 86 frames at hopl=512, sr=44100).
frames_per_second = sr / hopl

plt.figure(figsize=(12,12))
plt.imshow(cqt_spect[:, :int(frames_per_second)], aspect=1/4, origin='lower', cmap=plt.cm.Greys)
plt.title('First second with three onsets')
yt = np.arange(0, cqt_spect.shape[0], 3)
_ = plt.yticks(yt, (yt/3+33).astype('int')) # range starts at midi pitch 33

# Mark the three onset times (in seconds), converted to frame positions.
for onset_s in (0.293, 0.617, 0.96):
    _ = plt.axvline(frames_per_second * onset_s, color='red')


(180, 1724)

half hopsize


In [79]:
hopl = 256  # half the hop size of the refined approach

# (bins per note, [(figsize, imshow aspect, title, y-tick step), ...])
# Each CQT is computed once and drawn in every view listed for it.
cqt_views = [
    (1, [((24, 6), 8, '1 bin per note, aspect=8', 5)]),
    (2, [((24, 6), 4, '2 bins per note, aspect=4 (scaled down  y axis to match the size of the 1 bin plot)', 10),
         ((24, 12), 8, '2 bins per note, aspect=8', 4)]),
    (3, [((24, 9), 4, '3 bins per note, aspect=4 (scaled down y axis)', 9),
         ((24, 18), 8, '3 bins per note, aspect=8', 3)]),
]

for bins_per_note, views in cqt_views:
    cqt_spect = cqt(samples, sr=sr, hop_length=hopl, fmin=lowfreq,
                    n_bins=n_bins * bins_per_note, bins_per_octave=12 * bins_per_note, scale=False)
    cqt_spect = np.abs(cqt_spect)  # magnitude of the complex CQT coefficients

    for figsize, aspect, title, tick_step in views:
        plt.figure(figsize=figsize)
        # `plt.cm.spectral` was removed in matplotlib 2.2; `nipy_spectral` is its current name.
        plt.imshow(cqt_spect, aspect=aspect, origin='lower', cmap=plt.cm.nipy_spectral)
        plt.title(title)
        yt = np.arange(0, cqt_spect.shape[0], tick_step)
        # Convert bin index to midi pitch; the range starts at midi pitch 33.
        _ = plt.yticks(yt, (yt / bins_per_note + 33).astype('int'))


double hopsize


In [80]:
hopl = 1024  # double the hop size of the refined approach

# (bins per note, [(figsize, imshow aspect, title, y-tick step), ...])
# Each CQT is computed once and drawn in every view listed for it.
cqt_views = [
    (1, [((24, 6), 8, '1 bin per note, aspect=8', 5)]),
    (2, [((24, 6), 4, '2 bins per note, aspect=4 (scaled down  y axis to match the size of the 1 bin plot)', 10),
         ((24, 12), 8, '2 bins per note, aspect=8', 4)]),
    (3, [((24, 9), 4, '3 bins per note, aspect=4 (scaled down y axis)', 9),
         ((24, 18), 8, '3 bins per note, aspect=8', 3)]),
]

for bins_per_note, views in cqt_views:
    cqt_spect = cqt(samples, sr=sr, hop_length=hopl, fmin=lowfreq,
                    n_bins=n_bins * bins_per_note, bins_per_octave=12 * bins_per_note, scale=False)
    cqt_spect = np.abs(cqt_spect)  # magnitude of the complex CQT coefficients

    for figsize, aspect, title, tick_step in views:
        plt.figure(figsize=figsize)
        # `plt.cm.spectral` was removed in matplotlib 2.2; `nipy_spectral` is its current name.
        plt.imshow(cqt_spect, aspect=aspect, origin='lower', cmap=plt.cm.nipy_spectral)
        plt.title(title)
        yt = np.arange(0, cqt_spect.shape[0], tick_step)
        # Convert bin index to midi pitch; the range starts at midi pitch 33.
        _ = plt.yticks(yt, (yt / bins_per_note + 33).astype('int'))


4 bins per note?


In [122]:
# 4 bins per note (48 per octave) at hop length 512, drawn at two sizes.
cqt_spect = cqt(samples, sr=sr, hop_length=512, fmin=lowfreq, n_bins=n_bins*4, scale=False, bins_per_octave=48)
cqt_spect = np.abs(cqt_spect)  # magnitude of the complex CQT coefficients

# (figsize, imshow aspect, title, y-tick step) for each view of the same CQT.
for figsize, aspect, title, tick_step in [
    ((24, 6), 2, '4 bins per note, aspect=2', 8),
    ((24, 24), 8, '4 bins per note, aspect=8', 4),
]:
    plt.figure(figsize=figsize)
    # `plt.cm.spectral` was removed in matplotlib 2.2; `nipy_spectral` is its current name.
    plt.imshow(cqt_spect, aspect=aspect, origin='lower', cmap=plt.cm.nipy_spectral)
    plt.title(title)
    yt = np.arange(0, cqt_spect.shape[0], tick_step)
    _ = plt.yticks(yt, (yt/4+33).astype('int')) # range starts at midi pitch 33


even 5 bins per note???


In [126]:
# 5 bins per note (60 per octave) at hop length 512, drawn at two sizes.
cqt_spect = cqt(samples, sr=sr, hop_length=512, fmin=lowfreq, n_bins=n_bins*5, scale=False, bins_per_octave=60)
cqt_spect = np.abs(cqt_spect)  # magnitude of the complex CQT coefficients

# (figsize, imshow aspect, title, y-tick step) for each view of the same CQT.
for figsize, aspect, title, tick_step in [
    ((24, 6), 2, '5 bins per note, aspect=2', 10),
    ((24, 24), 4, '5 bins per note, aspect=4', 5),
]:
    plt.figure(figsize=figsize)
    # `plt.cm.spectral` was removed in matplotlib 2.2; `nipy_spectral` is its current name.
    plt.imshow(cqt_spect, aspect=aspect, origin='lower', cmap=plt.cm.nipy_spectral)
    plt.title(title)
    yt = np.arange(0, cqt_spect.shape[0], tick_step)
    _ = plt.yticks(yt, (yt/5+33).astype('int')) # range starts at midi pitch 33



In [128]:
# 5 bins per note (60 per octave) with the doubled hop length of 1024.
cqt_spect = cqt(samples, sr=sr, hop_length=1024, fmin=lowfreq, n_bins=n_bins*5, scale=False, bins_per_octave=60)
cqt_spect = np.abs(cqt_spect)  # magnitude of the complex CQT coefficients

plt.figure(figsize=(24,24))
# `plt.cm.spectral` was removed in matplotlib 2.2; `nipy_spectral` is its current name.
plt.imshow(cqt_spect, aspect=2, origin='lower', cmap=plt.cm.nipy_spectral)
plt.title('5 bins per note, aspect=2, hopsize=1024')
yt = np.arange(0, cqt_spect.shape[0], 10)
_ = plt.yticks(yt, (yt/5+33).astype('int')) # range starts at midi pitch 33



In [123]:
# 9 bins per note (108 per octave) at hop length 512, drawn at two sizes.
cqt_spect = cqt(samples, sr=sr, hop_length=512, fmin=lowfreq, n_bins=n_bins*9, scale=False, bins_per_octave=108)
cqt_spect = np.abs(cqt_spect)  # magnitude of the complex CQT coefficients

# (figsize, imshow aspect, title, y-tick step) for each view of the same CQT.
for figsize, aspect, title, tick_step in [
    ((24, 6), 2, '9 bins per note, aspect=2', 18),
    ((24, 30), 8, '9 bins per note, aspect=8', 9),
]:
    plt.figure(figsize=figsize)
    # `plt.cm.spectral` was removed in matplotlib 2.2; `nipy_spectral` is its current name.
    plt.imshow(cqt_spect, aspect=aspect, origin='lower', cmap=plt.cm.nipy_spectral)
    plt.title(title)
    yt = np.arange(0, cqt_spect.shape[0], tick_step)
    _ = plt.yticks(yt, (yt/9+33).astype('int')) # range starts at midi pitch 33


Fazit

Mit zunehmenden bins per note nimmt die Auflösung in der Zeitdomäne eher ab. Um ebenfalls 3 verschiedene Spektrogramme mit möglichst vielen Informationen zu bieten, wählen wir folgende 3 Varianten:

  • 1 bin per note mit hop-size 256, für eine möglichst gute Auflösung in der Zeit
  • 3 bins per note mit hop-size 512, für einen möglichst guten Trade-off der beiden Parameter
  • 5 bins per note mit hop-size 1024, für eine möglichst gute Notenunterscheidung

In [ ]: