In [1]:
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import style
import librosa
import IPython.display
import librosa.display
import os
import random
from matplotlib.pyplot import specgram
In [2]:
# Seed Python's RNG so random.choice() picks the same demo clips on every run.
random.seed(42)
In [3]:
# Plot styling and inline figure rendering for the notebook.
style.use('ggplot')
%matplotlib inline
For our experiments, we use the Dataset for Environmental Sound Classification (ESC-50). "The ESC-50 dataset is a public labeled set of 2000 environmental recordings (50 classes, 40 clips per class, 5 seconds per clip) suitable for environmental sound classification tasks." You can download it from here: https://github.com/karoldvl/ESC-50
K. J. Piczak. ESC: Dataset for Environmental Sound Classification. In Proceedings of the 23rd ACM international conference on Multimedia, pp. 1015-1018, ACM, 2015.
In [4]:
# Root of the ESC-50 download and the hand-picked sample subset used below.
data_dir = '../data/esc-50'
sample_dir = os.path.join(data_dir, 'sample')
We have chosen a small sample of the classes for demo purposes. Specifically, we have chosen sounds that have been tagged with one of the following four classes: insects, rooster, coughing, laughing.
In [5]:
# List the sampled sound classes (one subdirectory of sample_dir per class).
# FIX: converted Python 2 print statements to print() calls so the cell runs
# on a Python 3 kernel.
print('Sound Sample Classes')
print('--------------------')
for d in os.listdir(sample_dir):
    print(d)
In [6]:
# Map each class name to the list of audio file paths in its directory.
samples_dict = dict()
for d in os.listdir(sample_dir):
    sample_class_dir = os.path.join(sample_dir, d)
    samples_dict[d] = [os.path.join(sample_class_dir, f) for f in os.listdir(sample_class_dir)]
# FIX: Python 2-only `print` statements and dict.iteritems() replaced with
# their Python 3 equivalents (print() and dict.items()).
print('Class \t Num Samples')
print('--------------------')
for k, v in samples_dict.items():
    print(k, '\t', len(v))
Load the sound samples with a sampling rate of 44.1kHz, which is 44100 samples per second. When we load a .wav or .ogg file, we get a time series of amplitude values.
In [7]:
def load_file(file_path, sr=44100):
    """Load an audio file and return its amplitude time series.

    Parameters
    ----------
    file_path : str
        Path to an audio file (e.g. .wav or .ogg).
    sr : int
        Target sampling rate in Hz (default 44100); librosa resamples to it.

    Returns
    -------
    Amplitude time series as returned by librosa.load (sampling rate is
    discarded).
    """
    amplitudes, _ = librosa.load(file_path, sr=sr)
    return amplitudes
In [8]:
def get_raw_sounds(file_paths, sr=44100):
    """Load every file in `file_paths` at rate `sr`; return the time series list."""
    sounds = []
    for path in file_paths:
        sounds.append(load_file(path, sr))
    return sounds
In [9]:
def plot_waves(classes, raw_sounds, sr=44100):
    """Plot one waveform subplot per (class name, sound) pair.

    Parameters
    ----------
    classes : list of str
        Display names, one per sound (zip truncates to the shorter list).
    raw_sounds : list of array-like
        Amplitude time series, e.g. from get_raw_sounds().
    sr : int
        Sampling rate of the series (default 44100 Hz). Previously this was
        hard-coded inside the body; it is now a parameter for consistency
        with plot_spectrograms, with the same default, so callers are
        unaffected.
    """
    fig = plt.figure(figsize=(12, 6))
    for idx, (name, sound) in enumerate(zip(classes, raw_sounds), start=1):
        plt.subplot(len(classes), 1, idx)
        # NOTE(review): waveplot was removed in librosa 0.10 (waveshow is the
        # replacement) -- this call assumes an older librosa.
        librosa.display.waveplot(np.array(sound), sr=sr)
        plt.title(name.title())
    plt.suptitle('Waveplot for Raw Audio', fontsize=18)
    plt.show()
In [10]:
def plot_spectrograms(classes, raw_sounds, sr=44100):
    """Plot a matplotlib spectrogram per (class name, sound) pair.

    Parameters
    ----------
    classes : list of str
        Display names, one per sound.
    raw_sounds : list of array-like
        Amplitude time series.
    sr : int
        Sampling rate of the series in Hz (default 44100).
    """
    fig = plt.figure(figsize=(12, 6))
    for idx, (name, sound) in enumerate(zip(classes, raw_sounds), start=1):
        plt.subplot(len(classes), 1, idx)
        # BUG FIX: Fs was hard-coded to 22050, silently ignoring the `sr`
        # parameter; with 44.1 kHz audio that halved every frequency shown
        # on the y-axis.
        specgram(np.array(sound), Fs=sr)
        plt.title(name.title())
    plt.suptitle('Spectrogram', fontsize=18)
    plt.show()
In [11]:
def plot_log_power_spectrograms(classes, raw_sounds, sr=44100):
    """Plot a log-power STFT spectrogram per (class name, sound) pair.

    Parameters
    ----------
    classes : list of str
        Display names, one per sound.
    raw_sounds : list of array-like
        Amplitude time series.
    sr : int
        Sampling rate of the series in Hz (default 44100, matching the other
        plotters). FIX: previously specshow received no sr and assumed its
        22050 Hz default, so the time axis was wrong for 44.1 kHz audio.
    """
    fig = plt.figure(figsize=(12, 6))
    for idx, (name, sound) in enumerate(zip(classes, raw_sounds), start=1):
        plt.subplot(len(classes), 1, idx)
        # NOTE(review): logamplitude was removed in librosa 0.6 (power_to_db
        # with ref=np.max is the replacement) -- assumes an older librosa.
        D = librosa.logamplitude(np.abs(librosa.stft(sound))**2, ref_power=np.max)
        librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log')
        plt.title(name.title())
    plt.suptitle('Log power spectrogram', fontsize=18)
    plt.show()
In [12]:
# Pick one random clip per class and preview it with an inline audio player.
# NOTE(review): the four preview/waveplot cell pairs below are copy-pasted;
# a small helper function taking (class_name, title) would remove the
# duplication.
sample_insect = random.choice(samples_dict.get('insects'))
IPython.display.Audio(sample_insect, embed=True, autoplay=True)
Out[12]:
In [13]:
# librosa.load defaults to sr=22050 here, although the narrative above says
# 44.1 kHz -- presumably fine for a quick visual check; verify if it matters.
ts, sr = librosa.load(sample_insect)
librosa.display.waveplot(ts, sr=sr)
plt.suptitle('Wave Plot of an Insect sound', fontsize=18)
Out[13]:
In [14]:
sample_rooster = random.choice(samples_dict.get('rooster'))
IPython.display.Audio(sample_rooster, embed=True, autoplay=True)
Out[14]:
In [15]:
ts, sr = librosa.load(sample_rooster)
librosa.display.waveplot(ts, sr=sr)
plt.suptitle('Wave Plot of a Rooster sound', fontsize=18)
Out[15]:
In [16]:
sample_coughing = random.choice(samples_dict.get('coughing'))
IPython.display.Audio(sample_coughing, embed=True, autoplay=True)
Out[16]:
In [17]:
ts, sr = librosa.load(sample_coughing)
librosa.display.waveplot(ts, sr=sr)
plt.suptitle('Wave Plot of a Coughing sound', fontsize=18)
Out[17]:
In [18]:
sample_laughing = random.choice(samples_dict.get('laughing'))
IPython.display.Audio(sample_laughing, embed=True, autoplay=True)
Out[18]:
In [19]:
# NOTE(review): `ts` and `sr` are overwritten by each of these cells; later
# cells must not rely on them without reloading (see the MFCC cells below).
ts, sr = librosa.load(sample_laughing)
librosa.display.waveplot(ts, sr=sr)
plt.suptitle('Wave Plot of a Laughing sound', fontsize=18)
Out[19]:
In [20]:
# Load one clip per class at 44.1 kHz and plot the four waveforms together.
sound_signals = get_raw_sounds([sample_insect, sample_rooster, sample_coughing, sample_laughing])
sound_names = ['insect', 'rooster', 'cough', 'laugh']
plot_waves(sound_names, sound_signals)
A spectrogram measures the signal strength — how loud a signal is — at various frequencies over a period of time. In essence, it transforms a raw audio signal into a representation that shows a contour, or "heatmap", of energy levels over time.
The plot has time on the x-axis and frequency on the y-axis; the signal's energy at each time–frequency point is encoded as color intensity, forming a contour.
You can see the spectrograms for the above sample sounds.
In [21]:
# Matplotlib spectrograms of the same four sounds.
plot_spectrograms(sound_names, sound_signals)
In [22]:
# Log-power (STFT) spectrograms of the same four sounds.
plot_log_power_spectrograms(sound_names, sound_signals)
You can also notice some similarities between sounds of the same class, and differences across sound classes.
In [23]:
# Plot all insect clips followed by all rooster clips for comparison.
# NOTE(review): the hard-coded label list assumes exactly two clips per class
# in the sample directory; if a class has more, zip() truncates AND the
# 'rooster' labels end up on insect clips -- verify the sample counts.
sound_signals = get_raw_sounds(samples_dict.get('insects') + samples_dict.get('rooster'))
sound_names = ['insect', 'insect', 'rooster', 'rooster']
plot_waves(sound_names, sound_signals)
If we can model how sound is generated (by humans and other sources) and how we hear sound, we’ll get good features. For instance, speech generated by humans is filtered and modified by the shape of the vocal tract. If we can model this shape well, we can accurately represent phonemes. If we can model this for short durations over time, we can represent sequences of phonemes that are being produced.
In [58]:
# Reload the insect clip at the full 44.1 kHz rate and inspect the raw signal.
# FIX: Python 2 print statements converted to print() calls.
ts, sr = librosa.load(sample_insect, sr=44100)
print('Sampling Rate:', sr)
print('Number of Amplitude Samples:', len(ts))
# True division: the duration may be fractional (Python 2's `/` truncated it).
print('Duration of file:', len(ts) / sr, 'seconds')
print('First 100 samples:\n', ts[:100])
In [132]:
# Split the ~5-second clip (5000 ms) into 20 ms frames.
# FIX: Python 2 integer `/` replaced with `//` (on Python 3 the old code
# produced float slice bounds and raised TypeError), and xrange -> range.
frame_len = 20                  # frame length in milliseconds
num_frames = 5000 // frame_len  # 5000 ms of audio -> 250 frames
n = len(ts) // num_frames       # samples per frame
# NOTE: the final slice may be shorter than n if len(ts) % n != 0.
insect_frames = [ts[i : i + n] for i in range(0, len(ts), n)]
In [139]:
# Plot the first 20 ms frame on its own.
# NOTE(review): no sr is passed, so waveplot assumes its 22050 Hz default even
# though the frames came from a 44.1 kHz load -- the time axis is off by 2x;
# verify / pass sr=44100.
librosa.display.waveplot(insect_frames[0])
Out[139]:
In [134]:
# Overlay the first five frames on a single set of axes.
for i in range(5):
    librosa.display.waveplot(insect_frames[i])
Similarly, the cochlea, which is an organ in the ear, vibrates at different locations depending on the frequency of the incoming signal. Based on the locations of the vibrations, the brain gets information about the frequencies present in the signal.
In MFCC, we do the following:
In [24]:
# Mel power spectrogram of the insect clip.
# BUG FIX: sr was 41000 -- almost certainly a typo for the 44100 Hz (44.1 kHz)
# used everywhere else in this notebook.
ts_insect, sr_insect = librosa.load(sample_insect, sr=44100)
# Create a Mel Spectrogram
spectrogram_insect = librosa.feature.melspectrogram(ts_insect, sr=sr_insect, n_mels=128)
# Convert to log scale (dB). We'll use the peak power as reference.
# NOTE(review): logamplitude was removed in librosa 0.6 (power_to_db with
# ref=np.max is the replacement) -- assumes an older librosa.
log_spectrogram_insect = librosa.logamplitude(spectrogram_insect, ref_power=np.max)
librosa.display.specshow(log_spectrogram_insect, sr=sr_insect, x_axis='time', y_axis='mel')
# Put a descriptive title on the plot
plt.title('Mel Power Spectrogram (Insect)')
# draw a color bar
plt.colorbar(format='%+02.0f dB')
# Make the figure layout compact
plt.tight_layout()
In [25]:
# Mel power spectrogram of the rooster clip.
# CONSISTENCY FIX: load at 44100 Hz like the insect cell above (previously the
# librosa default of 22050 Hz), so the two spectrograms are directly
# comparable.
ts_rooster, sr_rooster = librosa.load(sample_rooster, sr=44100)
# Create a Mel Spectrogram
spectrogram_rooster = librosa.feature.melspectrogram(ts_rooster, sr=sr_rooster, n_mels=128)
# Convert to log scale (dB). We'll use the peak power as reference.
# NOTE(review): logamplitude was removed in librosa 0.6 (power_to_db with
# ref=np.max is the replacement) -- assumes an older librosa.
log_spectrogram_rooster = librosa.logamplitude(spectrogram_rooster, ref_power=np.max)
librosa.display.specshow(log_spectrogram_rooster, sr=sr_rooster, x_axis='time', y_axis='mel')
# Put a descriptive title on the plot
plt.title('Mel Power Spectrogram (Rooster)')
# draw a color bar
plt.colorbar(format='%+02.0f dB')
# Make the figure layout compact
plt.tight_layout()
In [26]:
# Extract MFCC features (13 coefficients) from the insect log-mel spectrogram.
mfcc = librosa.feature.mfcc(S=log_spectrogram_insect, n_mfcc=13)
# First and second time derivatives (deltas) of the MFCCs
delta_mfcc = librosa.feature.delta(mfcc)
delta2_mfcc = librosa.feature.delta(mfcc, order=2)
# Show MFCCs and both deltas, each in its own subplot
plt.figure(figsize=(12, 6))
plt.subplot(3, 1, 1)
librosa.display.specshow(mfcc)
plt.ylabel('MFCC')
plt.colorbar()
plt.subplot(3, 1, 2)
librosa.display.specshow(delta_mfcc)
plt.ylabel(r'MFCC-$\Delta$')
plt.colorbar()
plt.subplot(3, 1, 3)
# BUG FIX: this used the leaked global `sr` from an earlier, unrelated load
# cell; the insect features were computed at sr_insect, so use that for the
# time axis.
librosa.display.specshow(delta2_mfcc, sr=sr_insect, x_axis='time')
plt.ylabel(r'MFCC-$\Delta^2$')
plt.colorbar()
plt.tight_layout()
In [27]:
# Extract MFCC features (13 coefficients) from the rooster log-mel spectrogram.
mfcc = librosa.feature.mfcc(S=log_spectrogram_rooster, n_mfcc=13)
# First and second time derivatives (deltas) of the MFCCs
delta_mfcc = librosa.feature.delta(mfcc)
delta2_mfcc = librosa.feature.delta(mfcc, order=2)
# Show MFCCs and both deltas, each in its own subplot
plt.figure(figsize=(12, 6))
plt.subplot(3, 1, 1)
librosa.display.specshow(mfcc)
plt.ylabel('MFCC')
plt.colorbar()
plt.subplot(3, 1, 2)
librosa.display.specshow(delta_mfcc)
plt.ylabel(r'MFCC-$\Delta$')
plt.colorbar()
plt.subplot(3, 1, 3)
# BUG FIX: this used the leaked global `sr` from an earlier, unrelated load
# cell; the rooster features were computed at sr_rooster, so use that for the
# time axis.
librosa.display.specshow(delta2_mfcc, sr=sr_rooster, x_axis='time')
plt.ylabel(r'MFCC-$\Delta^2$')
plt.colorbar()
plt.tight_layout()
In [ ]: