Front-end / Features

Dataset description

Free Spoken Digit Dataset (FSDD)

  • 500 recordings in total (50 per digit)
  • 8 kHz sampling rate
  • 1 speaker
  • English
  • File format -> {digit_label}_{speaker_name}_{index}.wav
    e.g. "4_jackson_16.wav"

Details: https://github.com/Jakobovski/free-spoken-digit-dataset
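
The digit label, speaker, and index can be recovered directly from this naming scheme. A minimal sketch of that mapping (the helper parse_fsdd_filename below is hypothetical, added only for illustration):


In [ ]:
# Hypothetical helper: split "{digit_label}_{speaker_name}_{index}.wav" into its parts
import os

def parse_fsdd_filename(path):
    digit, speaker, index = os.path.splitext(os.path.basename(path))[0].split('_')
    return int(digit), speaker, int(index)

print(parse_fsdd_filename("4_jackson_16.wav"))  # -> (4, 'jackson', 16)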


In [ ]:
import glob
import os
import librosa, librosa.display
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import specgram

%matplotlib inline

In [ ]:
# Matplotlib styling
plt.style.use('ggplot')

plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = 'Ubuntu'
plt.rcParams['font.monospace'] = 'Ubuntu Mono'

Extract sound filenames


In [ ]:
# Dataset folder 
SOUND_FOLDER = "../data/digit-dataset/"

# Create a list of all sound file paths
sound_paths = [SOUND_FOLDER + f for f in os.listdir(SOUND_FOLDER) if f[-4:] == '.wav']

Data visualization


In [ ]:
##################################################
#           Auxiliary functions                  #
##################################################

def load_sound_files(file_paths):
    # Load at the dataset's native 8 kHz rate (librosa would otherwise resample to 22.05 kHz)
    return [librosa.load(fp, sr=8000)[0] for fp in file_paths]

def plot_wave(sound_name_with_raw_data):
    i = 1
    plt.figure(figsize=(15, 2 * len(sound_name_with_raw_data) if len(sound_name_with_raw_data) > 1 else 4))
    for n, d in sound_name_with_raw_data:
        # subplot expects integer grid dimensions
        plt.subplot(int(np.ceil(len(sound_name_with_raw_data) / 2)), 2, i)
        
        # Wave plot (waveshow replaces the deprecated waveplot in recent librosa)
        librosa.display.waveshow(np.array(d), sr=8000)
        
        plt.ylabel('Amplitude')
        plt.title(n)
        i += 1
    plt.subplots_adjust(top=0.8, bottom=0.08, left=0.10, right=0.95, hspace=0.5, wspace=0.35)
    plt.show()
    
def plot_spectrogram(sound_name_with_raw_data):
    i = 1
    plt.figure(figsize=(15, 2 * len(sound_name_with_raw_data) if len(sound_name_with_raw_data) > 1 else 4))
    for n,d in sound_name_with_raw_data:
        plt.subplot(int(np.ceil(len(sound_name_with_raw_data) / 2)), 2, i)
        
        # Spectrogram
        specgram(np.array(d), Fs=8000, NFFT=512, noverlap=248, scale="dB", vmax=20)
        
        plt.title(n)
        i += 1
    plt.subplots_adjust(top=0.8, bottom=0.08, left=0.10, right=0.95, hspace=0.5, wspace=0.35)
    plt.show()

def plot_log_power_spectrogram(sound_name_with_raw_data):
    i = 1
    plt.figure(figsize=(15, 2 * len(sound_name_with_raw_data) if len(sound_name_with_raw_data) > 1 else 4))
    for n, d in sound_name_with_raw_data:
        plt.subplot(int(np.ceil(len(sound_name_with_raw_data) / 2)), 2, i)
        
        # Log power spectrogram (power_to_db replaces the deprecated logamplitude)
        D = librosa.power_to_db(np.abs(librosa.stft(d))**2, ref=np.max)
        librosa.display.specshow(D, sr=8000, x_axis='time', y_axis='log')
        
        plt.title(n)
        i += 1
    plt.subplots_adjust(top=0.8, bottom=0.08, left=0.10, right=0.95, hspace=0.5, wspace=0.35)
    plt.show()

Load sound files used in visualization


In [ ]:
# Target sound filenames for visualization: the first recording of each digit by speaker "jackson"
sound_filenames = [str(i) + '_jackson_0.wav' for i in range(10)]

# Load the selected files at the dataset's native 8 kHz sampling rate
sound_name_with_raw_data = [("Digit " + os.path.basename(p)[0], librosa.load(p, sr=8000)[0])
                            for p in sound_paths if os.path.basename(p) in sound_filenames]

Plots

Fig. I. Speech Pressure Waveform


In [ ]:
plot_wave(sound_name_with_raw_data)

Fig. II. Spectrogram


In [ ]:
plot_spectrogram(sound_name_with_raw_data)

Fig. III. Log Power Spectrogram


In [ ]:
plot_log_power_spectrogram(sound_name_with_raw_data)

Feature extraction


In [ ]:
##################################################
#           Auxiliary functions                  #
##################################################

def extract_feature(file_name):
    # librosa.load resamples to 22.05 kHz by default; sample_rate is passed on consistently below
    X, sample_rate = librosa.load(file_name)
    stft = np.abs(librosa.stft(X))
    # Per-frame features averaged over time -> fixed-length vectors
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)          # 40 dims
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)          # 12 dims
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)             # 128 dims
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)  # 7 dims
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0)  # 6 dims
    return mfccs, chroma, mel, contrast, tonnetz

def get_features_and_labels(sound_paths):
    # 40 MFCC + 12 chroma + 128 mel + 7 contrast + 6 tonnetz = 193 features per recording
    features, labels = np.empty((0, 193)), np.empty(0)
    for p in sound_paths:
        mfccs, chroma, mel, contrast, tonnetz = extract_feature(p)
        ext_features = np.hstack([mfccs, chroma, mel, contrast, tonnetz])
        features = np.vstack([features, ext_features])
        # The digit label is the first character of the filename, e.g. "4_jackson_16.wav" -> 4
        labels = np.append(labels, int(os.path.basename(p)[0]))
    return np.array(features), np.array(labels, dtype=int)

def one_hot_encode(labels):
    n_labels = len(labels)
    n_unique_labels = len(np.unique(labels))
    one_hot_encode = np.zeros((n_labels,n_unique_labels))
    one_hot_encode[np.arange(n_labels), labels] = 1
    return one_hot_encode

Extract feature vectors and prepare labels


In [ ]:
features, labels = get_features_and_labels(sound_paths)
labels = one_hot_encode(labels)
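
A quick sanity check on the resulting arrays. Given the 193-dimensional feature vector and the 10 digit classes, the expected shapes are (n_recordings, 193) and (n_recordings, 10); the cell below is purely illustrative.


In [ ]:
# Expected: features -> (n_recordings, 193), labels (one-hot) -> (n_recordings, 10)
print(features.shape, labels.shape)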

Save on disk


In [ ]:
FEATURE_PATH = 'front-end/features.npy'
LABEL_PATH = 'front-end/labels.npy'

In [ ]:
# Make sure the output folder exists before saving
os.makedirs(os.path.dirname(FEATURE_PATH), exist_ok=True)

np.save(FEATURE_PATH, features)
np.save(LABEL_PATH, labels)
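
The saved arrays can later be reloaded with np.load; a minimal sketch, assuming the same paths as above:


In [ ]:
# Reload the feature matrix and one-hot labels saved above
features = np.load(FEATURE_PATH)
labels = np.load(LABEL_PATH)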