This is a notebook for a speech Siamese network. The goal is to add a Siamese network on top of the speech command network to build a one-shot speech command model. The model takes two pieces of audio as input and tells whether they are the same speech command or not. If the accuracy is good enough, we may take it into products for voice trigger or voice command, which are useful for all kinds of products.
The trick is whether the Siamese network can make one-shot recognition accurate enough.
In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import hashlib
import math
import os.path
import random
import re
import sys
import tarfile
import numpy as np
import librosa as rosa
from six.moves import urllib
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPooling2D, Dropout, Flatten, Lambda
#from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio
#from tensorflow.python.ops import io_ops
#from tensorflow.python.platform import gfile
#from tensorflow.python.util import compat
number_mfcc=128
sample_rate=16000
#for mac
#speech_data_dir="/Users/hermitwang/Downloads/speech_dataset"
#for ubuntu
speech_data_dir="/home/hermitwang/TrainingData/datasets/speech_dataset"
Here is another implementation of one-shot learning for keyword triggering with librosa MFCC. librosa cannot be put into the TensorFlow graph, so the MFCC computation has to happen before the conv network; that means load_wav_mfcc has to convert every wav file to an MFCC vector. Here is what I have to work out:
1. What a good MFCC vector dimension is; (20, 127) may not be the right input for a conv network.
2. Even though the MFCC output of librosa is not the same as TensorFlow's contrib decode_wav pipeline, it is enough as long as it carries the audio features. Feeding the librosa MFCC output into the conv net should still learn a good feature abstraction.
3. The conv net may not be that difficult: conv2d -> maxpooling -> conv2d -> flatten -> dense with softmax.
4. Build the training network with librosa and the conv net.
5. Take the dense vector output as the feature extractor.
6. Build the Siamese network with the feature extractor.
7. Maybe add a couple of dense layers to learn the feature mapping and comparison in the Siamese network.
8. If that works, we get one-shot learning for keyword triggering.
9. In reality, we still have to work out how to split the audio stream into clips to feed to the librosa MFCC (see the sketch after this list).
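For point 9, a minimal sketch of one way to do the splitting, assuming fixed-length, overlapping windows are good enough for a first pass. stream_to_clips, clip_seconds, and hop_seconds are illustrative names and values, not part of the notebook.

import librosa as rosa

def stream_to_clips(filename, clip_seconds=1.0, hop_seconds=0.5, sr=16000):
    # cut a long recording into fixed-length, overlapping clips that can be
    # fed to load_wav_mfcc one by one
    audio, _ = rosa.load(filename, sr=sr)
    clip_samples = int(clip_seconds * sr)
    hop_samples = int(hop_seconds * sr)
    clips = []
    for start in range(0, max(1, len(audio) - clip_samples + 1), hop_samples):
        clips.append(audio[start:start + clip_samples])
    return clips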
Extract MFCC from a wav file: what are the wav parameters for the MFCC output?
TensorFlow speech command parameters: {'desired_samples': 16000, 'window_size_samples': 480, 'window_stride_samples': 160, 'spectrogram_length': 98, 'fingerprint_width': 40, 'fingerprint_size': 3920, 'label_count': 12, 'sample_rate': 16000, 'preprocess': 'mfcc', 'average_window_width': -1}
Mel-frequency cepstral coefficients (MFCCs)
Parameters:
y:np.ndarray [shape=(n,)] or None
audio time series
sr:number > 0 [scalar]
sampling rate of y
S:np.ndarray [shape=(d, t)] or None
log-power Mel spectrogram
n_mfcc: int > 0 [scalar]
number of MFCCs to return
Returns:
M:np.ndarray [shape=(n_mfcc, t)]
MFCC sequence
The MFCC output still needs more study.
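As a quick check of the dimension question above, assuming librosa's defaults (hop_length=512, centered frames): a 1-second clip at 16 kHz yields 32 frames, so n_mfcc=128 gives a (128, 32) matrix, which matches the shapes used by the loader below.

import numpy as np
import librosa as rosa

dummy = np.random.randn(16000).astype(np.float32)   # 1 second of noise at 16 kHz
mfcc = rosa.feature.mfcc(y=dummy, sr=16000, n_mfcc=128)
print(mfcc.shape)                                    # expected: (128, 32)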
In [10]:
def load_wav_mfcc(filename):
    # load the wav at a fixed 16 kHz sample rate and compute 128 MFCCs
    wav_loader, sample_rate = rosa.load(filename, sr=16000)
    #print(rosa.get_duration(wav_loader, sample_rate))
    wav_mfcc = rosa.feature.mfcc(y=wav_loader, sr=16000, n_mfcc=128)
    return wav_mfcc
Wav file loader that exports MFCC sequences.
0. Go through all wav files and mix background noise into the command wav files (see the sketch after this list).
1. Go through all wav files and convert them to MFCC sequences.
2. Construct pairs of MFCC sequences with a target (0 or 1: 0 for different commands, 1 for the same command). For the same word, generate 1000 pairs: randomly pick a key index, the first wav index, and the second wav index. For different words, generate 1000 pairs: randomly pick two key indices, the first wav index, and the second wav index. The format is [mfcc 1, mfcc 2, 0/1 for different or same].
3. Prepare pairs of MFCCs and targets according to the batch size.
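Step 0 (mixing background noise into the command clips) is not implemented in the loader below; here is a minimal sketch of one way to do it, assuming the background wav (for example one of the _background_noise_ recordings in speech_dataset) is longer than the command clip. mix_background and noise_volume are hypothetical names, not part of the notebook.

import numpy as np
import librosa as rosa

def mix_background(command_path, background_path, noise_volume=0.1, sr=16000):
    command, _ = rosa.load(command_path, sr=sr)
    background, _ = rosa.load(background_path, sr=sr)
    # pick a random slice of background noise as long as the command clip
    start = np.random.randint(0, len(background) - len(command))
    noise = background[start:start + len(command)]
    mixed = command + noise_volume * noise
    return np.clip(mixed, -1.0, 1.0)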
In [12]:
class WavMFCCLoader(object):
    def __init__(self, data_dir, wanted, validation_percentage=0, testing_percentage=0):
        self.data_dir = data_dir
        self.wanted = wanted
        self.wav_files = dict()
        self.wav_file_index()

    def wav_file_index(self):
        # index every wav file under data_dir by its parent directory (the word label)
        for dirpath, dirnames, files in os.walk(self.data_dir):
            for name in files:
                if name.lower().endswith('.wav'):
                    word_name = os.path.basename(dirpath)
                    if word_name in self.wanted:
                        file_name = os.path.join(dirpath, name)
                        #print(file_name, dirpath, word_name)
                        if word_name in self.wav_files.keys():
                            self.wav_files[word_name].append(file_name)
                        else:
                            self.wav_files[word_name] = [file_name]
        return self.wav_files

    def wavs_to_mfcc_pair(self):
        # pick two words at random; label the pair 0 if they differ, 1 if they match
        how_many_words = len(self.wanted)
        a_index = random.randint(0, how_many_words - 1)
        b_index = random.randint(0, how_many_words - 1)
        if a_index != b_index:
            a_wav_index = random.randint(0, len(self.wav_files[self.wanted[a_index]]) - 1)
            b_wav_index = random.randint(0, len(self.wav_files[self.wanted[b_index]]) - 1)
            mfcc_1 = load_wav_mfcc(self.wav_files[self.wanted[a_index]][a_wav_index])
            mfcc_2 = load_wav_mfcc(self.wav_files[self.wanted[b_index]][b_wav_index])
            mfcc_pair = 0
        else:
            a_wav_index = random.randint(0, len(self.wav_files[self.wanted[a_index]]) - 1)
            b_wav_index = random.randint(0, len(self.wav_files[self.wanted[a_index]]) - 1)
            mfcc_1 = load_wav_mfcc(self.wav_files[self.wanted[a_index]][a_wav_index])
            mfcc_2 = load_wav_mfcc(self.wav_files[self.wanted[a_index]][b_wav_index])
            mfcc_pair = 1
        #print("aaa", mfcc_1.shape, mfcc_2.shape)
        return mfcc_1, mfcc_2, mfcc_pair

    def get_mfcc_pairs(self, how_many):
        # pad every MFCC matrix into a fixed (128, 32) slot so pairs can be batched
        mfcc1_data = np.zeros((how_many, 128, 32))
        mfcc2_data = np.zeros((how_many, 128, 32))
        same_data = np.zeros(how_many)
        for i in range(how_many):
            mfcc1_data_, mfcc2_data_, same_data[i] = self.wavs_to_mfcc_pair()
            mfcc1_data[i, :, 0:mfcc1_data_.shape[1]] = mfcc1_data_
            mfcc2_data[i, :, 0:mfcc2_data_.shape[1]] = mfcc2_data_
        return mfcc1_data, mfcc2_data, same_data
loader = WavMFCCLoader(speech_data_dir, wanted=["one", "two", "bed"])
#wav_list = loader.wav_file_index()
mfcc1_data, mfcc2_data, same_pair = loader.get_mfcc_pairs(100)
print(same_pair)
Create a Keras conv network that takes the MFCC vector as input.
For reference, the TensorFlow speech command MFCC input shape is (?, 98, 40, 1) and its first filter shape is (20, 8, 1, 64).
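For comparison only, a hedged sketch of what that reference first layer (filter shape (20, 8, 1, 64) on a (98, 40, 1) fingerprint) would look like in Keras; create_keras_model below instead uses generic 3x3 kernels on the (128, 32, 1) librosa MFCC input.

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D

reference_first_layer = Sequential([
    # 64 filters of height 20 (time frames) and width 8 (MFCC coefficients)
    Conv2D(filters=64, kernel_size=(20, 8), activation="relu",
           padding="same", input_shape=(98, 40, 1)),
])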
In [33]:
def create_keras_model(fingerprint_shape, is_training=True):
    # conv2d -> maxpooling -> conv2d -> maxpooling -> flatten -> dense(1024)
    model = Sequential()
    model.add(Conv2D(input_shape=fingerprint_shape, filters=64, kernel_size=3, activation="relu"))
    model.add(MaxPooling2D())
    #if is_training:
    #    model.add(Dropout(0.5))
    model.add(Conv2D(filters=64, kernel_size=3, activation="relu"))
    model.add(MaxPooling2D())
    #if is_training:
    #    model.add(Dropout(0.5))
    model.add(Flatten())
    model.add(Dense(1024))
    if is_training:
        model.add(Dropout(0.5))
    #model.add(Dense(labels_count, activation="softmax"))
    return model

def model_train(labels_count=10, epochs=10, batch_size=32):
    # train the conv net alone on random data to check that the shapes and
    # the softmax head are wired correctly
    x_train = np.random.random((1000, 98, 40, 1))
    y_train = keras.utils.to_categorical(np.random.randint(labels_count, size=(1000, 1)), num_classes=labels_count)
    x_test = np.random.random((100, 98, 40, 1))
    y_test = keras.utils.to_categorical(np.random.randint(labels_count, size=(100, 1)), num_classes=labels_count)

    keras_model = create_keras_model((98, 40, 1), True)
    keras_model.add(Dense(labels_count, activation='softmax'))
    keras_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    keras_model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size)
    score = keras_model.evaluate(x_test, y_test, batch_size)
    return score
In [38]:
def create_siamese_model(input_shape, siamese_model='concat'):
    right_input = Input(input_shape)
    left_input = Input(input_shape)

    # both branches go through the same encoder instance, so they share weights
    keras_model = create_keras_model(input_shape)
    right_encoder = keras_model(right_input)
    left_encoder = keras_model(left_input)

    if siamese_model == 'concat':
        merged = keras.layers.concatenate([right_encoder, left_encoder])
    elif siamese_model == 'abs':
        # element-wise absolute difference between the two encodings
        merged = Lambda(lambda x: tf.abs(x[0] - x[1]))([right_encoder, left_encoder])
    else:
        raise ValueError("unknown siamese_model")

    output_layer = Dense(1, activation='sigmoid')(merged)
    return Model([right_input, left_input], output_layer)

def siamese_train(siamese_mode='concat'):
    model = create_siamese_model((128, 32, 1), siamese_model=siamese_mode)
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

    loader = WavMFCCLoader(speech_data_dir, wanted=["one", "two", "bed"])
    mfcc1_data, mfcc2_data, pairs = loader.get_mfcc_pairs(1000)
    x1_train = mfcc1_data.reshape((1000, 128, 32, 1))
    x2_train = mfcc2_data.reshape((1000, 128, 32, 1))
    y_train = pairs
    model.fit([x1_train, x2_train], y_train, epochs=10, batch_size=32)

    mfcc1_test, mfcc2_test, pairs_test = loader.get_mfcc_pairs(100)
    x1_test = mfcc1_test.reshape((100, 128, 32, 1))
    x2_test = mfcc2_test.reshape((100, 128, 32, 1))
    y_test = pairs_test
    loss, accuracy = model.evaluate([x1_test, x2_test], y_test)
    print(loss)
    return accuracy
Siamese Network
In [39]:
#wav_mfcc = load_wav_mfcc("/Users/hermitwang/Downloads/speech_dataset/backward/0a2b400e_nohash_0.wav")
#print(wav_mfcc.shape)
score = siamese_train()
print(score)
In [ ]: