In [ ]:
# Mapping between Sound & Word labels
# using LSTM
# TensorFlow 1.0.1 w/ Python 3.5
# 2017-04-08
#
# Input: Sound frames (26 dimensional MFCC+delta input); e.g. 'ta', 'da' utterances
# Output: Phone labels (4 categories: t, d, a, sil)
#
# **Prerequisite**
# - Install 'python_speech_features' package
#   from https://github.com/jameslyons/python_speech_features
# - Install 'textgrid' package
#   from https://github.com/kylebgorman/textgrid

from pylab import*
from scipy.io import wavfile
from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import logfbank
import textgrid
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

In [ ]:
# Read sound
# Load the recording, plot its waveform, and extract per-frame acoustic
# features: MFCCs plus their first-order deltas, concatenated per frame.
n_examples = 1  # only one recording is loaded, so there is a single example
srate, sig = wavfile.read('da_ta_long.wav')
print('srate (Hz):', srate)
print('duration (sec):', len(sig)/srate)
plt.plot(np.arange(len(sig))/srate, sig)
plt.title('da_ta_long.wav')
plt.xlabel('Time (sec)')
plt.ylabel('Amplitude')
plt.show()

winlen = 0.025   # analysis window length (sec)
winstep = 0.01   # step between successive windows (sec)
numcep = 13      # number of cepstral coefficients per frame
mfcc_raw = mfcc(sig, srate, winlen, winstep, numcep,
                appendEnergy = True) # 13-d MFCC
mfcc_deriv1 = delta(mfcc_raw, N = 2) # 1st deriv
# Stack static and delta features side by side: (frames) x (2 * numcep).
mfccs = np.concatenate((mfcc_raw, mfcc_deriv1), axis=1).astype(np.float32)
# mfccs = mfcc_raw
# Rotate so that frames run along the x axis and coefficients along the y axis.
plt.imshow(np.rot90(mfccs, axes=(0,1)), aspect='auto')
plt.title('MFCC values ({} dimension)'.format(mfccs.shape[1]))
# The x axis counts analysis frames (one every `winstep` seconds), not
# milliseconds, so label it accordingly.
plt.xlabel('Frame index ({:g} msec step)'.format(winstep * 1000))
plt.ylabel('Coefficients')
plt.show()

print('Input dimension:',mfccs.shape)

In [ ]:
# Read textgrid
# Give every MFCC frame a label by looking up the frame's time mark in the
# 'Phone' tier of the TextGrid annotation, then one-hot encode the labels.
T = textgrid.TextGrid()
T.read('da_ta_long.TextGrid')
w_tier = T.getFirst('Phone').intervals
# One time mark per frame: half a window plus one `winstep` per frame index
# (presumably the window centre; assumes frame i starts at i * winstep).
time_mark = winlen/2 + winstep*np.arange(0, mfccs.shape[0])
time_mark = time_mark.astype('float32')

words_raw = []
for t in time_mark:
    for interval in w_tier:
        start, end = interval.bounds()
        if start < t <= end:
            words_raw.append(interval.mark)
            break  # a frame gets at most one label

# A frame that falls outside every interval would be skipped above, which
# would shift all later labels against their frames. Fail early and clearly.
if len(words_raw) != mfccs.shape[0]:
    raise ValueError(
        'Only {} of {} frames fall inside an interval of the Phone tier; '
        'the annotation must cover the whole recording.'.format(
            len(words_raw), mfccs.shape[0]))

# Sort the unique labels so the label -> index mapping is identical on every
# run (plain set iteration order changes with string hash randomisation).
words_list = sorted(set(words_raw)) # unique word list
words_idx = {w: i for i, w in enumerate(words_list)}
words_data = [words_idx[w] for w in words_raw]
# One-hot encode with NumPy: row i of the identity matrix is the indicator
# vector of class i, so indexing by label yields (frames) x (classes).
words_label = np.eye(len(words_list), dtype=np.float32)[words_data]
print('words_list:',words_list)
print('output dimension:',words_label.shape)

In [ ]:
# Hyper-Parameters
max_iter = 500         # upper bound of the training loop counter
learning_rate = 0.01   # passed to the optimizer

# Network Parameters
# Both arrays are 2-D: (number of frames) x (feature / class dimension).
n_input_len, n_input_dim = mfccs.shape
n_output_len, n_output_dim = words_label.shape
n_hidden = 300

# TensorFlow graph
# Both placeholders are laid out as
# (batch_size) x (time_step) x (input or output dimension),
# with the batch size fixed to 1 and the number of time steps left open.
x_data = tf.placeholder(dtype=tf.float32, shape=[1, None, n_input_dim])
y_data = tf.placeholder(dtype=tf.float32, shape=[1, None, n_output_dim])

# Parameters
# Randomly initialised linear read-out from the hidden units to the outputs.
weights = dict(out=tf.Variable(tf.random_normal([n_hidden, n_output_dim])))
biases = dict(out=tf.Variable(tf.random_normal([n_output_dim])))

In [ ]:
def RNN(x, weights, biases):
    '''Run an LSTM over `x` and apply a linear read-out at every time step.

    Args:
        x: input tensor laid out as (batch)x(time)x(input_dim).
        weights: dict whose 'out' entry maps hidden units to outputs,
            shape (n_hidden)x(output_dim).
        biases: dict whose 'out' entry is the output bias, shape (output_dim).

    Returns:
        Tensor of shape (time)x(output_dim): the read-out for the last
        example in the mini-batch.

    **Notes on tf.nn.dynamic_rnn**

    - 'x' can have shape (batch)x(time)x(input_dim), if time_major=False or
                         (time)x(batch)x(input_dim), if time_major=True
    - 'outputs' follows the same layout as 'x', but its last axis is the
      cell's output size (n_hidden), not input_dim:
                         (batch)x(time)x(n_hidden), if time_major=False or
                         (time)x(batch)x(n_hidden), if time_major=True
    - 'states' is the final state, determined by batch and hidden_dim
    '''
    cell = tf.contrib.rnn.BasicLSTMCell(n_hidden, forget_bias=1.0) # Make RNNCell
    # Only the per-step outputs are needed; the final state is discarded.
    outputs, _final_state = tf.nn.dynamic_rnn(cell, x, time_major=False, dtype=tf.float32)

    # With time_major=False the first axis is the batch, so outputs[-1] is the
    # (time)x(n_hidden) output sequence of the last example in the mini-batch.
    last_example = outputs[-1]
    return tf.matmul(last_example, weights['out']) + biases['out']

# Training graph: network output, mean squared error between that output and
# the fed targets, and one Adam update step that minimises the error.
pred = RNN(x_data, weights, biases)
squared_error = tf.squared_difference(pred, y_data)
cost = tf.reduce_mean(squared_error)
adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
optimizer = adam.minimize(cost)

In [ ]:
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    n'step = 1
    while step <= max_iter:
        loss = 0
        for i in range(n_examples):
            key = 's' + str(i + 1)
            x_train = mfccs.reshape((1, mfccs.shape[0], n_input_dim))
            y_train = words_label.reshape((1, words_label.shape[0], n_output_dim))
            c, _ = sess.run([cost, optimizer], feed_dict={x_data: x_train, y_data: y_train})
            loss += c
        mean_mse = loss / n_examples

        print('Epoch =', str(step), '/', str(max_iter),
              'Cost = ', '{:.5f}'.format(mean_mse))
        step += 1
        pred_out = sess.run(pred, feed_dict={x_data: x_train})
        pred_out = np.argmax(pred_out, 1)

        plt.subplot(211)
        plt.plot(words_data)
        plt.yticks([0, 1, 2, 3], words_list)
        plt.subplot(212)
        plt.plot(pred_out)
        plt.yticks([0, 1, 2, 3], words_list)
        plt.show()

In [ ]:
# Test
with tf.Session() as sess:
    sess.run(init)
    pred_out = sess.run(pred, feed_dict={x_data: x_train})
    pred_out = np.argmax(pred_out, 1)
    
    plt.subplot(211)
    plt.plot(words_data)
    plt.yticks([0, 1, 2, 3], words_list)
    plt.subplot(212)
    plt.plot(pred_out)
    plt.yticks([0, 1, 2, 3], words_list)
    plt.show()