In [ ]:


In [1]:
#  Compatibility imports
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import time

import tensorflow as tf
import scipy.io.wavfile as wav
import numpy as np

from six.moves import xrange as range

try:
    from tensorflow.python.ops import ctc_ops
except ImportError:
    from tensorflow.contrib.ctc import ctc_ops

try:
    from python_speech_features import mfcc
except ImportError:
    print("Failed to import python_speech_features.\n Try pip install python_speech_features.")
    raise ImportError
import char_map
import csv

In [2]:
def sparse_tuple_from(sequences, dtype=np.int32):
    """Create a sparse representention of x.
    Args:
        sequences: a list of lists of type dtype where each element is a sequence
    Returns:
        A tuple with (indices, values, shape)
    """
    indices = []
    values = []

    for n, seq in enumerate(sequences):
        indices.extend(zip([n]*len(seq), range(len(seq))))
        values.extend(seq)

    indices = np.asarray(indices, dtype=np.int64)
    values = np.asarray(values, dtype=dtype)
    shape = np.asarray([len(sequences), np.asarray(indices).max(0)[1]+1], dtype=np.int64)
    #print(indices)
    return indices, values, shape

def pad_sequences(sequences, maxlen=None, dtype=np.float32,
                  padding='post', truncating='post', value=0.):
    '''Pads each sequence to the same length: the length of the longest
    sequence.
        If maxlen is provided, any sequence longer than maxlen is truncated to
        maxlen. Truncation happens off either the beginning or the end
        (default) of the sequence. Supports post-padding (default) and
        pre-padding.
        Args:
            sequences: list of lists where each element is a sequence
            maxlen: int, maximum length
            dtype: type to cast the resulting sequence.
            padding: 'pre' or 'post', pad either before or after each sequence.
            truncating: 'pre' or 'post', remove values from sequences larger
            than maxlen either in the beginning or in the end of the sequence
            value: float, value to pad the sequences to the desired value.
        Returns
            x: numpy array with dimensions (number_of_sequences, maxlen)
            lengths: numpy array with the original sequence lengths
    '''
    lengths = np.asarray([len(s) for s in sequences], dtype=np.int64)

    nb_samples = len(sequences)
    if maxlen is None:
        maxlen = np.max(lengths)

    # take the sample shape from the first non empty sequence
    # checking for consistency in the main loop below.
    sample_shape = tuple()
    for s in sequences:
        if len(s) > 0:
            sample_shape = np.asarray(s).shape[1:]
            break

    x = (np.ones((nb_samples, maxlen) + sample_shape) * value).astype(dtype)
    for idx, s in enumerate(sequences):
        if len(s) == 0:
            continue  # empty list was found
        if truncating == 'pre':
            trunc = s[-maxlen:]
        elif truncating == 'post':
            trunc = s[:maxlen]
        else:
            raise ValueError('Truncating type "%s" not understood' % truncating)

        # check `trunc` has expected shape
        trunc = np.asarray(trunc, dtype=dtype)
        if trunc.shape[1:] != sample_shape:
            raise ValueError('Shape of sample %s of sequence at position %s is different from expected shape %s' %
                             (trunc.shape[1:], idx, sample_shape))

        if padding == 'post':
            x[idx, :len(trunc)] = trunc
        elif padding == 'pre':
            x[idx, -len(trunc):] = trunc
        else:
            raise ValueError('Padding type "%s" not understood' % padding)
    return x, lengths

def read_data_fake(num_examples, num_features, num_labels, min_size = 10, max_size=100):

    # Generating different timesteps for each fake data
    timesteps = np.random.randint(min_size, max_size, (num_examples,))
   
    
    # Generating random input
    inputs = np.asarray([np.random.randn(t, num_features).astype(np.float32) for t in timesteps])
#     print(inputs[0])
    print(np.shape(inputs[1]))
    print(np.shape(inputs[2]))
    

    # Generating random label, the size must be less or equal than timestep in order to achieve the end of the lattice in max timestep
    labels = np.asarray([np.random.randint(2, num_labels, np.random.randint(1, inputs[i].shape[0], (1,))).astype(np.int64) for i, _ in enumerate(timesteps)])
#     print(labels)
#     print(np.shape(labels[1]))
#     print(np.shape(labels[2]))
    
    files = np.asarray([])
    
    return files, inputs, labels



def extract_inputs(files):
    batch_inputs = []
    for i, file_name in enumerate(files):
        fs, audio = wav.read(file_name)
        mfcc_values = mfcc(audio, samplerate=fs)
        batch_inputs.append(np.asarray(mfcc_values))
        
#         train_inputs = np.asarray(mfcc_values[np.newaxis, :])
#         train_inputs = (train_inputs - np.mean(train_inputs))/np.std(train_inputs)
#         batch_inputs.append(train_inputs[0])

        #print(file_name)

    return batch_inputs

In [3]:
def read_data(file_name, num_examples, num_features, num_labels, min_size = 10, max_size=100):
    labels_arr = []
    inputs_arr=[]
    files_arr=[]
    with open(file_name) as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',')
        sortedCSV =sorted(readCSV, key=lambda row: float(row[0]), reverse=False)
        index = 0
        for row in sortedCSV:
            label_list=list(row[1])
            #print (row[1])
            files_arr.append(row[2])
            label_transformed=[char_map.char_to_int(v) for v in label_list]
            label_transformed = np.asarray(label_transformed)
            #print (label_transformed)
            labels_arr.append(label_transformed)
            #print("[read_data] shape: {}".format(np.shape(label_transformed)))
            index+=1
            if(num_examples!=None and index>=num_examples):
                break;
        labels=np.asarray(labels_arr)
        inputs=np.asarray(inputs_arr)
        files=np.asarray(files_arr)
    return files, inputs, labels

# Constants

# Some configs
num_features = 13
# Accounting the 0th indice +  space + blank label = 28 characters
num_classes = len(char_map.char_map_lt)+2#ord('z') - ord('a') + 1 + 1 + 1


# Hyper-parameters
num_epochs = 4
num_hidden = 50
num_layers = 1
batch_size = 16
initial_learning_rate = 1e-2
momentum = 0.9
train_desc_file='/data/liepa_train.csv'
#let's use training to make it works at all 
val_desc_file='/data/liepa_train.csv'#'/data/liepa_validation.csv'

train_files, train_inputs, train_targets = read_data(train_desc_file, 1560, num_features, num_classes - 1)
validate_files, validate_inputs, validate_targets = read_data(val_desc_file, 50, num_features, num_classes - 1)
#pick last foour. Other throw away... not efficient
validate_targets=validate_targets[-4:]
validate_files=validate_files[-4:]
validate_inputs=validate_inputs[-4:]

num_examples = len(train_files)#16
print("num_examples: {}".format(num_examples))
num_batches_per_epoch = int(num_examples/batch_size)


num_examples: 1560

In [4]:
validate_targets


Out[4]:
array([array([ 1, 16, 18, 29, 22, 19,  1, 13,  3, 25, 26,  4,  1]),
       array([ 1, 26,  4, 26, 15, 29, 32, 15, 22, 15, 26,  1]),
       array([ 1, 22, 29, 23, 28, 25,  3, 29, 19, 23, 26,  1]),
       array([ 1, 28, 31, 19, 26, 28,  3, 22, 28, 15, 26,  1])], dtype=object)

In [5]:
graph = tf.Graph()
with graph.as_default():
    # e.g: log filter bank or MFCC features
    # Has size [batch_size, max_stepsize, num_features], but the
    # batch_size and max_stepsize can vary along each step
    inputs = tf.placeholder(tf.float32, [None, None, num_features],name="inputs")

    # Here we use sparse_placeholder that will generate a
    # SparseTensor required by ctc_loss op.
    targets = tf.sparse_placeholder(tf.int32,name="targets")

    # 1d array of size [batch_size]
    seq_len = tf.placeholder(tf.int32, [None],name="seq_len")

    # Defining the cell
    # Can be:
    #   tf.nn.rnn_cell.RNNCell
    #   tf.nn.rnn_cell.GRUCell
    cell = tf.nn.rnn_cell.LSTMCell(num_hidden, state_is_tuple=True)

    # Stacking rnn cells
    stack = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers,
                                        state_is_tuple=True)

    # The second output is the last state and we will no use that
    outputs, _ = tf.nn.dynamic_rnn(stack, inputs, seq_len, dtype=tf.float32)

    shape = tf.shape(inputs)
    batch_s, max_timesteps = shape[0], shape[1]

    # Reshaping to apply the same weights over the timesteps
    outputs = tf.reshape(outputs, [-1, num_hidden])

    # Truncated normal with mean 0 and stdev=0.1
    # Tip: Try another initialization
    # see https://www.tensorflow.org/versions/r0.9/api_docs/python/contrib.layers.html#initializers
    W = tf.Variable(tf.truncated_normal([num_hidden,
                                         num_classes],
                                        stddev=0.1))
    # Zero initialization
    # Tip: Is tf.zeros_initializer the same?
    b = tf.Variable(tf.constant(0., shape=[num_classes]))

    # Doing the affine projection
    logits = tf.matmul(outputs, W) + b

    # Reshaping back to the original shape
    logits = tf.reshape(logits, [batch_s, -1, num_classes])

    # Time major
    logits = tf.transpose(logits, (1, 0, 2))

    loss = ctc_ops.ctc_loss(logits, targets, seq_len)
    cost = tf.reduce_mean(loss)

    optimizer = tf.train.MomentumOptimizer(initial_learning_rate,
                                           0.9).minimize(cost)

    # Option 2: tf.contrib.ctc.ctc_beam_search_decoder
    # (it's slower but you'll get better results)
    decoded, log_prob = ctc_ops.ctc_greedy_decoder(logits, seq_len)

    # Inaccuracy: label error rate
    ler = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32),
                                          targets))

In [6]:
with tf.Session(graph=graph) as session:
    # Initializate the weights and biases
    print("[global_variables_initializer]+")
    tf.global_variables_initializer().run()
    print("[global_variables_initializer]-")
    for curr_epoch in range(num_epochs):
        train_cost = train_ler = 0
        start = time.time()

        for batch in range(num_batches_per_epoch):
            # Getting the index
            indexes = [i % num_examples for i in range(batch * batch_size, (batch + 1) * batch_size)]
            
            batch_train_inputs = extract_inputs(train_files[indexes])

            # Padding input to max_time_step of this batch
            batch_train_inputs, batch_train_seq_len = pad_sequences(batch_train_inputs)
            if(indexes[0]%100==0):
                print(batch_train_seq_len)

            # Converting to sparse representation so as to to feed SparseTensor input
            batch_train_targets = sparse_tuple_from(train_targets[indexes])

            feed = {inputs: batch_train_inputs,
                    targets: batch_train_targets,
                    seq_len: batch_train_seq_len}

            batch_cost, _ = session.run([cost, optimizer], feed)
            train_cost += batch_cost*batch_size
            train_ler += session.run(ler, feed_dict=feed)*batch_size


        # Shuffle the data
        shuffled_indexes = np.random.permutation(num_examples)
        #train_inputs = train_inputs[shuffled_indexes]
        train_files = train_files[shuffled_indexes]
        train_targets = train_targets[shuffled_indexes]

        # Metrics mean
        train_cost /= num_examples
        train_ler /= num_examples

        log = "Epoch {}/{}, train_cost = {:.3f}, train_ler = {:.3f}, time = {:.3f}"
        print(log.format(curr_epoch+1, num_epochs, train_cost, train_ler, time.time() - start))
        
        
        
        
        
        
        
        
        
        
        
        
        
        # Decoding all at once. Note that this isn't the best way
    ##### 
    for i, iValidate_file in enumerate(validate_files):

        batch_validate_inputs = extract_inputs([iValidate_file])


        # Padding input to max_time_step of this batch
        batch_validate_inputs, batch_validate_seq_len = pad_sequences(batch_validate_inputs)

        # Converting to sparse representation so as to to feed SparseTensor input
        batch_validate_targets = sparse_tuple_from([validate_targets[i]])

        feed = {inputs: batch_validate_inputs,
                targets: batch_validate_targets,
                seq_len: batch_validate_seq_len
                }

        # Decoding
        d = session.run(decoded[0], feed_dict=feed)
        dense_decoded = tf.sparse_tensor_to_dense(d, default_value=-1).eval(session=session)

        for i, seq in enumerate(dense_decoded):

            seq = [s for s in seq if s != -1]

            print('Sequence %d' % i)
            org_val = "".join([char_map.int_to_char(v) for v in train_targets[i]])
            print('\t Original:\n[{}]'.format(org_val) )
            #decoded_val = "".join([char_map.int_to_char(v) for v in seq])
            print('\t Decoded:\n[{}]'.format(seq))


[global_variables_initializer]+
[global_variables_initializer]-
[125 140 140 140 140 140 140 140 140 140 140 140 140 140 140 140]
[148 148 148 148 148 148 148 148 148 148 148 148 148 148 148 148]
[154 154 154 154 154 154 154 154 154 154 154 154 154 154 154 154]
[159 159 159 160 160 160 160 160 160 160 160 160 160 160 160 160]
Epoch 1/4, train_cost = 48.912, train_ler = 0.808, time = 23.910
[163 154 154 156 157 155 146 162 149 163 155 162 152 142 159 153]
[144 150 157 148 161 163 152 151 153 161 164 141 153 163 146 147]
[155 148 144 142 162 147 143 145 155 154 150 154 157 161 147 154]
[163 146 150 143 154 152 150 153 156 162 157 141 163 160 145 149]
Epoch 2/4, train_cost = 36.908, train_ler = 0.822, time = 26.703
[158 161 149 153 163 145 145 155 158 160 151 146 159 145 151 145]
[158 145 154 152 149 154 159 159 151 157 143 155 157 151 153 147]
[146 157 163 147 147 160 158 146 159 144 163 150 155 161 160 163]
[147 160 144 146 140 148 155 150 158 154 154 156 152 155 149 156]
Epoch 3/4, train_cost = 34.842, train_ler = 0.830, time = 28.345
[154 150 161 154 149 142 163 157 159 160 152 164 152 158 147 163]
[162 156 161 159 155 151 163 145 153 155 152 150 161 157 143 148]
[140 144 159 145 155 162 157 150 155 163 151 156 144 150 154 163]
[144 145 147 149 155 152 163 154 150 152 151 163 155 143 158 141]
Epoch 4/4, train_cost = 33.110, train_ler = 0.835, time = 28.823
Sequence 0
	 Original:
[<SPACE>naujas<SPACE>dokumentas<SPACE>]
	 Decoded:
[[1, 19, 1]]
Sequence 0
	 Original:
[<SPACE>naujas<SPACE>dokumentas<SPACE>]
	 Decoded:
[[1, 1]]
Sequence 0
	 Original:
[<SPACE>naujas<SPACE>dokumentas<SPACE>]
	 Decoded:
[[1, 1]]
Sequence 0
	 Original:
[<SPACE>naujas<SPACE>dokumentas<SPACE>]
	 Decoded:
[[1, 1]]

In [ ]: