In [1]:
import os
import sys
import flowfairy
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf
import IPython.display as display

In [2]:
# Make the parent directory importable — presumably where jobeNet2 and the
# settings module live; verify against the repo layout.
sys.path.append('..')
# Point the flowfairy/glue framework at the project settings module, but only
# if the environment does not already define it.
os.environ.setdefault('GLUE_SETTINGS_MODULE', 'settings')


Out[2]:
'settings'

Load some data


In [3]:
# NOTE(review): hardcoded absolute local path — consider a configurable
# DATA_DIR so the notebook runs on other machines.
folder = '/home/jobe/audio_dataset/preprocessed_22050/dev/'
# NOTE(review): os.listdir order is OS-dependent, so numeric indices used
# later (e.g. get(156), get(50)) are not stable; sorting would pin them.
all_data = os.listdir(folder)

def get(i):
    """Load the i-th .npz record from the dataset folder."""
    record_path = os.path.join(folder, all_data[i])
    return np.load(record_path)

def audio(npz):
    """Read the waveform referenced by an npz record.

    Returns (samples, speaker_class) where samples is the decoded audio
    and speaker_class is taken straight from the record.
    """
    raw_path = str(npz['audio_file'])
    clean_path = raw_path.replace('//', '/')  # collapse doubled separators stored in the record
    samples, _samplerate = sf.read(clean_path)
    return samples, npz['speaker_class']

Set up the network


In [4]:
import jobeNet2

# Batch of 1, variable-length sequence, single channel.
shape = [1, None, 1]

# TF1-style placeholders: clean input, noise input, integer targets,
# per-example speaker class ids, and the requested output length.
X = tf.placeholder(tf.float32, shape=shape)
X_noise = tf.placeholder(tf.float32, shape=shape)
y = tf.placeholder(tf.int64, shape=shape)
cls = tf.placeholder(tf.int32, shape=[None])
outputlen = tf.placeholder(tf.int32)


net = jobeNet2.Net()

#with tf.device('/cpu:0'):
# Build train and validation graphs that share one set of weights:
# scope.reuse_variables() makes the second feedforward() reuse the
# variables created by the first instead of creating new ones.
with tf.variable_scope('network') as scope:
    with tf.name_scope('train'):
        tx, tnet, *_ = net.feedforward(X, X_noise, y, cls, is_training=True, outputlen=outputlen)
    scope.reuse_variables()

    with tf.name_scope('validation'):
        vx, vnet, *_ = net.feedforward(X, X_noise, y, cls, outputlen=outputlen)


x Tensor("network/train/concat_2:0", shape=(1, ?, 2), dtype=float32)
conv1 Tensor("network/train/glu_conv1_1/mul:0", shape=(1, ?, 1, 4), dtype=float32)
conv1_d1  Tensor("network/train/glu_conv1_d1_2/mul:0", shape=(1, ?, 1, 8), dtype=float32)
conv1_d2  Tensor("network/train/glu_conv1_d2_2/mul:0", shape=(1, ?, ?, 8), dtype=float32)
conv1_d4  Tensor("network/train/glu_conv1_d4_2/mul:0", shape=(1, ?, ?, 8), dtype=float32)
conv1_concat Tensor("network/train/concat_4:0", shape=(1, ?, 1, 24), dtype=float32)
conv1:  Tensor("network/train/glu_conv1_3/mul:0", shape=(1, ?, 1, 16), dtype=float32)
conv2:  Tensor("network/train/glu_conv2_2/mul:0", shape=(1, ?, 1, 32), dtype=float32)
Tensor("network/train/embedding/strided_slice:0", shape=(), dtype=int32)
embedded: Tensor("network/train/embedding/concat:0", shape=(1, ?, 1, 36), dtype=float32)
conv3:  Tensor("network/train/glu_conv3_2/mul:0", shape=(1, ?, 1, 64), dtype=float32)
conv4:  Tensor("network/train/glu_conv4_4/mul:0", shape=(1, ?, 1, 128), dtype=float32)
conv5:  Tensor("network/train/conv5/lrelu/add:0", shape=(1, ?, 1, 256), dtype=float32)
out:  Tensor("network/train/Reshape:0", shape=(1, ?, 256), dtype=float32)
Tensor("network/train/uncertainty/sub:0", shape=(1, ?), dtype=float32)
Tensor("network/train/uncertainty/Mean:0", shape=(), dtype=float32)
x Tensor("network/validation/concat_2:0", shape=(1, ?, 2), dtype=float32)
conv1 Tensor("network/validation/glu_conv1_1/mul:0", shape=(1, ?, 1, 4), dtype=float32)
conv1_d1  Tensor("network/validation/glu_conv1_d1_2/mul:0", shape=(1, ?, 1, 8), dtype=float32)
conv1_d2  Tensor("network/validation/glu_conv1_d2_2/mul:0", shape=(1, ?, ?, 8), dtype=float32)
conv1_d4  Tensor("network/validation/glu_conv1_d4_2/mul:0", shape=(1, ?, ?, 8), dtype=float32)
conv1_concat Tensor("network/validation/concat_4:0", shape=(1, ?, 1, 24), dtype=float32)
conv1:  Tensor("network/validation/glu_conv1_3/mul:0", shape=(1, ?, 1, 16), dtype=float32)
conv2:  Tensor("network/validation/glu_conv2_2/mul:0", shape=(1, ?, 1, 32), dtype=float32)
Tensor("network/validation/embedding/strided_slice:0", shape=(), dtype=int32)
embedded: Tensor("network/validation/embedding/concat:0", shape=(1, ?, 1, 36), dtype=float32)
conv3:  Tensor("network/validation/glu_conv3_2/mul:0", shape=(1, ?, 1, 64), dtype=float32)
conv4:  Tensor("network/validation/glu_conv4_4/mul:0", shape=(1, ?, 1, 128), dtype=float32)
conv5:  Tensor("network/validation/conv5/lrelu/add:0", shape=(1, ?, 1, 256), dtype=float32)
out:  Tensor("network/validation/Reshape:0", shape=(1, ?, 256), dtype=float32)
Tensor("network/validation/uncertainty/sub:0", shape=(1, ?), dtype=float32)
Tensor("network/validation/uncertainty/Mean:0", shape=(), dtype=float32)

Start a session


In [5]:
session = tf.InteractiveSession()

Restore the saved network


In [6]:
saver = tf.train.Saver()
# Name of the checkpoint-state file written at train time (the
# `latest_filename` argument of latest_checkpoint / Saver.save).
network_name = 'deepspeech4.checkpoint'

# Restore the newest checkpoint listed in ../logs/deepspeech4.checkpoint.
# NOTE(review): latest_checkpoint returns None if no checkpoint is found,
# which would make restore() fail — consider asserting it is not None.
saver.restore(session, tf.train.latest_checkpoint('../logs', latest_filename=network_name))

Run the network on the data


In [7]:
def norm(tensor):
    """Min-max normalize a tensor into [0, 1].

    The 1e-12 epsilon in the denominator guards against division by zero
    when the tensor is constant.
    """
    lo = tf.reduce_min(tensor)
    hi = tf.reduce_max(tensor)
    return tf.div(tensor - lo, (hi - lo) + 1e-12)


# Per-timestep class prediction over the 256-way output distribution (axis 2).
pred = tf.argmax(tnet, 2)
vpred = tf.argmax(vnet, 2)
# Rescale the integer class ids to [0, 1] so they can be played back as audio.
# NOTE(review): `pred`/`vpred` are reused for a different tensor here — a
# distinct name (e.g. pred_norm) would be clearer on re-runs.
pred = norm(tf.cast(pred, tf.float32))
vpred = norm(tf.cast(vpred, tf.float32))

In [8]:
# Pick a source utterance and a noise utterance from the dev set.
# NOTE(review): os.listdir order is OS-dependent, so these indices are
# not stable across machines.
npz1 = get(156)
npz2 = get(50)
a1, c1 = audio(npz1)  # (samples, speaker_class)
a2, c2 = audio(npz2)

# Trim both signals to the shorter one so they align sample-for-sample.
olen = min(a1.shape[0], a2.shape[0])
print('sec: ', olen / 22050)  # duration at the dataset's 22050 Hz rate
a1 = a1[None, :olen, None]  # -> shape (1, olen, 1) to match the placeholders
a2 = a2[None, :olen, None]

# Shrink the requested output length by a fixed margin
# (22050 // 3400 + 10 == 16 samples — assumed to cover the network's
# receptive-field context; TODO confirm against jobeNet2).
olen -= 22050 // 3400 + 10
# The output length must be a multiple of 4; round down in one step
# instead of decrementing in a loop.
olen -= olen % 4


feed_dict = {X: a1, X_noise: a2, outputlen: olen, cls: [c1]}

# Run the validation branch: vx is the network's assembled input,
# vpred the normalized per-sample prediction.
noisy, prediction = session.run([vx, vpred], feed_dict=feed_dict)


sec:  3.625034013605442

In [9]:
# Sanity-check shapes: prediction[0] is the (T,) prediction track and
# noisy is the (1, T, 2) two-channel network input.
print(prediction[0].shape)
print(noisy.shape)


(79916,)
(1, 79916, 2)

In [10]:
display.display(display.Audio(data=prediction[0], rate=22050))



In [11]:
display.display(display.Audio(data=noisy[:,:,0], rate=22050))



In [ ]: