In [ ]:
import tensorflow as tf
In [ ]:
# The three LibriVox chapter mp3s that will be featurised below
filenames = ['./librivox/guidetomen_%02d_rowland_64kb.mp3' % (chapter,)
             for chapter in (1, 2, 3)]
filenames
For each file in turn, pull out blocks of spectra, each 1024 spectra long (~12.8 sec). The first 64 spectra of each block will be discarded as lead-in, so the 'step increment' between blocks should be 1024-64=960. The (shorter) tail block is ignored.
https://www.tensorflow.org/programmers_guide/datasets
http://warmspringwinds.github.io/tensorflow/tf-slim/2016/12/21/tfrecords-guide/
https://indico.io/blog/tensorflow-data-inputs-part1-placeholders-protobufs-queues/
In [ ]:
import numpy as np
import tensorflow as tf
import librosa
librosa.__version__ # '0.5.1'
In [ ]:
# Audio / STFT configuration
sample_rate = 24000      # input will be standardised to this rate
fft_step = 0.0125        # STFT hop: 12.5ms
fft_window = 0.050       # STFT window: 50ms
n_fft = 2048             # = 512*4 FFT points
hop_length = int(sample_rate * fft_step)
win_length = int(sample_rate * fft_window)
n_mels = 80              # mel filterbank channels
fmin = 125               # Hz (no explicit fmax)
# "Audio tests" suggest a min log of -4.605 (-6 confirmed fine); np.exp(-7.0) also considered.
spectra_abs_min = 0.01   # Amplitude floor before taking logs - from Google paper, seems justified
win_length, hop_length
In [ ]:
# Training windowing: each example is steps_total spectra frames long,
# of which the first steps_leadin are warm-up for the network and discarded.
steps_total = 1024
steps_leadin = 64
In [ ]:
# Scratch cell: confirm that flatten().tolist() yields a flat row-major list,
# which is the form needed for the FloatList serialisation below.
a = np.array([[3.4, 55.4], [34.23, 342.1221]])
a.flatten().tolist()
In [ ]:
# Based on http://warmspringwinds.github.io/tensorflow/tf-slim/2016/12/21/tfrecords-guide/
def _floats_feature(np_arr):
    """Serialise *np_arr* (flattened row-major) as a tf.train FloatList feature."""
    flat_values = np_arr.ravel().tolist()
    return tf.train.Feature(float_list=tf.train.FloatList(value=flat_values))
def convert_wavs_to_spectra_learnable_records(filename_in):
    """Convert one mp3 into three TFRecord files (train/valid/test) of windowed spectra.

    Each Example holds (transposed to (T, channels), as TensorFlow expects):
      - 'mel_log'        : log-floored mel spectrogram, (steps_total, n_mels)
      - 'spectra_phase0' : phases at the last lead-in step, (1, n_fft//2+1)
      - 'spectra_target' : log-amplitudes stacked over phase differences for the
                           post-lead-in steps, (steps_total-steps_leadin, 2*(n_fft//2+1))
    """
    print("convert_wavs_to_spectra_learnable_records(%s)" % (filename_in,))
    filename_base = filename_in.replace('.mp3', '_%s.tfrecords')
    samples, _sample_rate = librosa.core.load(filename_in, sr=sample_rate)
    # BUGFIX: normalise by the peak *magnitude* - np.max(samples) alone ignores a
    # larger negative excursion and would leave the waveform outside -1.0 ... +1.0
    samples = samples / np.max(np.abs(samples))
    spectra_complex = librosa.stft(samples, n_fft=n_fft,
                                   hop_length=hop_length,
                                   win_length=win_length, window='hann')
    power_spectra = np.abs(spectra_complex) ** 2
    melspectra = librosa.feature.melspectrogram(S=power_spectra, n_mels=n_mels, fmin=fmin)
    with tf.python_io.TFRecordWriter(filename_base % ('train',)) as writer_train, \
         tf.python_io.TFRecordWriter(filename_base % ('valid',)) as writer_valid, \
         tf.python_io.TFRecordWriter(filename_base % ('test',)) as writer_test:
        # Slide a steps_total window along, advancing by steps_total-steps_leadin so
        # that each block's discarded lead-in is covered by the previous block.
        for offset in range(0, melspectra.shape[1] - steps_total, steps_total - steps_leadin):
            mel_offset = melspectra[:, offset:offset + steps_total]
            spectra_offset = spectra_complex[:, offset:offset + steps_total]
            # Precalculate the log-floored amplitudes here, instead of relying on TF later
            mel_log = np.log(np.maximum(spectra_abs_min, mel_offset))
            spectra_l_amp = np.log(np.maximum(spectra_abs_min, np.abs(spectra_offset)))
            # np.angle actually returns (-pi .. +pi] despite its documentation -
            # see https://github.com/numpy/numpy/issues/7426
            spectra_angles = np.angle(spectra_offset)
            # Phase is meaningless where the amplitude is negligible - zero it there
            spectra_angles = np.where(spectra_l_amp < -4.0, 0.0, spectra_angles)
            spectra_phase0 = spectra_angles[:, steps_leadin - 1:steps_leadin]  # Pre-initial angles
            spectra_pshift = spectra_angles[:, 1:] - spectra_angles[:, :-1]    # Differences
            # Wrap the phase differences back into (-pi ... pi)
            x = spectra_pshift
            x = np.where(x <= np.pi, x, x - 2 * np.pi)
            x = np.where(x > -np.pi, x, x + 2 * np.pi)
            spectra_pshift = x
            spectra_target = np.concatenate((
                spectra_l_amp[:, steps_leadin:],
                spectra_pshift[:, (steps_leadin - 1):],  # Shifted along by one due to differencing
            ), axis=0)  # (n_fft//2+1) + (n_fft//2+1) rows
            example = tf.train.Example(features=tf.train.Features(feature={
                # NB: Tensorflow wants the transposed versions to match (batch, T, channels)
                'mel_log': _floats_feature(mel_log.T),
                'spectra_phase0': _floats_feature(spectra_phase0.T),
                'spectra_target': _floats_feature(spectra_target.T),
            }))
            # Allocate examples between the train/validation/test files.
            # BUGFIX: the second test was previously unconditional, which sent ~50%
            # of ALL examples to 'test'.  Nesting it inside the hold-out branch
            # gives the intended ~80% train / ~10% valid / ~10% test split.
            w = writer_train
            if np.random.random() > 0.8:      # ~20% held out ...
                w = writer_valid
                if np.random.random() > 0.5:  # ... split half/half between valid and test
                    w = writer_test
            w.write(example.SerializeToString())
In [ ]:
# Featurise every mp3 - writes the train/valid/test tfrecords next to each file
for mp3_path in filenames:
    convert_wavs_to_spectra_learnable_records(mp3_path)
print("DONE!")
In [ ]:
# Derived sizes: mel input channels, and one-sided FFT bin count (i.e. 80 and 1025)
mel_bins, spectra_bins = n_mels, n_fft // 2 + 1
# Training hyperparameters
batch_size, num_epochs = 8, 10
In [ ]:
def mel_to_complex_dataset_from_mp3(filenames, stub='train'):
    """Build a tf.data.Dataset of (features, target) pairs from the TFRecord files
    written alongside *filenames* (stub selects the 'train'/'valid'/'test' split).

    features : dict with
        'MelInput' : (steps_total, mel_bins) log-mel spectrogram
        'Phase0'   : (1, spectra_bins) phases at the last lead-in step
    target   : (steps_total-steps_leadin, spectra_bins*2)
               log-amplitudes concatenated with phase differences
    """
    dataset = tf.data.TFRecordDataset([f.replace('.mp3', '_%s.tfrecords') % stub
                                       for f in filenames])
    spectra_len = (steps_total - steps_leadin)
    features = {
        "mel_log": tf.FixedLenFeature([steps_total * mel_bins], tf.float32),
        "spectra_phase0": tf.FixedLenFeature([1 * spectra_bins], tf.float32),
        "spectra_target": tf.FixedLenFeature([spectra_len * spectra_bins * 2], tf.float32),
    }
    # NOTE: a stale _parse_function_OLD (which read 'mel'/'spectra_real'/'spectra_imag' -
    # keys not present in `features` above) has been removed: it was never called,
    # and would have raised if it had been.
    def _parse_function(example_proto):
        # Deserialise one Example back into the shapes written by the converter
        parsed_features = tf.parse_single_example(example_proto, features)
        mel_log = tf.reshape(parsed_features["mel_log"], (steps_total, mel_bins))
        spectra_phase0 = tf.reshape(parsed_features["spectra_phase0"], (1, spectra_bins))
        spectra_target = tf.reshape(parsed_features["spectra_target"], (spectra_len, spectra_bins * 2))
        return dict(MelInput=mel_log, Phase0=spectra_phase0), spectra_target
    dataset = dataset.map(_parse_function)
    return dataset
def input_fn_from(filenames, stub='train', batch_size=1, shuffle=False, repeats=1):
    """Estimator input_fn: batched, optionally shuffled (features, labels) tensors."""
    ds = mel_to_complex_dataset_from_mp3(filenames, stub=stub)
    if shuffle:
        ds = ds.shuffle(buffer_size=100)
    ds = ds.batch(batch_size)
    ds = ds.repeat(repeats)
    one_shot = ds.make_one_shot_iterator()
    # get_next() yields the (features-dict, labels) pair produced by the parser
    return one_shot.get_next()
Keras-WaveNet :
TF-WaveNet :
In [ ]:
# See : https://github.com/tensorflow/tensorflow/issues/14933
# to understand how broken Google is
from tensorflow.python import keras
from tensorflow.python.keras import backend as K
#tf.reset_default_graph()
# Use 'real keras' to get the actual documented functionality for padding='causal'
#import keras
#from keras import backend as K
def wavenet_layer(channels, hidden_channels, kernel_size, dilation_rate, name):
    """Return a closure that builds one gated, dilated WaveNet-style layer.

    The closure maps a (batch, T, channels) tensor to the pair
    (residual_output, skip_output), both (batch, T, channels).
    """
    def build(layer_in):
        # Gated activation unit: tanh(filter) * sigmoid(gate)
        filt = keras.layers.Conv1D(hidden_channels, kernel_size,
                                   strides=1, dilation_rate=dilation_rate,
                                   padding='valid', use_bias=True,
                                   activation='tanh', name='filter_'+name)(layer_in)
        gate = keras.layers.Conv1D(hidden_channels, kernel_size,
                                   strides=1, dilation_rate=dilation_rate,
                                   padding='valid', use_bias=True,
                                   activation='sigmoid', name='gate_'+name)(layer_in)
        gated = keras.layers.Multiply(name='mult_'+name)([filt, gate])
        # The 'valid' convolutions shrank the time axis by dilation_rate*(kernel_size-1);
        # left-pad back to the input length, which also keeps the layer causal.
        # https://www.tensorflow.org/api_docs/python/tf/keras/layers/ZeroPadding1D
        gated_padded = keras.layers.ZeroPadding1D((dilation_rate*(kernel_size-1), 0))(gated)
        # 1x1 convolutions produce the residual contribution and the skip output
        residual = keras.layers.Conv1D(channels, 1,
                                       padding='same', use_bias=True,
                                       activation='linear', name='trans_'+name)(gated_padded)
        skip = keras.layers.Conv1D(channels, 1,
                                   padding='same', use_bias=True,
                                   activation='relu', name='skip_'+name)(gated_padded)
        return keras.layers.Add(name='resid_'+name)([residual, layer_in]), skip
    return build
#log_amplitude_with_minimum = keras.layers.Lambda( lambda x: K.log( K.maximum(0.00001, x) ))
# Channel widths for the WaveNet stack: layer-interface size and gated hidden size
io_channels, hidden_channels = 128,128
def model_mel_to_spec( input_shape=(steps_total, mel_bins) ):
    """Keras model mapping log-mel spectra to concat(log-amplitudes, phase shifts).

    Inputs:
      'MelInput' : (steps_total, mel_bins) log-mel spectrogram
      'Phase0'   : (1, spectra_bins) initial phases - accepted so that the
                   input_fn feature dict matches, but not used by the network
    Output 'spec_concat' : (steps_total-steps_leadin, spectra_bins*2)
    """
    mel_log = keras.layers.Input(shape=input_shape, name='MelInput')
    # BUGFIX: Phase0 was previously declared with the mel input_shape
    # (steps_total, mel_bins), but the dataset supplies it as (1, spectra_bins)
    phase0 = keras.layers.Input(shape=(1, spectra_bins), name='Phase0')  # Unused
    x = keras.layers.BatchNormalization()(mel_log)
    # 'Resize' to make everything 'io_channels' big at the layer interfaces
    x = s0 = keras.layers.Conv1D(io_channels, 1,
                                 padding='same', use_bias=True,
                                 activation='linear', name='mel_log_expanded')(x)
    x, s1 = wavenet_layer(io_channels, hidden_channels*1, 3, 1, '1')(x)
    x, s2 = wavenet_layer(io_channels, hidden_channels*1, 3, 2, '2')(x)
    x, s3 = wavenet_layer(io_channels, hidden_channels*1, 3, 4, '3')(x)
    x, s4 = wavenet_layer(io_channels, hidden_channels*1, 3, 8, '4')(x)
    _, s5 = wavenet_layer(io_channels, hidden_channels*1, 3, 16, '5')(x)  # Total footprint is ~64 0.75secs
    # x is now irrelevant
    # NB: only the first two skip connections are wired to the output for now
    #skip_overall = keras.layers.Concatenate( axis=-1 )( [s0,s1,s2,s3,s4,s5] )
    skip_overall = keras.layers.Concatenate( axis=-1 )( [s0,s1] )
    log_amp = keras.layers.Conv1D(spectra_bins, 1, padding='same',
                                  activation='linear', name='log_amp')(skip_overall)
    phase_shift = keras.layers.Conv1D(spectra_bins, 1, padding='same',
                                      activation='linear', name='phase_shift')(skip_overall)
    # Drop the lead-in steps, which the training target also omits
    log_amp_valid = keras.layers.Cropping1D( (steps_leadin,0), name='crop_a' )( log_amp )
    phase_shift_valid = keras.layers.Cropping1D( (steps_leadin,0), name='crop_p' )( phase_shift )
    # Concat the amps and phases into one return value
    spec_concat = keras.layers.Concatenate( axis=-1, name='spec_concat')(
        [log_amp_valid, phase_shift_valid] )
    return keras.models.Model(inputs=[mel_log, phase0], outputs=spec_concat)
# Build the Keras model and print its layer/parameter summary
keras_model = model_mel_to_spec()
keras_model.summary()
In [ ]:
def customLoss(spec_gold, spec_out):
    """Loss = MSE(log-amplitudes) + 1.0 * MSE(phase differences).

    Both tensors are (batch, T, spectra_bins*2): log-amps then phase diffs.
    """
    # Split each tensor back into its amplitude and phase halves
    gold_l_amp = spec_gold[:, :, :spectra_bins]
    gold_phase = spec_gold[:, :, spectra_bins:]
    pred_l_amp = spec_out[:, :, :spectra_bins]
    pred_phase = spec_out[:, :, spectra_bins:]
    amp_loss = keras.losses.mean_squared_error(gold_l_amp, pred_l_amp)
    # Plain MSE on the phase diffs (earlier wrap-aware and cosine variants were
    # tried and abandoned)
    phase_loss = keras.losses.mean_squared_error(gold_phase, pred_phase)
    return amp_loss + 1.0 * phase_loss
# Compile with the combined amplitude+phase loss; also track plain MSE as a metric
keras_model.compile(loss=customLoss,
                    #optimizer=keras.optimizers.RMSprop(), # lr=2e-5
                    optimizer=keras.optimizers.Adam(), # lr=2e-5
                    metrics=['mse'])
In [ ]:
import os
# Checkpoints and summaries are kept under ./models/
model_dir = os.path.join(os.getcwd(), 'models', 'mel-to-complex-spectra_14-laptop')
os.makedirs(model_dir, exist_ok=True)
print("model_dir: ", model_dir)
# Wrap the compiled Keras model as a TF Estimator so tf.data input_fns can drive it
estimator = tf.keras.estimator.model_to_estimator(keras_model=keras_model, model_dir=model_dir)
In [ ]:
# Just check that the input name for our model matches what we have in the DataSet reader
# (should be 'MelInput' - the dict key produced by _parse_function)
input_name = keras_model.input_names[0]
input_name
In [ ]:
# Train on the 'train' split for num_epochs passes, evaluating on the 'valid' split
train_spec = tf.estimator.TrainSpec(
    #input_fn=lambda: imgs_input_fn(path_tfrecords_train, perform_shuffle=True,
    # repeat_count=5, batch_size=20),
    input_fn=lambda: input_fn_from(filenames, stub='train', shuffle=True,
                                   repeats=num_epochs, batch_size=batch_size),
)
#max_steps=500)
eval_spec = tf.estimator.EvalSpec(
    #input_fn=lambda: imgs_input_fn(path_tfrecords_test, perform_shuffle=False, batch_size=1)
    input_fn=lambda: input_fn_from(filenames, stub='valid', shuffle=False,
                                   repeats=1, batch_size=1),
)
# Time the whole train+evaluate run
import time
start_time = time.time()
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
print("--- %s seconds ---" % (time.time() - start_time))
In [ ]:
# Run the trained estimator over the 'test' split and materialise every prediction
predict_results = estimator.predict(
    input_fn=lambda: input_fn_from(filenames, stub='test', shuffle=False,
                                   repeats=1, batch_size=1),
)
predictions = list(predict_results)
len(predictions), predictions[0].keys()
In [ ]:
import matplotlib.pyplot as plt
from IPython.display import Audio as audio_playback_widget
import soundfile # For audio_widget
In [ ]:
def show_single_prediction(pred, phase0=None, stub='orig'):
    """Plot a predicted spectrogram, resynthesise audio, return a playback widget.

    pred   : (T, spectra_bins*2) numpy array - log-amplitudes ++ phase differences
    phase0 : (1, spectra_bins) initial phases, or None for all-zero
    stub   : label used in the temporary .wav filename
    """
    spec_l_amp = pred[:, :spectra_bins]
    spec_phase_diff = pred[:, spectra_bins:]
    if phase0 is None:
        phase0 = np.zeros( (1, spectra_bins) )
    # Integrate the per-step phase differences into absolute phases
    spec_phase = np.cumsum( spec_phase_diff, axis=0 )
    # Rebuild the complex spectrum; transpose to (spectra_bins, T) for librosa
    spec_complex = np.exp(spec_l_amp + 1j*(phase0+spec_phase)).T
    spectrum = np.log( np.absolute( spec_complex ) )
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 4))
    cax = ax.matshow(spectrum, interpolation='nearest', aspect='auto',
                     cmap=plt.cm.afmhot, origin='lower')
    fig.colorbar(cax)
    plt.title("Predicted Spectrogram")
    plt.show()
    samples = librosa.istft(spec_complex, hop_length=hop_length, win_length=win_length)
    f = './tmp/single_%s.wav' % (stub,)
    # BUGFIX: make sure the output directory exists before writing (soundfile
    # does not create it), and normalise by the peak *magnitude* - np.max alone
    # ignores a larger negative excursion
    os.makedirs('./tmp', exist_ok=True)
    soundfile.write(f, samples/np.max(np.abs(samples)), samplerate=sample_rate)
    return audio_playback_widget(f)
In [ ]:
# Pull one raw example straight from the 'test' TFRecords, bypassing the estimator
ds_test_direct = mel_to_complex_dataset_from_mp3(filenames, stub='test')
ds_test_iterator = ds_test_direct.make_one_shot_iterator()
ds_test_iterator_next = ds_test_iterator.get_next()
with tf.Session() as sess:
    first_batch = sess.run(ds_test_iterator_next)
# first_batch is (features-dict, target); split the target back into its halves
spec_orig = first_batch[1]  # This is the concat( l_amp and phase )
spec_l_amp = spec_orig[:, :spectra_bins]
spec_phase_diff = spec_orig[:, spectra_bins:]
#np.mean(spec_l_amp[:, 100]), np.std(spec_l_amp[:, 100]), np.min(spec_l_amp[:, 100]),
In [ ]:
# Ground truth: resynthesise audio from the original amplitudes + phase diffs
spec_concat_true = first_batch[1].copy()
show_single_prediction( spec_concat_true, first_batch[0]['Phase0'] )
#show_single_prediction( spec_concat_true, None )
In [ ]:
# Network output: resynthesise audio from the predicted amplitudes + phase diffs
spec_concat_pred = predictions[0]['spec_concat'].copy()
#show_single_prediction( spec_concat_pred, None, 'net' )
# Extreme demo : zero out the phases (to understand their importance)
#spec_concat_pred[:, spectra_bins:] = 0.
# Add in original phases (but keep predicted spectra amplitudes)
#spec_concat_pred[:, spectra_bins:] = spec_concat_true[:, spectra_bins:]
show_single_prediction( spec_concat_pred, first_batch[0]['Phase0'], 'net' )
#show_single_prediction( spec_concat_pred, None, 'net' )
In [ ]:
# Eyeball a few true vs predicted phase-difference values side by side
np.set_printoptions(precision=4,suppress=True)
(spec_concat_true[50:60, spectra_bins+40:spectra_bins+45],
 spec_concat_pred[50:60, spectra_bins+40:spectra_bins+45], )
In [ ]: