In [ ]:
import os
import numpy as np
import matplotlib.pyplot as plt
import scipy.ndimage # for spectrogram resizing (zoom)
#import scipy.io.wavfile
# pip install soundfile librosa python_speech_features
import soundfile
from IPython.display import Audio as audio_playback_widget
In [ ]:
#f = './data/raw-from-phone.wav' # un-normalized
f = './data/num_phone_en-UK_m_Martin14.wav' # has been normed
In [ ]:
# Read in the original file
samples, sample_rate = soundfile.read(f)
samples = samples / np.max(np.abs(samples)) # Normalise to peak amplitude 1.0
def show_waveform(sound):
    n_samples = sound.shape[0]
    plt.figure(figsize=(12,2))
    plt.plot(np.arange(0.0, n_samples)/sample_rate, sound)
    plt.xticks( np.arange(0.0, n_samples/sample_rate, 0.5), rotation=90 )
    plt.grid(True)
    plt.show()

show_waveform(samples)
audio_playback_widget(f)
In [ ]:
n_fft = 512
fft_step = 0.010 # 10ms
fft_window = 0.025 # 25ms
In [ ]:
import librosa
#hop_length=int(fft_step*sample_rate)   # number of audio samples between STFT columns
#win_length=int(fft_window*sample_rate) # length of the window applied to each STFT frame
win_length=None # defaults to n_fft
hop_length=None # defaults to win_length/4
spectrum_complex = librosa.stft(samples, n_fft=n_fft,
                                hop_length=hop_length, win_length=win_length,
                                window='hann', center=True,
                                dtype=np.complex64)  # This has real and imaginary parts, each as float32
spectrum_complex.shape
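As a quick check (added here), the shape can be predicted from the librosa defaults: with hop_length=None the hop is n_fft//4, and with center=True the STFT yields 1 + n_samples//hop columns over n_fft//2 + 1 frequency bins.
In [ ]:
# Added check : predicted STFT shape from the defaults above (hop = n_fft//4)
print( (n_fft//2 + 1, 1 + samples.shape[0]//(n_fft//4)) )  # should match spectrum_complex.shape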
In [ ]:
samples_defft = librosa.istft(spectrum_complex,
                              hop_length=hop_length, win_length=win_length)
In [ ]:
def quick_view_and_play(stub, samples_here, show_waveform_graph=True):
    os.makedirs('./tmp', exist_ok=True)  # make sure the output directory exists
    f = './tmp/%s.wav' % (stub,)
    soundfile.write(f, samples_here/np.max(np.abs(samples_here)), samplerate=sample_rate)
    if show_waveform_graph:
        show_waveform(samples_here)
    return audio_playback_widget(f)
In [ ]:
# This proves that audio -> stft(complex) -> istft(complex) -> audio is pretty much identical
quick_view_and_play('defft', samples_defft)
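A numerical comparison (added) backs this up; istft can return slightly fewer samples than the input because of framing, so compare over the common length:
In [ ]:
# Added check : measure the audio -> stft -> istft round-trip error directly
n = min(samples.shape[0], samples_defft.shape[0])  # istft may trim edge samples
print('Max abs round-trip error : %.3e' % (np.max(np.abs(samples[:n] - samples_defft[:n])),))
# Expect something near float32 precision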
In [ ]:
spectrum_real = np.absolute(spectrum_complex)
# effectively, all phase information==0
samples_re_defft = librosa.istft(spectrum_real, hop_length=hop_length, win_length=win_length)
# This has a strange 'phasing effect' (as might be expected...)
quick_view_and_play('re-defft', samples_re_defft)
In [ ]:
def show_spectrum( spectrum, title='Spectrum' ):
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20,4))
    cax = ax.matshow(spectrum, interpolation='nearest', aspect='auto', cmap=plt.cm.afmhot, origin='lower')
    fig.colorbar(cax)
    plt.title(title)
    plt.show()

show_spectrum( np.log(spectrum_real), title='Spectrum (Absolute value)' )
In [ ]:
def real_spectrum_to_samples( spectrum_re, iters=20 ):
    # Initially pick random phases in (-pi, +pi)
    phases = 2.0 * np.pi * np.random.random_sample(spectrum_re.shape) - np.pi
    #phases = np.zeros_like( spectrum_re )
    for i in range(iters):
        spectrum_complex_guess = spectrum_re * np.exp(1.j*phases)
        samples_reim = librosa.istft(spectrum_complex_guess,
                                     hop_length=hop_length, win_length=win_length,
                                     window='hann', center=True,
                                    )
        re_calc_fft = librosa.stft(samples_reim, n_fft=n_fft,
                                   hop_length=hop_length, win_length=win_length,
                                   window='hann', center=True,
                                   dtype=np.complex64)
        phases_next = np.angle( re_calc_fft )  # The phases just reported - use these next iteration
        #phases = (phases+phases_next)/2.0
        # Find the phase difference in the range (-pi, +pi)
        phases_diff = (phases_next - phases + np.pi) % (2 * np.pi) - np.pi
        phases_clipped = np.clip( phases_diff, a_min=-np.pi/8.0, a_max=+np.pi/8.0)
        phases = phases + phases_clipped
        # Monitor convergence of a few bins (the column index assumes hop ~ fft_step*sample_rate)
        print( [ '%+.4f' % p for p in phases_clipped[30:40, int(5.3/fft_step)] ] )
        #print( np.abs(phases_diff).mean() )
    return samples_reim

samples_reim_defft = real_spectrum_to_samples( spectrum_real )
quick_view_and_play('reim-defft', samples_reim_defft)
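The loop above is essentially Griffin-Lim phase recovery with a clipped phase update. For reference, an added sketch: librosa 0.7.0+ ships a built-in implementation that could replace the hand-rolled loop (this notebook was run with librosa 0.5.1, so this cell needs a newer version):
In [ ]:
# Added sketch : built-in Griffin-Lim (requires librosa>=0.7.0)
samples_gl = librosa.griffinlim(spectrum_real, n_iter=20,
                                hop_length=hop_length, win_length=win_length)
quick_view_and_play('griffinlim', samples_gl)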
In [ ]:
import python_speech_features
n_mel_freq_components = 64
#n_mel_freq_components = 256
# create_mel_filters
#mel_inversion_filter = python_speech_features.get_filterbanks(nfft=n_fft, samplerate=sample_rate,
# nfilt=n_mel_freq_components,
# lowfreq = 300.0, highfreq = 8000.0)
# #lowfreq = 0, highfreq = None)
#
#mel_filter = mel_inversion_filter.T / mel_inversion_filter.sum(axis=1)
#mel_filter.shape
mel_filters_from_fft = python_speech_features.get_filterbanks(nfft=n_fft, samplerate=sample_rate,
                                                              nfilt=n_mel_freq_components,
                                                              #lowfreq = 10.0, highfreq = None)
                                                              #lowfreq = 300.0, highfreq = 8000.0)
                                                              #lowfreq = 10, highfreq = None)
                                                             )
#mel_filters_from_fft = np.identity( 257 ) # This is uncompressed : just as a check
mel_filters_from_fft.shape # (64, 257)
mel_filters_from_fft[10:15,10:20]
In [ ]:
#mel_filters_from_fft[15,:]
#mel_filters_from_fft.sum(axis=0) # Total attention paid to each of the FFT bins
mel_filters_from_fft.sum(axis=1) # Amount of FFT contribution to each mel bin
# To share out the weight in a given mel bin, need to scale it down
mel_filters_to_fft = (mel_filters_from_fft / mel_filters_from_fft.sum(axis=1, keepdims=True)).T
#mel_filters_pseudoinverse
spectrum_identity = mel_filters_to_fft.dot(mel_filters_from_fft) # Should be ~I(257,257)
#spectrum_identity[5:10, 5:10] # demonstrates identity and some smear at low frequencies
spectrum_identity[27:35, 27:35] # demonstrates 'smear' but not over/under emphasis
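To put a number on the 'smear' (an added check), compare the product against a true identity matrix:
In [ ]:
# Added check : how far is mel_filters_to_fft.dot(mel_filters_from_fft) from the identity?
eye = np.eye(spectrum_identity.shape[0])
print('Mean abs deviation from identity : %.5f' % (np.abs(spectrum_identity - eye).mean(),))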
In [ ]:
def make_mel(spectrogram, mel_filter_from_fft, shorten_factor=1.0):
    #spectrogram = np.square(spectrogram) # Convert to 'power' ?
    mel_spec = mel_filter_from_fft.dot( spectrogram )
    mel_spec = scipy.ndimage.zoom(mel_spec.astype('float32'),
                                  [1, 1./shorten_factor],  # Shorten in the time direction
                                  mode='nearest', order=1,
                                 ).astype('float32')
    #print(np.min(mel_spec), np.max(mel_spec), )
    #mel_spec_bounded = np.clip(mel_spec, a_min=np.exp(-15.0), a_max=None)
    return mel_spec #_bounded

def mel_to_spectrogram(mel_spec, mel_filter_to_fft, shorten_factor=1.0):
    """
    Takes in a mel spectrogram and returns a linear spectrogram suitable for inversion
    """
    mel_spec_uncompressed = scipy.ndimage.zoom(mel_spec.astype('float32'),
                                               #[shorten_factor, 1],
                                               [1, shorten_factor],  # Re-stretch in the time direction
                                               mode='nearest', order=1,
                                              ).astype('float32')
    spectrum_recovered = mel_filter_to_fft.dot( mel_spec_uncompressed )
    #uncompressed_spec = uncompressed_spec -4
    #spectrum_recovered = np.sqrt( spectrum_recovered ) # Convert from 'power'
    return np.clip(spectrum_recovered, a_min=np.exp(-12.0), a_max=None)
In [ ]:
shorten_factor=1.0 # This is a time-wise compression factor : 1.0 is none
#shorten_factor=4.0 # This is a time-wise compression factor : 4.0 seems harsh
mel_spec = make_mel(spectrum_real, mel_filters_from_fft, shorten_factor=shorten_factor)
#spectrum_real
mel_spec.shape
In [ ]:
# plot the compressed spec
show_spectrum( np.log(mel_spec), 'Mel spectrogram')
In [ ]:
spectrogram_from_mel = mel_to_spectrogram(mel_spec, mel_filters_to_fft, shorten_factor=shorten_factor)
#spectrogram_from_mel = spectrum_identity.dot(spectrum_real)
In [ ]:
# plot the recovered spec
show_spectrum( np.log(spectrogram_from_mel), 'Recovered Spectrogram')
In [ ]:
# And this is the original 'real' one, for comparison
#show_spectrum( np.log( np.clip(spectrum_real, a_min=np.exp(-12.0), a_max=None)), title='Original Spectrum' )
show_spectrum( np.log( spectrum_real ), title='Original Spectrum' )
In [ ]:
samples_via_mel = real_spectrum_to_samples( spectrogram_from_mel, iters=20 )
quick_view_and_play('via-mel', samples_via_mel)
In [ ]:
#['+0.0027', '+0.0111', '+0.0081', '+0.0105', '+0.0106', '+0.0172', '+0.0040', '+0.0095', '+0.0237', '+0.0258']
In [ ]:
import librosa
librosa.__version__ # '0.5.1'
In [ ]:
filename_in = './librivox/guidetomen_02_rowland_64kb.mp3'
filename_out = filename_in.replace('.mp3', '.mel')
#import audioread
#with audioread.audio_open(filename) as f:
# print(f.channels, f.samplerate, f.duration)#print(os.listdir(f))
# Requires 'dnf install ffmpeg' for .mp3 decoding
#samples, sample_rate = librosa.core.load(filename_in, sr=None)
samples, sample_rate = librosa.core.load(filename_in, sr=24000)
samples = samples/np.max(np.abs(samples)) # Force amplitude of waveform into range ~ -1.0 ... +1.0
sample_rate, samples.shape, samples.shape[0]/sample_rate
In [ ]:
#quick_view_and_play('librivox-orig', samples) # Rather large computation for this graph...
audio_playback_widget(filename_in)
In [ ]:
fft_step = 12.5/1000. # 12.5ms
fft_window = 50.0/1000. # 50ms
n_fft = 512*4
hop_length = int(fft_step*sample_rate)
win_length = int(fft_window*sample_rate)
n_mels = 80
fmin = 125 # Hz
#fmax = ~8000
hop_length, win_length
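As an added sanity check, these parameters imply hop_length = 0.0125 x 24000 = 300 samples and win_length = 0.05 x 24000 = 1200 samples, so the STFT should produce roughly one column per 12.5ms of audio:
In [ ]:
# Added check : expected STFT geometry from the parameters above
print('hop=%d samples, win=%d samples' % (hop_length, win_length))  # 300, 1200
print('Expect ~%d STFT frames for %.1fs of audio' %
      (1 + samples.shape[0]//hop_length, samples.shape[0]/sample_rate))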
In [ ]:
# https://librosa.github.io/librosa/generated/librosa.feature.melspectrogram.html
#S = librosa.feature.melspectrogram(y=samples, sr=sample_rate, n_mels=n_mels, fmin=fmin)
In [ ]:
spectra_complex = librosa.stft(samples, n_fft=n_fft,
                               hop_length=hop_length,
                               win_length=win_length, window='hann', )
power_spectra = np.abs(spectra_complex)**2
melspectra = librosa.feature.melspectrogram(S=power_spectra, n_mels=n_mels, fmin=fmin)
spectra_complex.shape, melspectra.shape
In [ ]:
#dir(librosa)
import librosa.display
In [ ]:
plt.figure(figsize=(10, 4))
librosa.display.specshow(librosa.power_to_db(melspectra[:, 0:1024], ref=np.max),
                         y_axis='mel', fmin=125, x_axis='time',
                         hop_length=hop_length, sr=sample_rate)
plt.colorbar(format='%+2.0f dB')
plt.title('Mel spectrogram')
plt.show()
# Section Two Of ... The Guide to Men ,,, This Librivox recording is in the Public Domain ...
# Bachelors ... Somehow ... Just at the psychological moment when a bachelor fancies he's going to die (etc)
In [ ]:
melspectra.min(), melspectra.max(), melspectra.mean(), melspectra.std()
In [ ]:
# ?? Is the following appropriate for us?
#melspectra_floored = np.maximum(0.01, melspectra)
melspectra_floored = np.maximum(0.001, melspectra)
melout = np.log(melspectra_floored)
melout.min(), melout.max(), melout.mean(), melout.std()
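For the return trip (an added note), the log compression is undone exactly by np.exp, recovering the floored mel power values:
In [ ]:
# Added check : the log compression round-trips exactly (apart from the flooring itself)
print( np.allclose(np.exp(melout), melspectra_floored) )  # expect True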
In [ ]:
plt.figure(figsize=(10, 4))
librosa.display.specshow(melout[:, 0:1024], y_axis='mel', fmin=125, x_axis='time')
plt.colorbar(format='%+2.0f dB')
plt.title('Mel spectrogram')
plt.show()
In [ ]:
import pickle
data_dictionary = dict(
    sample_rate=sample_rate,
    hop_length=hop_length, win_length=win_length,
    fmin=fmin,
    mel=melout,
)
pickle.dump(data_dictionary, open(filename_out, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
print("Created dataset : %s" % (filename_out, ))
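Reading the dataset back in is the mirror image (an added sketch):
In [ ]:
# Added sketch : load the pickled dataset back and check its contents
with open(filename_out, 'rb') as fin:
    d = pickle.load(fin)
print( d['sample_rate'], d['mel'].shape )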
In [ ]:
# Interesting, in that it allows vector-valued (x,y) with various penalty term choices :
# https://librosa.github.io/librosa/generated/librosa.core.dtw.html#librosa-core-dtw
In [ ]:
# Do the inverse operation on spectra_complex... and listen to it
samples_defft = librosa.istft(spectra_complex, hop_length=hop_length, win_length=win_length)
In [ ]:
quick_view_and_play('librivox-recreated', samples_defft, show_waveform_graph=False)
In [ ]:
spectra_just_amplitudes = np.abs(spectra_complex) # + 0.j
samples_defft_real = librosa.istft(spectra_just_amplitudes, hop_length=hop_length, win_length=win_length)
quick_view_and_play('librivox-recreated-poorly', samples_defft_real, show_waveform_graph=False)
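The iterative phase-recovery routine defined earlier could be applied to these amplitude-only spectra too (an added suggestion : it picks up the re-defined n_fft/hop_length/win_length globals, and is slow on a recording of this length):
In [ ]:
# Added sketch : iterative phase recovery on the amplitude-only spectra (slow for a full chapter)
samples_recovered = real_spectrum_to_samples( spectra_just_amplitudes, iters=10 )
quick_view_and_play('librivox-recovered', samples_recovered, show_waveform_graph=False)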
For each file in turn, pull out blocks of 1024 spectra (~12.8s at the 12.5ms step). The first 64 spectra of each block will be discarded as lead-in, so the step increment should be 1024-64=960. The incomplete tail block is ignored.
https://www.tensorflow.org/programmers_guide/datasets
http://warmspringwinds.github.io/tensorflow/tf-slim/2016/12/21/tfrecords-guide/
https://indico.io/blog/tensorflow-data-inputs-part1-placeholders-protobufs-queues/
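Concretely (an added illustration, with a hypothetical frame count):
In [ ]:
# Added illustration : block offsets step by 1024-64=960, so consecutive blocks overlap by the lead-in
n_frames = 4000  # hypothetical
print( [o for o in range(0, n_frames - 1024, 1024 - 64)] )  # -> [0, 960, 1920, 2880] ; incomplete tail dropped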
In [ ]:
import numpy as np
import tensorflow as tf
import librosa
librosa.__version__ # '0.5.1'
In [ ]:
sample_rate= 24000 # input will be standardised to this rate
fft_step = 12.5/1000. # 12.5ms
fft_window = 50.0/1000. # 50ms
n_fft = 512*4
hop_length = int(fft_step*sample_rate)
win_length = int(fft_window*sample_rate)
n_mels = 80
fmin = 125 # Hz
#fmax = ~8000
win_length, hop_length
In [ ]:
# And for the training windowing :
steps_total, steps_leadin = 1024, 64
# Test the flatten idea
#a = np.array([3.4, 55.4, 34.23])
a = np.array([[3.4, 55.4,],[34.23, 342.1221]])
a.flatten().tolist()
In [ ]:
# Based on http://warmspringwinds.github.io/tensorflow/tf-slim/2016/12/21/tfrecords-guide/
def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List( value=[value] ))

def _floats_feature(np_arr):
    return tf.train.Feature(float_list=tf.train.FloatList( value=np_arr.flatten().tolist() ))

def convert_wavs_to_spectra_learnable_records(filename_in):
    filename_base = filename_in.replace('.mp3', '_%s.tfrecords')

    samples, _sample_rate = librosa.core.load(filename_in, sr=sample_rate)
    samples = samples/np.max(np.abs(samples)) # Force amplitude of waveform into range ~ -1.0 ... +1.0

    spectra_complex = librosa.stft(samples, n_fft=n_fft,
                                   hop_length=hop_length,
                                   win_length=win_length, window='hann', )
    power_spectra = np.abs(spectra_complex)**2
    melspectra = librosa.feature.melspectrogram(S=power_spectra, n_mels=n_mels, fmin=fmin)

    with tf.python_io.TFRecordWriter(filename_base % ('train',)) as writer_train, \
         tf.python_io.TFRecordWriter(filename_base % ('valid',)) as writer_valid, \
         tf.python_io.TFRecordWriter(filename_base % ('test',)) as writer_test :
        # Ok, now create a series of Examples with these features
        for offset in range(0, melspectra.shape[1]-steps_total, steps_total-steps_leadin):
            example = tf.train.Example(features=tf.train.Features(feature={
                #'height': _int64_feature(height),
                #'width': _int64_feature(width),
                #'image_raw': _bytes_feature(img_raw),
                #'mask_raw': _bytes_feature(annotation_raw)
                'mel':          _floats_feature(melspectra[:, offset:offset+steps_total]),
                'spectra_real': _floats_feature( spectra_complex[:, offset:offset+steps_total].real ),
                'spectra_imag': _floats_feature( spectra_complex[:, offset:offset+steps_total].imag ),
            }))

            # Allocate these between the train/validation/test files (roughly 80%/10%/10%)
            w = writer_train
            if np.random.random()>0.8:
                w = writer_valid
                if np.random.random()>0.5:
                    w = writer_test
            w.write(example.SerializeToString())
In [ ]:
for i in [1,2,3,]:
    convert_wavs_to_spectra_learnable_records('./librivox/guidetomen_%02d_rowland_64kb.mp3' % (i,))
print("DONE!")
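To read these records back (an added sketch, assuming the TF 1.x API that matches tf.python_io above; the key names and shapes follow the writer):
In [ ]:
# Added sketch : parse the TFRecords back into tensors (TF 1.x style)
n_spectra_bins = n_fft//2 + 1  # 1025
def parse_example(serialized):
    features = tf.parse_single_example(serialized, features={
        'mel':          tf.FixedLenFeature([n_mels*steps_total], tf.float32),
        'spectra_real': tf.FixedLenFeature([n_spectra_bins*steps_total], tf.float32),
        'spectra_imag': tf.FixedLenFeature([n_spectra_bins*steps_total], tf.float32),
    })
    mel          = tf.reshape(features['mel'], [n_mels, steps_total])
    spectra_real = tf.reshape(features['spectra_real'], [n_spectra_bins, steps_total])
    spectra_imag = tf.reshape(features['spectra_imag'], [n_spectra_bins, steps_total])
    return mel, spectra_real, spectra_imag

dataset = tf.data.TFRecordDataset('./librivox/guidetomen_01_rowland_64kb_train.tfrecords')
dataset = dataset.map(parse_example)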
In [ ]: