In [ ]:
import os

import numpy as np
import matplotlib.pyplot as plt
import scipy.ndimage  # for spectrogram resizing (scipy.ndimage.zoom, used below)

#import scipy.io.wavfile

# pip install soundfile librosa python_speech_features
import soundfile

from IPython.display import Audio as audio_playback_widget

In [ ]:
#f = './data/raw-from-phone.wav'  # un-normalized
f = './data/num_phone_en-UK_m_Martin14.wav'  # has been normed

In [ ]:
# Read in the original file
samples, sample_rate = soundfile.read(f)

samples = samples / np.max(np.abs(samples))  # Normalise the signal to unit peak amplitude

def show_waveform(sound):
    n_samples = sound.shape[0]

    plt.figure(figsize=(12,2))
    plt.plot(np.arange(0.0, n_samples)/sample_rate, sound)
    plt.xticks( np.arange(0.0, n_samples/sample_rate, 0.5), rotation=90 )

    plt.grid(True)

    plt.show()

show_waveform(samples)
audio_playback_widget(f)

In [ ]:
n_fft = 512

fft_step   = 0.010 # 10ms
fft_window = 0.025 # 25ms

In [ ]:
import librosa

#hop_length=int(fft_step*sample_rate)   # number of audio samples between STFT columns
#win_length=int(fft_window*sample_rate) # number of audio samples in each STFT window

win_length=None # defaults to n_fft
hop_length=None # defaults to win_length // 4

spectrum_complex = librosa.stft(samples, n_fft=n_fft, 
                       hop_length=hop_length, win_length=win_length,
                       window='hann', center=True, 
                       dtype=np.complex64) # This has real and imaginary parts each as float32

spectrum_complex.shape
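
A quick sanity check (a sketch): with center=True and the default hop of n_fft//4, the STFT should have n_fft//2 + 1 frequency bins and 1 + len(samples)//hop frames.

In [ ]:
# Sanity check (sketch): bin and frame counts for the default hop length
hop_default = n_fft // 4
assert spectrum_complex.shape[0] == n_fft // 2 + 1
assert spectrum_complex.shape[1] == 1 + len(samples) // hop_default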

In [ ]:
samples_defft = librosa.istft(spectrum_complex, 
                              hop_length=hop_length, win_length=win_length)

In [ ]:
def quick_view_and_play(stub, samples_here, show_waveform_graph=True):
    os.makedirs('./tmp', exist_ok=True)  # make sure the output directory exists
    f = './tmp/%s.wav' % (stub,)
    soundfile.write(f, samples_here/np.max(samples_here), samplerate=sample_rate)
    if show_waveform_graph:
        show_waveform(samples_here)
    return audio_playback_widget(f)

In [ ]:
# This demonstrates that the audio -> stft(complex) -> istft(complex) -> audio round-trip is essentially lossless
quick_view_and_play('defft', samples_defft)

In [ ]:
spectrum_real = np.absolute(spectrum_complex)
# effectively, all phase information==0

samples_re_defft = librosa.istft(spectrum_real, hop_length=hop_length, win_length=win_length)

# This has a strange 'phasing effect' (as might be expected...)
quick_view_and_play('re-defft', samples_re_defft)

In [ ]:
def show_spectrum( spectrum, title='Spectrum' ):
    fig, ax = plt.subplots(nrows=1,ncols=1, figsize=(20,4))

    cax = ax.matshow(spectrum, interpolation='nearest', aspect='auto', cmap=plt.cm.afmhot, origin='lower')
    fig.colorbar(cax)
    plt.title(title)
    plt.show()
    
show_spectrum( np.log(spectrum_real), title='Spectrum (Absolute value)' )

In [ ]:
def real_spectrum_to_samples( spectrum_re, iters=20 ):
    # Griffin-Lim-style iteration : initially pick random phases in (-pi, +pi)
    phases = 2.0 * np.pi * np.random.random_sample(spectrum_re.shape) - np.pi
    #phases = np.zeros_like( spectrum_re )

    for i in range(iters):
        spectrum_complex_guess = spectrum_re * np.exp(1.j*phases)

        samples_reim = librosa.istft(spectrum_complex_guess, 
                                     hop_length=hop_length, win_length=win_length,
                                     window='hann', center=True, 
                                    )

        re_calc_fft = librosa.stft(samples_reim, n_fft=n_fft, 
                         hop_length=hop_length, win_length=win_length,
                         window='hann', center=True, 
                         dtype=np.complex64)

        phases_next = np.angle( re_calc_fft )  # Phases implied by the re-analysis : use these next iteration
        #phases = (phases+phases_next)/2.0
        
        # Find the phase difference in the range (-pi, +pi)
        phases_diff = (phases_next - phases + np.pi) % (2 * np.pi ) - np.pi
        
        phases_clipped = np.clip( phases_diff, a_min=-np.pi/8.0, a_max=+np.pi/8.0)
        phases = phases + phases_clipped
        
        print( [ '%+.4f' % p for p in phases_clipped[30:40, int(5.3/fft_step)] ] )
        #print( np.abs(phases_diff).mean() )
        
    return samples_reim
    
samples_reim_defft = real_spectrum_to_samples( spectrum_real )

quick_view_and_play('reim-defft', samples_reim_defft)
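
The loop above is essentially the Griffin-Lim algorithm, with a clipped phase update. Recent librosa releases (0.7 and later) include this built in; a hedged sketch of the equivalent call, assuming such a version is available:

In [ ]:
# Sketch: built-in Griffin-Lim (requires librosa >= 0.7, newer than the
# 0.5.1 version noted later in this notebook)
samples_gl = librosa.griffinlim(spectrum_real, n_iter=20,
                                hop_length=hop_length, win_length=win_length,
                                window='hann', center=True)
quick_view_and_play('griffinlim', samples_gl)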

In [ ]:
import python_speech_features
n_mel_freq_components = 64
#n_mel_freq_components = 256

# create_mel_filters
#mel_inversion_filter = python_speech_features.get_filterbanks(nfft=n_fft, samplerate=sample_rate,
#                                        nfilt=n_mel_freq_components,
#                                        lowfreq = 300.0, highfreq = 8000.0)
#                                        #lowfreq = 0, highfreq = None)
#
#mel_filter = mel_inversion_filter.T / mel_inversion_filter.sum(axis=1)
#mel_filter.shape

mel_filters_from_fft = python_speech_features.get_filterbanks(nfft=n_fft, samplerate=sample_rate,
                                        nfilt=n_mel_freq_components,
                                        #lowfreq = 10.0, highfreq = None)
                                        #lowfreq = 300.0, highfreq = 8000.0)
                                        #lowfreq = 10, highfreq = None)
                                                    )

#mel_filters_from_fft = np.identity( 257 ) # This is uncompressed : just as a check

mel_filters_from_fft.shape   # (64, 257)
mel_filters_from_fft[10:15,10:20]

In [ ]:
#mel_filters_from_fft[15,:]
#mel_filters_from_fft.sum(axis=0)  # Total attention paid to each of the FFT bins
mel_filters_from_fft.sum(axis=1)  # Amount of FFT contribution to each mel bin

# To share a mel bin's value back out over its FFT bins, scale each row down by its total weight
mel_filters_to_fft = (mel_filters_from_fft / mel_filters_from_fft.sum(axis=1, keepdims=True)).T
# i.e. a cheap approximate pseudoinverse of the filterbank

spectrum_identity = mel_filters_to_fft.dot(mel_filters_from_fft)  # Should be ~I(257,257)
#spectrum_identity[5:10, 5:10] # demonstrates identity and some smear at low frequencies
spectrum_identity[27:35, 27:35] # demonstrates 'smear' but not over/under emphasis
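
An alternative to this normalized-transpose inverse is the Moore-Penrose pseudoinverse; a sketch for comparison (note it can yield small negative values in a recovered spectrum, which would need clipping before any log):

In [ ]:
# Sketch: least-squares inverse of the mel filterbank via np.linalg.pinv
mel_filters_pinv = np.linalg.pinv(mel_filters_from_fft)  # shape (257, 64)
identity_pinv = mel_filters_pinv.dot(mel_filters_from_fft)  # should also be ~I(257,257)
identity_pinv[27:35, 27:35]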

In [ ]:
def make_mel(spectrogram, mel_filter_from_fft, shorten_factor=1.0):
    #spectrogram = np.square(spectrogram)    # Convert to 'power' ?
    mel_spec = mel_filter_from_fft.dot( spectrogram )
    
    mel_spec = scipy.ndimage.zoom(mel_spec.astype('float32'), 
                                   [1, 1./shorten_factor],  # Shorten in time direction
                                   mode='nearest', order=1,
                                 ).astype('float32')
    #print(np.min(mel_spec), np.max(mel_spec), )
    
    #mel_spec_bounded = np.clip(mel_spec, a_min=np.exp(-15.0), a_max=None)
    return mel_spec #_bounded

def mel_to_spectrogram(mel_spec, mel_filter_to_fft, shorten_factor=1.0):
    """
    takes in an mel spectrogram and returns a normal spectrogram for inversion 
    """
    mel_spec_uncompressed = scipy.ndimage.zoom(mel_spec.astype('float32'), 
                                               #[shorten_factor,1], 
                                               [1, shorten_factor], 
                                               mode='nearest', order=1,
                                              ).astype('float32')

    spectrum_recovered = mel_filter_to_fft.dot( mel_spec_uncompressed )
    
    #uncompressed_spec = uncompressed_spec -4
    
    #spectrum_recovered = np.sqrt( spectrum_recovered )  # Convert from 'power'
    
    return np.clip(spectrum_recovered, a_min=np.exp(-12.0), a_max=None)

In [ ]:
shorten_factor=1.0  # This is a time-wise compression factor : 1.0 is none
#shorten_factor=4.0  # This is a time-wise compression factor : 4.0 seems harsh

mel_spec = make_mel(spectrum_real, mel_filters_from_fft, shorten_factor=shorten_factor)

#spectrum_real
mel_spec.shape

In [ ]:
# plot the compressed spec
show_spectrum( np.log(mel_spec), 'mel Spectrogram')

In [ ]:
spectrogram_from_mel = mel_to_spectrogram(mel_spec, mel_filters_to_fft, shorten_factor=shorten_factor)
#spectrogram_from_mel = spectrum_identity.dot(spectrum_real)

In [ ]:
# plot the recovered spec
show_spectrum( np.log(spectrogram_from_mel), 'Recovered Spectrogram')

In [ ]:
# And this is the original 'real' one, for comparison 
#show_spectrum( np.log( np.clip(spectrum_real, a_min=np.exp(-12.0), a_max=None)), title='Original Spectrum' )
show_spectrum( np.log( spectrum_real ), title='Original Spectrum' )

In [ ]:
samples_via_mel = real_spectrum_to_samples( spectrogram_from_mel, iters=20 )

quick_view_and_play('via-mel', samples_via_mel)

In [ ]:
# Sample of the per-iteration phase-update printout from above, showing convergence :
#['+0.0027', '+0.0111', '+0.0081', '+0.0105', '+0.0106', '+0.0172', '+0.0040', '+0.0095', '+0.0237', '+0.0258']

Turn mp3 into mels


In [ ]:
import librosa
librosa.__version__  # '0.5.1'

In [ ]:
filename_in = './librivox/guidetomen_02_rowland_64kb.mp3'
filename_out = filename_in.replace('.mp3', '.mel')

#import audioread
#with audioread.audio_open(filename) as f:
#    print(f.channels, f.samplerate, f.duration)#print(os.listdir(f))

# Requires 'dnf install ffmpeg' for .mp3 decoding

#samples, sample_rate = librosa.core.load(filename_in, sr=None)
samples, sample_rate = librosa.core.load(filename_in, sr=24000)
samples = samples/np.max(samples)  # Force amplitude of waveform into range ~-1 ... +1.0

sample_rate, samples.shape, samples.shape[0]/sample_rate

In [ ]:
#quick_view_and_play('librivox-orig', samples) # Rather large computation for this graph...
audio_playback_widget(filename_in)

In [ ]:
fft_step   = 12.5/1000. # 12.5ms
fft_window = 50.0/1000.  # 50ms

n_fft = 512*4

hop_length = int(fft_step*sample_rate)
win_length = int(fft_window*sample_rate)

n_mels = 80
fmin = 125 # Hz
#fmax = ~8000

hop_length, win_length

In [ ]:
# https://librosa.github.io/librosa/generated/librosa.feature.melspectrogram.html
#S = librosa.feature.melspectrogram(y=samples, sr=sample_rate, n_mels=n_mels, fmin=fmin)

In [ ]:
spectra_complex = librosa.stft(samples, n_fft=n_fft, 
                       hop_length=hop_length, 
                       win_length=win_length, window='hann', )

power_spectra = np.abs(spectra_complex)**2
melspectra = librosa.feature.melspectrogram(S=power_spectra, n_mels=n_mels, fmin=fmin)

spectra_complex.shape, melspectra.shape
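
For clarity, passing S=power_spectra is equivalent to multiplying the power spectrogram by a mel filterbank matrix directly; a small sketch to confirm:

In [ ]:
# Sketch: melspectrogram(S=...) is just a mel filterbank matrix product
mel_fb = librosa.filters.mel(sr=sample_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin)
np.allclose(melspectra, mel_fb.dot(power_spectra))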

In [ ]:
#dir(librosa)
import librosa.display

In [ ]:
plt.figure(figsize=(10, 4))
librosa.display.specshow(librosa.power_to_db(melspectra[:,0:1024], ref=np.max), 
                         y_axis='mel', fmin=125, x_axis='time', 
                         hop_length=hop_length, sr=sample_rate)
plt.colorbar(format='%+2.0f dB')
plt.title('Mel spectrogram')
plt.show()
# Section Two of ... The Guide to Men ... This Librivox recording is in the Public Domain ...
# Bachelors ... Somehow ... Just at the psychological moment when a bachelor fancies he's going to die (etc)

In [ ]:
melspectra.min(), melspectra.max(), melspectra.mean(), melspectra.std()

In [ ]:
# ??  Is the following appropriate for us?
#melspectra_floored = np.maximum(0.01, melspectra)
melspectra_floored = np.maximum(0.001, melspectra)

melout = np.log(melspectra_floored)
melout.min(), melout.max(), melout.mean(), melout.std()
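
Since flooring followed by log is the only compression applied, np.exp inverts it exactly, apart from values lost to the floor; a quick sketch:

In [ ]:
# Sketch: undo the log compression (values below the floor are not recoverable)
np.allclose(np.exp(melout), melspectra_floored)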

In [ ]:
plt.figure(figsize=(10, 4))
librosa.display.specshow(melout[:, 0:1024], y_axis='mel', fmin=125, x_axis='time')
plt.colorbar(format='%+2.0f dB')
plt.title('Mel spectrogram')
plt.show()

In [ ]:
import pickle

data_dictionary = dict(
    sample_rate=sample_rate, 
    hop_length=hop_length, win_length=win_length,
    fmin=fmin, 
    mel=melout,
)

pickle.dump(data_dictionary, open(filename_out, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
print("Created dataset : %s" % (filename_out, ))

In [ ]:
# Interesting, in that it allows vector-valued (x,y) with various penalty term choices  :
# https://librosa.github.io/librosa/generated/librosa.core.dtw.html#librosa-core-dtw

In [ ]:
# Do the inverse operation on spectra_complex... and listen to it
samples_defft = librosa.istft(spectra_complex, hop_length=hop_length, win_length=win_length)

In [ ]:
quick_view_and_play('librivox-recreated', samples_defft, show_waveform_graph=False)

In [ ]:
spectra_just_amplitudes = np.abs(spectra_complex) # + 0.j
samples_defft_real = librosa.istft(spectra_just_amplitudes, hop_length=hop_length, win_length=win_length)
quick_view_and_play('librivox-recreated-poorly', samples_defft_real, show_waveform_graph=False)

In [ ]:
import numpy as np
import tensorflow as tf

import librosa
librosa.__version__  # '0.5.1'

In [ ]:
sample_rate = 24000  # input will be standardised to this rate

fft_step   = 12.5/1000. # 12.5ms
fft_window = 50.0/1000.  # 50ms

n_fft = 512*4

hop_length = int(fft_step*sample_rate)
win_length = int(fft_window*sample_rate)

n_mels = 80
fmin = 125 # Hz
#fmax = ~8000

win_length, hop_length

In [ ]:
# And for the training windowing :
steps_total, steps_leadin = 1024, 64

# Test the flatten idea
#a = np.array([3.4, 55.4, 34.23])
a = np.array([[3.4, 55.4,],[34.23, 342.1221]])
a.flatten().tolist()

In [ ]:
# Based on http://warmspringwinds.github.io/tensorflow/tf-slim/2016/12/21/tfrecords-guide/

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List( value=[value] ))
def _floats_feature(np_arr):
    return tf.train.Feature(float_list=tf.train.FloatList( value=np_arr.flatten().tolist() ))

def convert_wavs_to_spectra_learnable_records(filename_in):
    filename_base = filename_in.replace('.mp3', '_%s.tfrecords')

    samples, _sample_rate = librosa.core.load(filename_in, sr=sample_rate)
    samples = samples/np.max(samples)  # Force amplitude of waveform into range ~-1 ... +1.0

    spectra_complex = librosa.stft(samples, n_fft=n_fft, 
                       hop_length=hop_length, 
                       win_length=win_length, window='hann', )

    power_spectra = np.abs(spectra_complex)**2
    melspectra = librosa.feature.melspectrogram(S=power_spectra, n_mels=n_mels, fmin=fmin)

    with tf.python_io.TFRecordWriter(filename_base % ('train',)) as writer_train, \
         tf.python_io.TFRecordWriter(filename_base % ('valid',)) as writer_valid, \
         tf.python_io.TFRecordWriter(filename_base % ('test',)) as writer_test :
                
        # Ok, now create a series of Examples with these features
        for offset in range(0, melspectra.shape[1]-steps_total, steps_total-steps_leadin):
            example = tf.train.Example(features=tf.train.Features(feature={
                #'height': _int64_feature(height),
                #'width': _int64_feature(width),
                #'image_raw': _bytes_feature(img_raw),
                #'mask_raw': _bytes_feature(annotation_raw)
                'mel': _floats_feature(melspectra[:, offset:offset+steps_total]),
                'spectra_real': _floats_feature( spectra_complex[:, offset:offset+steps_total].real ),
                'spectra_imag': _floats_feature( spectra_complex[:, offset:offset+steps_total].imag ),
            }))

            w = writer_train  # Allocate these between the various train/validation/test files
            if np.random.random()>0.8:
                w = writer_valid
                if np.random.random()>0.5:
                    w = writer_test

            w.write(example.SerializeToString())

In [ ]:
for i in [1,2,3,]:
    convert_wavs_to_spectra_learnable_records('./librivox/guidetomen_%02d_rowland_64kb.mp3' % (i,))
print("DONE!")
