In [7]:
%pylab --no-import-all inline
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import features as F
import training_data
import extract
In [8]:
def spectral_comp(data, slicer=lambda x: x, NFFT=256):
    f, axes = plt.subplots(len(data), 1, figsize=(16, 3*len(data)), sharex=True)
    for ax, d in zip(axes.flat, data):
        ax.specgram(slicer(d), NFFT=NFFT)
    return f
def show_Q(Q, x, y):
    n = Q.shape[0]
    f, axes = plt.subplots(y, x, figsize=(2 * x, 2 * y), sharey=True, sharex=True)
    for i, a in enumerate(axes.flat):
        if i == n:
            break
        a.imshow(Q[i].T)
        a.axis('off')
    f.tight_layout()
    return f
In [9]:
# Build a "tone palette" for one English recording: MFCCs -> stacked
# delta/delta-delta features -> low-energy frame filtering -> k=40 ordered centroids.
eng3, fs = training_data.get_data(4, 'english')
eng_mfcc = F.mfcc_atomic(eng3, fs)
eng_grad = F.stack_double_deltas(eng_mfcc)
eng_filtered = F.low_energy_filter(eng_grad, 15)
tone_palette = F.norm_ordered_centroids(eng_filtered, k=40)
In [10]:
f, (a1, a2) = plt.subplots(2, 1, figsize=(14, 4))
a1.plot(eng3[5000:30000])
a2.specgram(eng3[4000:31000])
a1.set_title("1-Second of Speech")
a2.set_title("Spectrogram")
a1.get_xaxis().set_visible(False)
a2.get_xaxis().set_visible(False)
a1.get_yaxis().set_visible(False)
a2.get_yaxis().set_visible(False)
In [14]:
f, a1 = plt.subplots(1, 1, figsize=(14, 4))
a1.imshow(np.log2(np.abs(tone_palette.T)))
a1.set_title("Full Stream Tone Palette of Single Speaker")
Out[14]:
In [5]:
languages = ['english', 'dutch', 'french', 'spanish', 'german']
data_sources = [
    list(training_data.gen_get_data(xrange(2, 4), lang, 22050))
    for lang in languages]
feature_gens = [extract.mfcc_centroids_from_raw(source, 13)
                for source in data_sources]
In [5]:
Qs = [extract.mfcc_collection_as_matrix(fgen) for fgen in feature_gens]
In [6]:
[Q.shape for Q in Qs]
Out[6]:
Q is our comparison tensor, with Q.shape = (n, k, mfcc_coeffs), where n is the number of training samples, k is the number of centroids per sample, and mfcc_coeffs is the number of MFCC coefficients.
Premise: our features for speaker recognition are the MFCC centroids of the elicitation paragraph. Goal: take a signal that is a (possibly distorted) combination of several signals and determine which of them are contributing to it. It's like a vocal-filter autoencoder ...
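One hedged sketch of how this tensor could be used for comparison (the scoring rule below is an illustrative assumption, not a function defined in this notebook): score a query recording's centroid matrix C, of shape (k_query, mfcc_coeffs), against every training sample in Q at once, averaging the distance from each query centroid to its nearest training centroid.
In [ ]:
def centroid_match_scores(Q, C):
    # Q: (n, k, d) training centroid tensor; C: (k_query, d) query centroids.
    # For each training sample Q[i], average the distance from every query
    # centroid to its nearest centroid in Q[i]; a lower score means a closer match.
    # Illustrative scoring rule only, not the method used in this notebook.
    dists = np.linalg.norm(Q[:, :, None, :] - C[None, None, :, :], axis=-1)  # (n, k, k_query)
    return dists.min(axis=1).mean(axis=1)  # one score per training sample, shape (n,)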
In [7]:
figs = [show_Q(Q, 3, 3) for Q in Qs];
Note that some Q matrices are brighter than others; my hypothesis is that those recordings contain 'more speaking' (i.e. fewer pauses). The next charts attempt to identify those recordings.
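As a rough way to quantify 'more speaking', one could compare the average log energy of the MFCC frames across recordings; pause-heavy recordings should score lower because silent frames have small norms. A minimal sketch (the helper is hypothetical, assumes a frame matrix of shape (n_frames, n_coeffs), and uses only NumPy):
In [ ]:
def mean_log_frame_energy(frames):
    # Average log L2 norm over MFCC frames; recordings with fewer pauses
    # should have a higher average than pause-heavy ones.
    return np.log(np.linalg.norm(frames, axis=1)).mean()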
In [16]:
mfcc_stream = extract.mfcc(data_sources[0][0][0])
In [26]:
f, ax = plt.subplots(1, 1)
ax.errorbar(range(mfcc_stream[0].shape[1]), mfcc_stream[0].mean(0), yerr=mfcc_stream[0].std(0))
ax.set_xlim(-1, 13)
Out[26]:
In [76]:
# Histogram of log MFCC norms (plt.hist returns counts, bin edges, patches).
counts, starts, patches = plt.hist(np.log(np.linalg.norm(mfcc_stream[0], axis=0)), bins=10)
In [34]:
import features as F
In [35]:
x = data_sources[0][0][0]
f, (a1, a2, a3) = plt.subplots(3, 1, figsize=(14, 6))
a2.specgram(x[:10000], Fs=22050, NFFT=256)
a1.plot(x[:10000])
a3.imshow(F.mfcc_centroid(x[:10000], 22050., 10.)[0].T)
In [85]:
reload(F)
Out[85]:
In [95]:
f, a = plt.subplots(1, 4, figsize=(14, 5))
qs = [5., 10., 20., 30.]
for (ax, q) in zip(a.flat, qs):
    c = F.mfcc_centroid(x, fs=22050,
                        mfcc_kwargs={'nwin': 256, 'nceps': 13,
                                     'nfft': 512, 'drop': 1},
                        filter_percentile=q)
    ax.imshow(c)
In [91]:
plt.imshow(F.mfcc_atomic(x[:10000], 22050., drop=1).T);
In [94]:
plt.imshow(F.low_energy_filter(F.mfcc_atomic(x[:10000], 22050., drop=1), 15).T)
Out[94]:
In [79]:
q = np.array([[1, 2],[2, 3]])[:, 1:]
In [80]:
q
Out[80]:
In [ ]: