In [7]:
%pylab --no-import-all inline
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import features as F
import training_data
import extract
In [8]:
def spectral_comp(data, slicer=lambda x: x, NFFT=256):
    f, axes = plt.subplots(len(data), 1, figsize=(16, 3*len(data)), sharex=True)
    for ax, d in zip(axes.flat, data):
        ax.specgram(slicer(d), NFFT=NFFT)
    return f
def show_Q(Q, x, y):
    n = Q.shape[0]
    f, axes = plt.subplots(y, x, figsize=(2 * x, 2 * y), sharey=True, sharex=True)
    for i, a in enumerate(axes.flat):
        if i == n:
            break
        a.imshow(Q[i].T)
        a.axis('off')
    f.tight_layout()
    return f
In [9]:
# Build a "tone palette" for one English recording: MFCCs -> stacked
# delta/delta-delta features -> low-energy frame filtering -> k=40 ordered centroids.
eng3, fs = training_data.get_data(4, 'english')
eng_mfcc = F.mfcc_atomic(eng3, fs)
eng_grad = F.stack_double_deltas(eng_mfcc)
eng_filtered = F.low_energy_filter(eng_grad, 15)
tone_palette = F.norm_ordered_centroids(eng_filtered, k=40)
In [10]:
f, (a1, a2) = plt.subplots(2, 1, figsize=(14, 4))
a1.plot(eng3[5000:30000])
a2.specgram(eng3[4000:31000])
a1.set_title("1-Second of Speech")
a2.set_title("Spectrogram")
a1.get_xaxis().set_visible(False)
a2.get_xaxis().set_visible(False)
a1.get_yaxis().set_visible(False)
a2.get_yaxis().set_visible(False)
In [14]:
f, a1 = plt.subplots(1, 1, figsize=(14, 4))
a1.imshow(np.log2(np.abs(tone_palette.T)))
a1.set_title("Full Stream Tone Palette of Single Speaker")
Out[14]:
In [5]:
languages = ['english', 'dutch', 'french', 'spanish', 'german']
data_sources = [
    list(training_data.gen_get_data(xrange(2, 4), lang, 22050))
    for lang in languages]
feature_gens = [extract.mfcc_centroids_from_raw(source, 13)
                for source in data_sources]
In [5]:
Qs = [extract.mfcc_collection_as_matrix(fgen) for fgen in feature_gens]
In [6]:
[Q.shape for Q in Qs]
Out[6]:
Q is our comparison tensor, with Q.shape = (n, k, mfcc_coeffs), where n is the number of training samples, k is the number of centroids per sample, and mfcc_coeffs is the number of MFCC coefficients.
Premise: our features for speaker recognition are the MFCC centroids of the elicitation paragraph. Goal: take a signal that is a (possibly distorted) combination of several signals and determine which of them are contributing to it. It's like a vocal-filter autoencoder ...
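One hedged sketch of how this tensor could be used for comparison (the scoring rule below is an illustrative assumption, not a function defined in this notebook): score a query recording's centroid matrix C, of shape (k_query, mfcc_coeffs), against every training sample in Q at once, averaging the distance from each query centroid to its nearest training centroid.
In [ ]:
def centroid_match_scores(Q, C):
    # Q: (n, k, d) training centroid tensor; C: (k_query, d) query centroids.
    # For each training sample Q[i], average the distance from every query
    # centroid to its nearest centroid in Q[i]; a lower score means a closer match.
    # Illustrative scoring rule only, not the method used in this notebook.
    dists = np.linalg.norm(Q[:, :, None, :] - C[None, None, :, :], axis=-1)  # (n, k, k_query)
    return dists.min(axis=1).mean(axis=1)  # one score per training sample, shape (n,)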
In [7]:
figs = [show_Q(Q, 3, 3) for Q in Qs];
Note that some Q matrices are brighter than others; my hypothesis is that those recordings contain 'more speaking' (i.e. fewer pauses). The next charts attempt to identify those recordings.
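As a rough way to quantify 'more speaking', one could compare the average log energy of the MFCC frames across recordings; pause-heavy recordings should score lower because silent frames have small norms. A minimal sketch (the helper is hypothetical, assumes a frame matrix of shape (n_frames, n_coeffs), and uses only NumPy):
In [ ]:
def mean_log_frame_energy(frames):
    # Average log L2 norm over MFCC frames; recordings with fewer pauses
    # should have a higher average than pause-heavy ones.
    return np.log(np.linalg.norm(frames, axis=1)).mean()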
In [16]:
mfcc_stream = extract.mfcc(data_sources[0][0][0])
In [26]:
f, ax = plt.subplots(1, 1)
ax.errorbar(range(mfcc_stream[0].shape[1]), mfcc_stream[0].mean(0), yerr=mfcc_stream[0].std(0))
ax.set_xlim(-1, 13)
Out[26]:
In [76]:
# Histogram of log MFCC norms (plt.hist returns counts, bin edges, patches).
counts, starts, patches = plt.hist(np.log(np.linalg.norm(mfcc_stream[0], axis=0)), bins=10)
In [34]:
import features as F
In [35]:
x = data_sources[0][0][0]
f, (a1, a2, a3) = plt.subplots(3, 1, figsize=(14, 6))
a2.specgram(x[:10000], Fs=22050, NFFT=256)
a1.plot(x[:10000])
a3.imshow(F.mfcc_centroid(x[:10000], 22050., 10.)[0].T)
In [85]:
reload(F)
Out[85]:
In [95]:
f, a = plt.subplots(1, 4, figsize=(14, 5))
qs = [5., 10., 20., 30.]
for (ax, q) in zip(a.flat, qs):
    c = F.mfcc_centroid(x, fs=22050,
                        mfcc_kwargs={'nwin': 256, 'nceps': 13,
                                     'nfft': 512, 'drop': 1},
                        filter_percentile=q)
    ax.imshow(c)
In [91]:
plt.imshow(F.mfcc_atomic(x[:10000], 22050., drop=1).T);
In [94]:
plt.imshow(F.low_energy_filter(F.mfcc_atomic(x[:10000], 22050., drop=1), 15).T)
Out[94]:
In [79]:
q = np.array([[1, 2],[2, 3]])[:, 1:]
In [80]:
q
Out[80]:
In [ ]: