In [1]:
%pylab --no-import-all inline
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from tone_pallettes import dataset as patterns


Populating the interactive namespace from numpy and matplotlib

The master Dataset


In [2]:
values = patterns('processed/')
SAR = pd.read_table("speech_archive_meta.tsv")
master = pd.merge(SAR, values, how='inner', 
         left_on=['first_language', 'recording_num'], 
         right_on=['lang', 'num'])

def plot_hist(column, n=30, **kwargs):
    return master[column].groupby(master[column]).size().order(ascending=False)[:n].plot(kind='bar', **kwargs)

ax = plot_hist('first_language', 10, title='GMU-SAA: Top 10 Languages', color='gray')
ax.set_xlabel('First Language')
ax.set_ylabel('Number of Samples')


Out[2]:
<matplotlib.text.Text at 0x10665d910>

In [3]:
from collections import defaultdict

lang_supergroup = defaultdict(lambda: 'other')
lang_supergroup.update({k: 'italic' for k in (
                        'spanish', 'french', 'portuguese',
                        'romanian', 'occitan', 'catalan', 'italian',)})
lang_supergroup.update({k: 'germanic' for k in ('english', 'dutch', 'german', 'swedish',
                                                'danish', 'norwegian', 'icelandic')})
lang_supergroup.update({k: 'slavic' for k in ('russian', 'polish', 'serbian', 'macedonian', 
                                              'bulgarian', 'ukrainian', 'armenian', 'czech',
                                              'bosnian')})
lang_supergroup.update({k: 'indo-iranian' for k in ('hindi', 'urdu', 'bangali', 
                                                    'gujarati', 'kurdish',
                                                    'awadhi', 'punjabi', 'farsi')})
lang_supergroup.update({k: 'east-asian' for k in ('japanese', 'mandarin', 'korean', 
                                                  'cantonese', 'vietnamese')})
#lang_supergroup.update({k: 'turkic' for k in ('turkish', 'azerbaijani', 'gagauz', 'oghuz')})
lang_supergroup.update({k: 'afro-asiatic' for k in ('arabic', 'amharic')})

                        
master['is_english'] = master.apply(lambda row: row['first_language'] == 'english', axis=1)
master['lang_supergroup'] = master.apply(lambda row: lang_supergroup[row['first_language']], axis=1)

In [4]:
ax2 = plot_hist('lang_supergroup', 10, title="GMU-SAA: Linguistic Grouping", color='gray')
ax2.set_xlabel("Language Group")
ax2.set_ylabel("Number of Samples")


Out[4]:
<matplotlib.text.Text at 0x106993b50>

Preprocessing

  1. Encode Labels
  2. Set up cross-validation
  3. shuffle

In [62]:
s = master['shape'][0]
plt.imshow(abs(master.data[np.random.randint(0, 1200)].reshape(master['shape'][1240])), cmap='gray_r')
plt.colorbar()
plt.title("Absolute Value %i x %i Tone-Pallette" % s)
plt.xlabel("$MFCC + MFCC^\prime + MFCC^{\prime\prime}$")
plt.ylabel("Cluster Centroid")


Out[62]:
<matplotlib.text.Text at 0x10bb3d4d0>

In [28]:
# The Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import cross_val_score, train_test_split

LE = LabelEncoder().fit(master.is_english)
X_orig = np.vstack(master.data.values)
X_normalized = X_orig / np.c_[np.sqrt(np.einsum('...i,...i', X_orig, X_orig))]  # for fun
y_orig = LE.transform(master.is_english)

# Shuffle
X, y = shuffle(X_orig, y_orig)
# IMPORTANT! THIS HANDLES ALL CROSS VALIDATION
skf = StratifiedKFold(y, n_folds=8)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

SVM pipeline

  1. Center/Scale the (training) features to be centred on 0, with variance 1.
  2. Reduce the dimensionality of the features (either through picking 'the best' subset, or through pca)
  3. use SVM to classify.
  4. Optimize parameters using a grid-search.

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import confusion_matrix, make_scorer
from sklearn.metrics import classification_report

In [25]:
p = Pipeline([#('scale', StandardScaler()),
              ('pca', PCA(n_components=200)),
              ('svm', LinearSVC(class_weight='auto', C=1))])
cvc = cross_val_score(p, X, y, cv=skf, n_jobs=1, scoring='f1')

In [26]:
p.fit(X_train, y_train)
y_pred = p.predict(X_test)

In [27]:
print classification_report(y_test, y_pred)
confusion_matrix(y_test, y_pred)


             precision    recall  f1-score   support

          0       0.86      0.69      0.76       191
          1       0.45      0.70      0.55        71

avg / total       0.75      0.69      0.71       262

Out[27]:
array([[131,  60],
       [ 21,  50]])

In [44]:
# Grid Search
from sklearn.grid_search import GridSearchCV

params = {'svm__C': [1.2],
          'pca__n_components': [80, 70, 50]}
gs = GridSearchCV(p, params, verbose=1, cv=skf, n_jobs=1, scoring='f1')

In [45]:
gs.fit(X, y)


[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    6.2s
[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:  2.4min finished
Fitting 8 folds for each of 3 candidates, totalling 24 fits
Out[45]:
GridSearchCV(cv=sklearn.cross_validation.StratifiedKFold(labels=[0 0 ..., 1 1], n_folds=8),
       estimator=Pipeline(steps=[('pca', PCA(copy=True, n_components=200, whiten=False)), ('svm', LinearSVC(C=1, class_weight='auto', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
     random_state=None, tol=0.0001, verbose=0))]),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'pca__n_components': [80, 70, 50], 'svm__C': [1.2]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring='f1',
       verbose=1)

In [48]:
gs.best_params_


Out[48]:
{'pca__n_components': 50, 'svm__C': 1.2}

In [49]:
cross_val_scor


Out[49]:
0.63227547045749177

In [50]:
best = Pipeline([('pca', PCA(n_components=50)),
                ('svm', LinearSVC(class_weight='auto', C=1.2))])
best.fit(X_train, y_train)
y_pred = best.predict(X_test)
print classification_report(y_test, y_pred)


             precision    recall  f1-score   support

          0       0.89      0.77      0.83       184
          1       0.59      0.77      0.67        78

avg / total       0.80      0.77      0.78       262


In [54]:
naive = Pipeline([#('pca', PCA(n_components=50)),
                ('svm', LinearSVC(class_weight='auto', C=1.2))])
naive.fit(X_train, y_train)
y_pred2 = naive.predict(X_test)
print classification_report(y_test, y_pred2)


             precision    recall  f1-score   support

          0       0.80      0.79      0.80       184
          1       0.52      0.53      0.52        78

avg / total       0.71      0.71      0.71       262


In [63]:
def pca_trial(n, cv):
    p = Pipeline([('pca', PCA(n_components=n)),
                  ('svm', LinearSVC(class_weight='auto', C=1.2))])
    return cross_val_score(p, X, y, scoring='f1', cv=cv)

In [64]:
components = [4, 10, 25, 50, 80, 100, 200, 300, 400, 500]
experiment = [pca_trial(n, skf) for n in components]

In [65]:
stats = np.vstack([np.array([z.mean(), z.std()]) for z in experiment])

In [66]:
stats


Out[66]:
array([[ 0.60061489,  0.02068532],
       [ 0.6076747 ,  0.02375075],
       [ 0.59471268,  0.0312705 ],
       [ 0.6325593 ,  0.01480605],
       [ 0.60082113,  0.02579281],
       [ 0.60542094,  0.02016737],
       [ 0.5735923 ,  0.04689493],
       [ 0.55918807,  0.03172556],
       [ 0.53523438,  0.02215423],
       [ 0.49833627,  0.02089189]])

In [70]:
f, ax = plt.subplots(1, 1)
ax.errorbar(np.array(components), stats[:,0], yerr=stats[:,1])
ax.set_xscale("log")
ax.grid()
ax.set_title("Dimensionality Reduction Efficacy\nvia 8-fold cross-validation")
ax.set_xlabel("Number of Principal Components")
ax.set_ylabel("F1 Score")


Out[70]:
<matplotlib.text.Text at 0x10cdbd650>

In [ ]: