In [1]:
%pylab --no-import-all inline
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from tone_pallettes import dataset as patterns
In [2]:
values = patterns('processed/')
SAR = pd.read_table("speech_archive_meta.tsv")
master = pd.merge(SAR, values, how='inner',
                  left_on=['first_language', 'recording_num'],
                  right_on=['lang', 'num'])
def plot_hist(column, n=30, **kwargs):
    # Bar chart of the n most frequent values in `column`.
    counts = master[column].groupby(master[column]).size()
    return counts.order(ascending=False)[:n].plot(kind='bar', **kwargs)
ax = plot_hist('first_language', 10, title='GMU-SAA: Top 10 Languages', color='gray')
ax.set_xlabel('First Language')
ax.set_ylabel('Number of Samples')
Out[2]:
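Aside: `Series.order` was deprecated in later pandas releases and eventually removed in favor of `sort_values`. On a newer pandas stack, an equivalent `plot_hist` could be sketched like this (same output, assuming `master` as built above):

def plot_hist(column, n=30, **kwargs):
    # value_counts() already returns frequencies sorted descending.
    return master[column].value_counts()[:n].plot(kind='bar', **kwargs)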
In [3]:
from collections import defaultdict

# Map each first language to a coarse linguistic grouping; anything
# unlisted falls back to 'other'.
lang_supergroup = defaultdict(lambda: 'other')
lang_supergroup.update({k: 'italic' for k in ('spanish', 'french', 'portuguese',
                                              'romanian', 'occitan', 'catalan',
                                              'italian')})
lang_supergroup.update({k: 'germanic' for k in ('english', 'dutch', 'german',
                                                'swedish', 'danish', 'norwegian',
                                                'icelandic')})
# NB: armenian is its own Indo-European branch; it is grouped with
# slavic here only for convenience.
lang_supergroup.update({k: 'slavic' for k in ('russian', 'polish', 'serbian',
                                              'macedonian', 'bulgarian', 'ukrainian',
                                              'armenian', 'czech', 'bosnian')})
lang_supergroup.update({k: 'indo-iranian' for k in ('hindi', 'urdu', 'bengali',
                                                    'gujarati', 'kurdish', 'awadhi',
                                                    'punjabi', 'farsi')})
lang_supergroup.update({k: 'east-asian' for k in ('japanese', 'mandarin', 'korean',
                                                  'cantonese', 'vietnamese')})
#lang_supergroup.update({k: 'turkic' for k in ('turkish', 'azerbaijani', 'gagauz', 'oghuz')})
lang_supergroup.update({k: 'afro-asiatic' for k in ('arabic', 'amharic')})

master['is_english'] = master['first_language'] == 'english'
master['lang_supergroup'] = master['first_language'].apply(lambda lang: lang_supergroup[lang])
In [4]:
ax2 = plot_hist('lang_supergroup', 10, title="GMU-SAA: Linguistic Grouping", color='gray')
ax2.set_xlabel("Language Group")
ax2.set_ylabel("Number of Samples")
Out[4]:
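The `defaultdict` fallback means a first language absent from the tables above lands in 'other' rather than raising a KeyError. A quick check (the second language name is hypothetical, used only for illustration):

lang_supergroup['spanish']   # -> 'italic'
lang_supergroup['swahili']   # -> 'other' (not listed above)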
In [62]:
# Pick one sample at random and reshape its flat feature vector back
# into its stored tone-palette shape (assumed uniform across samples).
i = np.random.randint(0, len(master))
s = master['shape'][i]
plt.imshow(abs(master.data[i].reshape(s)), cmap='gray_r')
plt.colorbar()
plt.title("Absolute Value %i x %i Tone-Pallette" % s)
plt.xlabel(r"$MFCC + MFCC^\prime + MFCC^{\prime\prime}$")
plt.ylabel("Cluster Centroid")
Out[62]:
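A sanity check on the layout, assuming each `data` entry is the row-major flattening of its stored `shape` (centroids by stacked MFCC features):

r, c = master['shape'][0]
assert master.data[0].size == r * c  # flat vector == row-major palette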
In [28]:
# The Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import cross_val_score, train_test_split
LE = LabelEncoder().fit(master.is_english)
X_orig = np.vstack(master.data.values)
# Unit-norm rows (not used below; kept for fun).
X_normalized = X_orig / np.c_[np.sqrt(np.einsum('...i,...i', X_orig, X_orig))]
y_orig = LE.transform(master.is_english)
# Shuffle
X, y = shuffle(X_orig, y_orig)
# IMPORTANT! THIS HANDLES ALL CROSS-VALIDATION
skf = StratifiedKFold(y, n_folds=8)
# Separate held-out split for the final classification reports.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)
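In this sklearn API, `StratifiedKFold(y, n_folds=8)` is an iterable of (train, test) index arrays that keep the English/non-English proportions roughly constant per fold. A quick sketch to verify the balance:

for train_idx, test_idx in skf:
    # Fold size and positive-class fraction should be nearly uniform.
    print len(test_idx), y[test_idx].mean()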
In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import confusion_matrix, make_scorer
from sklearn.metrics import classification_report
In [25]:
p = Pipeline([#('scale', StandardScaler()),
              ('pca', PCA(n_components=200)),
              ('svm', LinearSVC(class_weight='auto', C=1))])
cvc = cross_val_score(p, X, y, cv=skf, n_jobs=1, scoring='f1')
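`cvc` holds one F1 score per fold; a compact summary:

print "F1 = %.3f +/- %.3f over %d folds" % (cvc.mean(), cvc.std(), len(cvc))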
In [26]:
p.fit(X_train, y_train)
y_pred = p.predict(X_test)
In [27]:
print classification_report(y_test, y_pred)
confusion_matrix(y_test, y_pred)
Out[27]:
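sklearn's convention puts true labels on rows and predictions on columns. A labeled view, using the encoder's classes (here False/True for `is_english`):

pd.DataFrame(confusion_matrix(y_test, y_pred),
             index=LE.classes_, columns=LE.classes_)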
In [44]:
# Grid Search
from sklearn.grid_search import GridSearchCV
params = {'svm__C': [1.2],
          'pca__n_components': [80, 70, 50]}
gs = GridSearchCV(p, params, verbose=1, cv=skf, n_jobs=1, scoring='f1')
In [45]:
gs.fit(X, y)
Out[45]:
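The `GridSearchCV` of this sklearn vintage also exposes per-setting results via `grid_scores_`, which can be scanned like so:

for entry in gs.grid_scores_:
    print entry.parameters, entry.mean_validation_score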
In [48]:
gs.best_params_
Out[48]:
In [49]:
gs.best_score_
Out[49]:
In [50]:
best = Pipeline([('pca', PCA(n_components=50)),
                 ('svm', LinearSVC(class_weight='auto', C=1.2))])
best.fit(X_train, y_train)
y_pred = best.predict(X_test)
print classification_report(y_test, y_pred)
In [54]:
naive = Pipeline([#('pca', PCA(n_components=50)),
                  ('svm', LinearSVC(class_weight='auto', C=1.2))])
naive.fit(X_train, y_train)
y_pred2 = naive.predict(X_test)
print classification_report(y_test, y_pred2)
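For a single-number comparison of the two pipelines on the same held-out split:

from sklearn.metrics import f1_score
print "PCA(50) + SVM:", f1_score(y_test, y_pred)
print "SVM only:     ", f1_score(y_test, y_pred2)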
In [63]:
def pca_trial(n, cv):
    # F1 cross-validation scores for a PCA(n) + LinearSVC pipeline.
    p = Pipeline([('pca', PCA(n_components=n)),
                  ('svm', LinearSVC(class_weight='auto', C=1.2))])
    return cross_val_score(p, X, y, scoring='f1', cv=cv)
In [64]:
components = [4, 10, 25, 50, 80, 100, 200, 300, 400, 500]
experiment = [pca_trial(n, skf) for n in components]
In [65]:
stats = np.vstack([np.array([z.mean(), z.std()]) for z in experiment])
In [66]:
stats
Out[66]:
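The raw array is easier to read as a table keyed by component count:

pd.DataFrame(stats, index=components, columns=['mean_f1', 'std_f1'])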
In [70]:
f, ax = plt.subplots(1, 1)
ax.errorbar(np.array(components), stats[:,0], yerr=stats[:,1])
ax.set_xscale("log")
ax.grid()
ax.set_title("Dimensionality Reduction Efficacy\nvia 8-fold cross-validation")
ax.set_xlabel("Number of Principal Components")
ax.set_ylabel("F1 Score")
Out[70]: