In [2]:
from sklearn.cluster import KMeans, AffinityPropagation, SpectralClustering
from sklearn import metrics
from sklearn.datasets import make_blobs
import librosa
from python_speech_features import mfcc
from python_speech_features import logfbank
import scipy.io.wavfile as wav
import os
import numpy as np
import pandas as pd

In [3]:
features_file = '../data/gender/voice.csv'
voice_df = pd.read_csv(features_file)
print 'Columns: ', list(voice_df.columns)


Columns:  ['meanfreq', 'sd', 'median', 'Q25', 'Q75', 'IQR', 'skew', 'kurt', 'sp.ent', 'sfm', 'mode', 'centroid', 'meanfun', 'minfun', 'maxfun', 'meandom', 'mindom', 'maxdom', 'dfrange', 'modindx', 'label']

In [4]:
features = list(voice_df.columns[:-1])
print 'Voice Features: ', features


Voice Features:  ['meanfreq', 'sd', 'median', 'Q25', 'Q75', 'IQR', 'skew', 'kurt', 'sp.ent', 'sfm', 'mode', 'centroid', 'meanfun', 'minfun', 'maxfun', 'meandom', 'mindom', 'maxdom', 'dfrange', 'modindx']

In [5]:
print 'No. of training samples for class "male":', voice_df[voice_df['label'] == 'male'].shape[0]


No. of training samples for class "male": 1584

In [6]:
voice_df[voice_df['label'] == 'male'].head()


Out[6]:
meanfreq sd median Q25 Q75 IQR skew kurt sp.ent sfm ... centroid meanfun minfun maxfun meandom mindom maxdom dfrange modindx label
0 0.059781 0.064241 0.032027 0.015071 0.090193 0.075122 12.863462 274.402906 0.893369 0.491918 ... 0.059781 0.084279 0.015702 0.275862 0.007812 0.007812 0.007812 0.000000 0.000000 male
1 0.066009 0.067310 0.040229 0.019414 0.092666 0.073252 22.423285 634.613855 0.892193 0.513724 ... 0.066009 0.107937 0.015826 0.250000 0.009014 0.007812 0.054688 0.046875 0.052632 male
2 0.077316 0.083829 0.036718 0.008701 0.131908 0.123207 30.757155 1024.927705 0.846389 0.478905 ... 0.077316 0.098706 0.015656 0.271186 0.007990 0.007812 0.015625 0.007812 0.046512 male
3 0.151228 0.072111 0.158011 0.096582 0.207955 0.111374 1.232831 4.177296 0.963322 0.727232 ... 0.151228 0.088965 0.017798 0.250000 0.201497 0.007812 0.562500 0.554688 0.247119 male
4 0.135120 0.079146 0.124656 0.078720 0.206045 0.127325 1.101174 4.333713 0.971955 0.783568 ... 0.135120 0.106398 0.016931 0.266667 0.712812 0.007812 5.484375 5.476562 0.208274 male

5 rows × 21 columns


In [7]:
print 'No. of training samples for class "female":', voice_df[voice_df['label'] == 'female'].shape[0]


No. of training samples for class "female": 1584

In [8]:
voice_df[voice_df['label'] == 'female'].head()


Out[8]:
meanfreq sd median Q25 Q75 IQR skew kurt sp.ent sfm ... centroid meanfun minfun maxfun meandom mindom maxdom dfrange modindx label
1584 0.158108 0.082782 0.191191 0.062350 0.224552 0.162202 2.801344 19.929617 0.952161 0.679223 ... 0.158108 0.185042 0.023022 0.275862 0.272964 0.046875 0.742188 0.695312 0.339888 female
1585 0.182855 0.067789 0.200639 0.175489 0.226068 0.050579 3.001890 19.865482 0.910458 0.506099 ... 0.182855 0.159590 0.018713 0.266667 0.258970 0.054688 0.804688 0.750000 0.269231 female
1586 0.199807 0.061974 0.211358 0.184422 0.235687 0.051265 2.543841 14.921964 0.904432 0.425289 ... 0.199807 0.156465 0.016194 0.266667 0.250446 0.054688 0.898438 0.843750 0.329521 female
1587 0.195280 0.072087 0.204656 0.180611 0.255954 0.075344 2.392326 10.061489 0.907115 0.524209 ... 0.195280 0.182629 0.024922 0.275862 0.269531 0.054688 0.703125 0.648438 0.294717 female
1588 0.208504 0.057550 0.220229 0.190343 0.249759 0.059416 1.707786 5.670912 0.879674 0.343548 ... 0.208504 0.162043 0.016807 0.262295 0.260789 0.054688 0.812500 0.757812 0.251546 female

5 rows × 21 columns


In [122]:
from sklearn.decomposition import PCA
from sklearn import preprocessing
data_scaled = pd.DataFrame(preprocessing.scale(voice_df[features]), columns = voice_df[features].columns)
pca = PCA(n_components=5)
pca_results = pca.fit_transform(data_scaled)

print 'Shape of the transformed feature vector:', pca_results.shape
print 'Original training sample:', list(voice_df[features].loc[0].values)
print 'Training sample after PCA:', list(pca_results[0])
print '\n'
# Percentage of variance explained by each component
print 'Explained variance ratio (first five components)'
print '------------------------------------------------'
for idx, r in enumerate(pca.explained_variance_ratio_):
    print 'Principal Component', idx, ':', r


Shape of the transformed feature vector: (3168, 5)
Original training sample: [0.059780984959808103, 0.064241267703135901, 0.032026913372582004, 0.015071488645920899, 0.0901934398654331, 0.075121951219512206, 12.8634618371626, 274.40290550206697, 0.89336941670080705, 0.49191776639781104, 0.0, 0.059780984959808103, 0.084279106440321008, 0.0157016683022571, 0.27586206896551696, 0.0078125, 0.0078125, 0.0078125, 0.0, 0.0]
Training sample after PCA: [8.2085163104916408, 2.1644883625918574, 1.9597839267435693, 5.4524168658181935, 0.93085097538232764]


Explained variance ratio (first five components)
------------------------------------------------
Principal Component 0 : 0.452163907842
Principal Component 1 : 0.118706090432
Principal Component 2 : 0.109099393016
Principal Component 3 : 0.0761976317025
Principal Component 4 : 0.0529393770703
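
The first five components together explain about 81% of the variance. A quick way to sanity-check how many components are worth keeping is to fit a full PCA and look at the cumulative explained-variance curve; a minimal sketch, reusing data_scaled from above:

In [ ]:
# Fit PCA with all components and find how many cover 95% of the variance
pca_full = PCA().fit(data_scaled)
cum_var = np.cumsum(pca_full.explained_variance_ratio_)
print 'Components needed for 95% of the variance:', np.argmax(cum_var >= 0.95) + 1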

In [125]:
print pd.DataFrame(pca.components_, columns=data_scaled.columns, index=['PC-1', 'PC-2', 'PC-3', 'PC-4', 'PC-5']).transpose()


              PC-1      PC-2      PC-3      PC-4      PC-5
meanfreq -0.313361  0.034717  0.015924 -0.246570 -0.038612
sd        0.279772 -0.202719  0.090184 -0.214463  0.014912
median   -0.279302  0.007949  0.042971 -0.346672 -0.049277
Q25      -0.303399  0.155775 -0.065598 -0.052000  0.036864
Q75      -0.188523 -0.188194  0.116800 -0.559204 -0.089888
IQR       0.241049 -0.281229  0.139175 -0.249815 -0.091611
skew      0.130798  0.305776  0.505249 -0.011908 -0.083897
kurt      0.131871  0.248367  0.526860 -0.045228 -0.098408
sp.ent    0.222616 -0.356778 -0.204808 -0.128805 -0.043018
sfm       0.274429 -0.232113 -0.106867 -0.023960  0.034825
mode     -0.243130 -0.112073 -0.099279 -0.146386 -0.108420
centroid -0.313361  0.034717  0.015924 -0.246570 -0.038612
meanfun  -0.187707  0.128549 -0.012669  0.122884  0.536686
minfun   -0.158146 -0.063079 -0.029525  0.027492  0.292539
maxfun   -0.110543 -0.250008  0.259089  0.024257  0.539716
meandom  -0.226033 -0.269057  0.048765  0.243001 -0.182453
mindom   -0.088938  0.304831 -0.210968  0.082080 -0.390600
maxdom   -0.229175 -0.308728  0.145942  0.321858 -0.215490
dfrange  -0.227649 -0.314308  0.149782  0.320484 -0.208535
modindx   0.085684  0.162240 -0.444228 -0.071380  0.065661
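
Note that meanfreq and centroid have identical loadings on every component, which suggests the two columns are duplicates in this dataset; a one-line check:

In [ ]:
print (voice_df['meanfreq'] == voice_df['centroid']).all()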

In [126]:
from ggplot import *

df_pca = voice_df.copy()
df_pca['x-pca'] = pca_results[:,0]
df_pca['y-pca'] = pca_results[:,1]
chart = ggplot( df_pca, aes(x='x-pca', y='y-pca', color='label') ) \
        + geom_point(size=75,alpha=0.8) \
        + ggtitle("First and Second Principal Components colored by gender")
chart


Out[126]:
<ggplot: (8775117384881)>

In [128]:
from collections import Counter

In [129]:
num_clusters = 2  # one cluster per gender
spectral = SpectralClustering(n_clusters=num_clusters).fit(voice_df[features])

In [131]:
labels = spectral.labels_
print labels
print Counter(labels)


[0 0 0 ..., 1 1 1]
Counter({0: 1892, 1: 1276})
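
The spectral clusters split 1892/1276 rather than the true 1584/1584. Since the gender labels are available, we can measure how well the two clusters line up with them; a quick sketch using the adjusted Rand index and a contingency table:

In [ ]:
print 'Adjusted Rand index:', metrics.adjusted_rand_score(voice_df['label'], labels)
print pd.crosstab(voice_df['label'], labels, rownames=['gender'], colnames=['cluster'])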

In [132]:
from sklearn.manifold import TSNE

print("Computing t-SNE embedding")
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_results = tsne.fit_transform(voice_df[features])


Computing t-SNE embedding
[t-SNE] Computing pairwise distances...
[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 3168
[t-SNE] Computed conditional probabilities for sample 2000 / 3168
[t-SNE] Computed conditional probabilities for sample 3000 / 3168
[t-SNE] Computed conditional probabilities for sample 3168 / 3168
[t-SNE] Mean sigma: 0.626971
[t-SNE] KL divergence after 100 iterations with early exaggeration: 1.093733
[t-SNE] Error after 300 iterations: 1.093733

In [133]:
df_tsne = voice_df.copy()
df_tsne['x-tsne'] = tsne_results[:,0]
df_tsne['y-tsne'] = tsne_results[:,1]

chart = ggplot( df_tsne, aes(x='x-tsne', y='y-tsne', color='label') ) \
        + geom_point(size=70,alpha=0.1) \
        + ggtitle("tSNE dimensions colored by gender")
chart


Out[133]:
<ggplot: (8775113109957)>
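
One caveat: this embedding was computed on the raw feature values, where wide-range columns such as skew and kurt dominate the pairwise distances. Re-running t-SNE on the standardized features (data_scaled from the PCA cell) may give a cleaner separation; a minimal sketch:

In [ ]:
tsne_scaled = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_scaled_results = tsne_scaled.fit_transform(data_scaled)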

In [141]:
from sklearn.manifold import TSNE

print("Computing t-SNE embedding")
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_results = tsne.fit_transform(pca_results)


Computing t-SNE embedding
[t-SNE] Computing pairwise distances...
[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 3168
[t-SNE] Computed conditional probabilities for sample 2000 / 3168
[t-SNE] Computed conditional probabilities for sample 3000 / 3168
[t-SNE] Computed conditional probabilities for sample 3168 / 3168
[t-SNE] Mean sigma: 0.583961
[t-SNE] KL divergence after 100 iterations with early exaggeration: 1.220931
[t-SNE] Error after 300 iterations: 1.220931

In [142]:
df_tsne = voice_df.copy()
df_tsne['x-tsne'] = tsne_results[:,0]
df_tsne['y-tsne'] = tsne_results[:,1]

chart = ggplot( df_tsne, aes(x='x-tsne', y='y-tsne', color='label') ) \
        + geom_point(size=70,alpha=0.1) \
        + ggtitle("tSNE dimensions colored by gender")
chart


Out[142]:
<ggplot: (8775112800721)>
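
The same embedding can also be colored by the spectral clustering assignment instead of the true gender, which shows directly where the unsupervised clusters disagree with the labels; a sketch reusing labels from the clustering cell above:

In [ ]:
df_tsne['cluster'] = labels.astype(str)
chart = ggplot( df_tsne, aes(x='x-tsne', y='y-tsne', color='cluster') ) \
        + geom_point(size=70,alpha=0.1) \
        + ggtitle("tSNE dimensions colored by spectral cluster")
chart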

In [20]:
# Now let's try classifying this data, starting with a
# Logistic Regression baseline.

# Split the data into training and test sets.

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(voice_df[features], 
                                                    voice_df['label'], 
                                                    test_size=0.15, 
                                                    random_state=42)

In [21]:
model_lr = LogisticRegression()
model_lr.fit(X_train, y_train)
model_lr.score(X_train, y_train)


Out[21]:
0.90936106983655274

In [23]:
model_lr.score(X_test, y_test)


Out[23]:
0.92647058823529416
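
About 93% test accuracy is a reasonable baseline. A per-class breakdown shows whether the errors are balanced across genders, and standardizing the inputs inside a pipeline often helps LogisticRegression; a sketch using classification_report and a scaled variant:

In [ ]:
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

print classification_report(y_test, model_lr.predict(X_test))

# Same baseline on standardized features
model_scaled = make_pipeline(StandardScaler(), LogisticRegression())
model_scaled.fit(X_train, y_train)
print 'Scaled test accuracy:', model_scaled.score(X_test, y_test)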

In [ ]: