In [2]:
from sklearn.cluster import KMeans, AffinityPropagation, SpectralClustering
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
import librosa
from python_speech_features import mfcc
from python_speech_features import logfbank
import scipy.io.wavfile as wav
import os
import numpy as np
import pandas as pd
In [3]:
features_file = '../data/gender/voice.csv'
voice_df = pd.read_csv(features_file)
print 'Columns: ', list(voice_df.columns)
In [4]:
features = list(voice_df.columns[:-1])
print 'Voice Features: ', features
In [5]:
print 'No. of training samples for class "male":', voice_df[voice_df['label'] == 'male'].shape[0]
In [6]:
# Preview the first five rows labelled 'male'.
voice_df.loc[voice_df['label'] == 'male'].head(5)
Out[6]:
In [7]:
print 'No. of training samples for class "female":', voice_df[voice_df['label'] == 'female'].shape[0]
In [8]:
# Preview the first five rows labelled 'female'.
voice_df.loc[voice_df['label'] == 'female'].head(5)
Out[8]:
In [122]:
from sklearn.decomposition import PCA
from sklearn import preprocessing
data_scaled = pd.DataFrame(preprocessing.scale(voice_df[features]), columns = voice_df[features].columns)
pca = PCA(n_components=5)
pca_results = pca.fit_transform(data_scaled)
print 'Shape of the transformed feature vector:', pca_results.shape
print 'Original training sample:', list(voice_df[features].loc[0].values)
print 'Training sample after PCA:', list(pca_results[0])
print '\n'
# Percentage of variance explained for each components
print 'Explained variance ratio (first five components)'
print '------------------------------------------------'
for idx, r in enumerate(pca.explained_variance_ratio_):
print 'Principal Component', idx, ':', r
In [125]:
print pd.DataFrame(pca.components_,columns=data_scaled.columns,index = ['PC-1','PC-2', 'PC-3', 'PC-3', 'PC-5']).transpose()
In [126]:
# Import only the ggplot names the notebook uses instead of a star import,
# so it is clear where each plotting name comes from.
from ggplot import ggplot, aes, geom_point, ggtitle

# Work on a copy so the plotting columns are not added to voice_df itself.
df_pca = voice_df.copy()
df_pca['x-pca'] = pca_results[:, 0]  # first principal component
df_pca['y-pca'] = pca_results[:, 1]  # second principal component

# Scatter of the first two components, coloured by the 'label' column.
chart = (
    ggplot(df_pca, aes(x='x-pca', y='y-pca', color='label'))
    + geom_point(size=75, alpha=0.8)
    + ggtitle("First and Second Principal Components colored by gender")
)
chart
Out[126]:
In [128]:
from collections import Counter
In [129]:
# `num_clusters` was never defined in the notebook, so this cell raised a
# NameError on a fresh kernel. Define it here.
# NOTE(review): 2 is chosen to match the 'male'/'female' classes inspected
# earlier -- confirm this is the intended number of clusters.
num_clusters = 2
spectral = SpectralClustering(n_clusters=num_clusters).fit(voice_df[features])
In [131]:
labels = spectral.labels_
print labels
print Counter(labels)
In [132]:
from sklearn.manifold import TSNE

# 2-D t-SNE embedding of the raw (unscaled) feature columns.
# random_state is fixed because t-SNE is stochastic: without a seed the
# embedding (and the chart built from it) changes on every run. 42 matches
# the seed used for the train/test split.
print("Computing t-SNE embedding")
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300, random_state=42)
tsne_results = tsne.fit_transform(voice_df[features])
In [133]:
# Attach the two t-SNE coordinates to a copy of the data for plotting.
df_tsne = voice_df.copy()
df_tsne['x-tsne'] = tsne_results[:, 0]
df_tsne['y-tsne'] = tsne_results[:, 1]

# Scatter of the embedding, coloured by the 'label' column.
chart = (
    ggplot(df_tsne, aes(x='x-tsne', y='y-tsne', color='label'))
    + geom_point(size=70, alpha=0.1)
    + ggtitle("tSNE dimensions colored by gender")
)
chart
Out[133]:
In [141]:
from sklearn.manifold import TSNE

# 2-D t-SNE embedding of the PCA output (pca_results) instead of the raw columns.
# random_state is fixed because t-SNE is stochastic: without a seed the
# embedding (and the chart built from it) changes on every run. 42 matches
# the seed used for the train/test split.
print("Computing t-SNE embedding")
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300, random_state=42)
tsne_results = tsne.fit_transform(pca_results)
In [142]:
# Attach the two t-SNE coordinates (computed from the PCA output) to a copy
# of the data for plotting.
df_tsne = voice_df.copy()
df_tsne['x-tsne'] = tsne_results[:, 0]
df_tsne['y-tsne'] = tsne_results[:, 1]

# The title says this embedding was computed on the PCA components, so the
# chart can be told apart from the t-SNE chart of the raw features.
chart = (
    ggplot(df_tsne, aes(x='x-tsne', y='y-tsne', color='label'))
    + geom_point(size=70, alpha=0.1)
    + ggtitle("tSNE dimensions of the PCA components colored by gender")
)
chart
Out[142]:
In [20]:
# Classification baseline: hold out part of the data so a Logistic Regression
# model can be scored on samples it was not fitted on.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# 15% of the rows go to the test set; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    voice_df.loc[:, features],
    voice_df.loc[:, 'label'],
    random_state=42,
    test_size=0.15,
)
In [21]:
# Fit the baseline classifier (fit returns the estimator itself) and show its
# mean accuracy on the training split.
model_lr = LogisticRegression().fit(X_train, y_train)
model_lr.score(X_train, y_train)
Out[21]:
In [23]:
# Accuracy of the baseline classifier on the held-out test split.
metrics.accuracy_score(y_test, model_lr.predict(X_test))
Out[23]:
In [ ]: