In [ ]:
%matplotlib inline
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import librosa
from IPython.display import Image, display, Audio
In [ ]:
def load_features(directory, base_dir='../data/output/features', max_length=300):
    """Load and merge the audio and image feature tables for one data split.

    Reads ``audio_features.csv`` and ``image_features.csv`` from
    ``<base_dir>/<directory>/`` (the first CSV column becomes the index),
    drops the ``label`` column from the image table, and concatenates the
    two tables column-wise so rows are aligned on the index.

    Args:
        directory: Sub-directory of ``base_dir`` to read, e.g. ``'train'``.
        base_dir: Root folder holding the per-split feature directories.
            Defaults to the path the notebook originally hard-coded.
        max_length: Exclusive upper bound on the ``length`` column; rows
            with ``length >= max_length`` are removed. The original
            notebook comment describes this as clips under 300s. Pass
            ``None`` to skip the filter entirely.

    Returns:
        A ``pandas.DataFrame`` with the audio columns followed by the
        remaining image columns.
    """
    au_features = pd.read_csv(
        '{}/{}/audio_features.csv'.format(base_dir, directory), index_col=0)
    im_features = pd.read_csv(
        '{}/{}/image_features.csv'.format(base_dir, directory), index_col=0)
    # 'label' is treated as redundant in the image table; the merged frame
    # keeps the copy that comes from the audio table.
    im_features = im_features.drop(['label'], axis=1)
    # Merge audio and image features side by side
    features = pd.concat([au_features, im_features], axis=1)
    # Only keep clips shorter than the cut-off. Bracket access is used so the
    # column lookup can never be confused with a DataFrame attribute.
    if max_length is not None:
        features = features[features['length'] < max_length]
    return features
In [ ]:
# Load the merged feature table for the training split and preview it
features = load_features(directory='train')
features.head(5)
In [ ]:
# Rows that contain at least one missing value: report how many there are,
# then preview the first few.
rows_with_nan = features[features.isnull().any(axis=1)]
print(len(rows_with_nan))
rows_with_nan.head()
In [ ]:
# Just drop the remaining rows that contain NaN values
features = features.dropna(axis=0, how='any')
In [ ]:
# Short alias for the cleaned table, then keep only the rows whose label is 1
f = features
features_1 = f.loc[f['label'] == 1]
In [ ]:
from sklearn import preprocessing

# Goal: see if voice mail clips can be distinguished from the others.
# Columns used as input features:
columns = [
    'length',
    'ring_count',
    'last_ring_to_end',
    'percent_silence',
    'white_proportion',
]
X_train_all = features_1[columns]
# Earlier experiment (disabled): restrict features_1 to
# ['length', 'last_ring_to_end_length', 'white_proportion'].

# Fit the scaler on the selected columns, then apply it to the same data.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train_all)
X_train_all_scaled = scaler.transform(X_train_all)
In [ ]:
from sklearn.cluster import KMeans

# KMeans starts from randomly chosen centroids, so without a fixed seed the
# cluster ids (and potentially the cluster sizes) can differ between runs.
# The cells that inspect clusters[0] / clusters[1] rely on those ids, so pin
# random_state to keep the analysis repeatable.
kmeans = KMeans(n_clusters=2, random_state=0)
labels = kmeans.fit_predict(X_train_all_scaled)
# Look at cluster sizes
unique, counts = np.unique(labels, return_counts=True)
print(counts)
In [ ]:
# Group the image file names by assigned cluster so each cluster can be
# inspected visually; clusters[k] lists the images whose label is k, in
# their original order.
images = features_1['image_file']
clusters = [
    [img for label, img in zip(labels, images) if label == cluster_id]
    for cluster_id in range(max(labels) + 1)
]
In [ ]:
# Cluster 1 (index 0): show up to 10 randomly selected images.
# random.sample raises ValueError when asked for more items than the
# population holds, so cap the sample size at the cluster size.
for img in random.sample(clusters[0], min(10, len(clusters[0]))):
    display(Image(filename=img, width=320))
In [ ]:
# Cluster 2 (index 1): show up to 10 randomly selected images.
# random.sample raises ValueError when asked for more items than the
# population holds, so cap the sample size at the cluster size.
for img in random.sample(clusters[1], min(10, len(clusters[1]))):
    display(Image(filename=img, width=320))
In [ ]:
# Cluster 3 (index 2): show every image, not a random sample.
# This cluster only exists when KMeans is run with three or more clusters;
# with fewer, indexing clusters[2] would raise IndexError, so skip the
# display in that case.
if len(clusters) > 2:
    for img in clusters[2]:
        display(Image(filename=img, width=320))
In [ ]:
from sklearn.manifold import TSNE

# References:
# http://alexanderfabisch.github.io/t-sne-in-scikit-learn.html
# http://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
#
# t-SNE is stochastic, so an unseeded run yields a different 2-D layout each
# time. Fixing random_state keeps the embedding, and any plot of it,
# repeatable.
X_tsne = TSNE(n_components=2, verbose=2, random_state=0).fit_transform(X_train_all_scaled)
In [ ]:
# Scatter the two t-SNE components, colouring each point by its cluster label
plt.scatter(x=X_tsne[:, 0], y=X_tsne[:, 1], c=labels)
In [ ]:
from sklearn.decomposition import PCA

# Project the scaled features onto two principal components.
# Fit the named estimator itself rather than a second, throwaway PCA
# instance, so that `pca` holds the fitted model afterwards instead of
# staying unfitted.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train_all_scaled)
In [ ]:
# Scatter the two PCA components, colouring each point by its cluster label
plt.scatter(x=X_pca[:, 0], y=X_pca[:, 1], c=labels)