In [ ]:
#coding: utf-8

import warnings
warnings.simplefilter("ignore")

import os
import sys 
import wave
import time
import librosa
import numpy as np
from utils import *
import matplotlib.pyplot as plt
from IPython.display import Audio
from StringIO import StringIO

%pylab inline

In [ ]:
!wget https://www.dropbox.com/s/jtdc7y0bi00ii4p/genres.tar.gz?dl=0 -O genres.tar.gz 
!tar -xzf genres.tar.gz

In [ ]:
import IPython
from IPython import display

def Audio(url):
    return display.HTML("<center><audio controls><source src='{}'></audio>".format(url))

In [ ]:
sound_file = './genres/blues/blues.00000.au'

y, sr = librosa.load(sound_file)
librosa.output.write_wav('./genres/tmp.wav', y, sr, norm=True)
Audio(url='./genres/tmp.wav')

Sound as 1D-Signal


In [ ]:
plt.figure(figsize=(20,4))
pylab.plot(np.arange(len(y)) * 1.0 /sr, y, 'k')
pylab.xlim([0, 10])
pylab.show()

Sound as 2D-Signal


In [ ]:
S = librosa.feature.melspectrogram(y, sr=sr, n_mels=128)
log_S = librosa.logamplitude(S, ref_power=np.max)

In [ ]:
plt.figure(figsize=(20,4))
librosa.display.specshow(log_S, sr=sr, x_axis='time', y_axis='mel', cmap='hot')
plt.title('mel power spectrogram')
plt.colorbar(format='%+02.0f dB')
plt.tight_layout()

In [ ]:
def get_spectgorgamm(fname):
    y, sr = librosa.load(fname)
    S = librosa.feature.melspectrogram(y, sr=sr, n_mels=128)
    log_S = librosa.logamplitude(S, ref_power=np.max)
    
    return log_S[:, :1200]

def plot_spectrogramm(log_S):
    plt.figure(figsize=(20,4))
    librosa.display.specshow(log_S, sr=sr, x_axis='time', y_axis='mel', cmap='hot')
    plt.title('mel power spectrogram')
    plt.colorbar(format='%+02.0f dB')
    plt.tight_layout()

Prepare a data


In [ ]:
geners = ['blues', 'country', 'hiphop', 'metal', 'reggae', 'classical', 'disco', 'jazz', 'pop', 'rock']

id2gener = dict()
X_names, y = [], []
for gener_id, gener in enumerate(geners):
    id2gener[gener_id] = gener
    for track in os.listdir('./genres/' + gener):
        if '.mp3' in track or '.au' in track and '_' not in track:
            trackfile = os.path.join('./genres/', gener, track)
            X_names.append(trackfile)
            y.append(gener_id)

In [ ]:
from multiprocessing import Pool

ncpu = 4
X = Pool(ncpu).map(get_spectgorgamm, X_names)

Nearest Neighbors genre classification


In [ ]:
perm = np.random.permutation(len(y))
X, X_names, y = np.array(X)[perm].astype('float32'), np.array(X_names)[perm], np.array(y)[perm]
Xreshape = X.reshape(X.shape[0], X.shape[1], X.shape[2])

X_train, X_valid = Xreshape[:800], Xreshape[800:]
y_train, y_valid = y[:800], y[800:]

In [ ]:
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(n_jobs=ncpu)

clf = <train clf>
y_val_pred = <make prediction on valid set>

print accuracy_score(y_valid, y_val_pred)

Convolution Nural Nets


In [ ]:
import theano
import lasagne
import theano.tensor as T

In [ ]:
perm = np.random.permutation(len(y))
X, y = np.array(X)[perm].astype('float32'), np.array(y)[perm]
Xreshape = X.reshape(X.shape[0], X.shape[1], X.shape[2])

X_train, X_valid = Xreshape[:800], Xreshape[800:]
y_train, y_valid = y[:800], y[800:]

input_X, target_y = T.tensor3("X", dtype='float64'), T.vector("y", dtype='int32')
nn = lasagne.layers.InputLayer(shape=(None, X.shape[1], X.shape[2]), input_var=input_X) 

nn = <Build convnet using Conv1DLayer MaxPool1DLayer>
nn = <Add several DenseLayers and DropoutLayer>
nn = lasagne.layers.DenseLayer(nn, 10, nonlinearity=lasagne.nonlinearities.softmax)

In [ ]:
y_predicted = lasagne.layers.get_output(nn)
all_weights = lasagne.layers.get_all_params(nn)

loss = lasagne.objectives.categorical_crossentropy(y_predicted, target_y).mean()
accuracy = lasagne.objectives.categorical_accuracy(y_predicted, target_y).mean()
updates_sgd = <Your favorite optimizer>

In [ ]:
train_fun = theano.function([input_X, target_y], [loss, accuracy], allow_input_downcast=True, updates=updates_sgd)
test_fun  = theano.function([input_X, target_y], [loss, accuracy], allow_input_downcast=True)

In [ ]:
%%time 

conv_nn = train_net(nn, train_fun, test_fun, X_train, y_train, X_valid, y_valid, num_epochs=10, batch_size=50)

In [ ]:
plt.figure(figsize=(5, 5), dpi=500)
W = lasagne.layers.get_all_params(nn)[0].get_value()
W[::2, :, :] = 0.2
W = np.hstack(W)
pylab.imshow(W, cmap='hot', interpolation="nearest")
pylab.axis('off')
pylab.show()

Find Simular Tracks


In [ ]:
from sklearn.neighbors import NearestNeighbors

In [ ]:
represent = <Get features from last but one layer>
represent_fun = theano.function([input_X], [represent], allow_input_downcast=True)

In [ ]:
f = lambda x: np.array(represent_fun([x])[0])
track_vectors = map(f, X_train) + map(f, X_valid)
track_vectors = np.concatenate(track_vectors, axis=0)

In [ ]:
nn_pred = NearestNeighbors(metric='cosine', algorithm='brute')

In [ ]:
nn_pred = nn_pred.fit(track_vectors)

In [ ]:
X_names[0]

In [ ]:
ans = list(X_names[nn_pred.kneighbors(track_vectors[0])[1][0]])
ans

In [ ]:
sound_file = ans[0]
y, sr = librosa.load(sound_file)
librosa.output.write_wav('./genres/tmp.wav', y, sr, norm=True)
Audio(url='./genres/tmp.wav')

In [ ]:
sound_file = ans[1]
y, sr = librosa.load(sound_file)
librosa.output.write_wav('./genres/tmp.wav', y, sr, norm=True)
Audio(url='./genres/tmp.wav')

Maps of tracks by svd and tsne


In [ ]:
from sklearn.manifold import TSNE

In [ ]:
represent = lasagne.layers.get_output(nn.input_layer)
represent_fun = theano.function([input_X], [represent], allow_input_downcast=True)

In [ ]:
f = lambda x: np.array(represent_fun([x])[0])
track_vectors = map(f, X_train) + map(f, X_valid)
track_vectors = np.concatenate(track_vectors, axis=0)

track_labels = np.array(list(y_train) + list(y_valid))

In [ ]:
X_tsne = <Make TSNE Features>

In [ ]:
plt.figure(figsize=(10,10), dpi=500)
colors = cm.hot(np.linspace(0, 1, len(id2gener)))

for idx, gener in id2gener.items():
    idx_ = np.where(track_labels == idx)
    pylab.scatter(X_tsne[:, 0][idx_], X_tsne[:, 1][idx_], c=colors[idx], cmap=cm.hot, label=gener,s=50)

pylab.legend(loc=0, ncol=5)