In [74]:
import numpy as np  # needed for np.newaxis in the feature cell below
import pandas as pd
import os
import librosa
import minst.taxonomy
# Access the example notes using the notes_df
DATA_DIR = "./data/notes"
NOTES_CSV = os.path.join(DATA_DIR, "notes_index.csv")
notes_df = pd.read_csv(NOTES_CSV, index_col=0)
# Set the note_file path to the actual path
def note_fp(filename):
    return os.path.join(DATA_DIR, filename)
notes_df["note_file"] = notes_df["note_file"].map(note_fp)
# Use the taxonomy to drop any instruments that we don't care about.
notes_df = minst.taxonomy.normalize_instrument_names(notes_df).dropna()
print("Data Summary")
print(notes_df["dataset"].value_counts())
print(notes_df["instrument"].value_counts())
In [76]:
def load_mfccs_from_audio(note_file):
    y, sr = librosa.load(note_file)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13).T
    # Arbitrarily average over time for now; keeps one 13-d vector per note.
    return mfcc.mean(axis=0)[np.newaxis, :]
# This is going to take a minute...
mfccs = [load_mfccs_from_audio(x) for x in notes_df["note_file"]]
# put it in the dataframe for easy grabbing later
notes_df['mfcc'] = mfccs
notes_df.iloc[0]['mfcc'].shape
Out[76]:
(1, 13)
In [77]:
# Convert the labels
import numpy as np
import sklearn.model_selection
import sklearn.preprocessing
enc = sklearn.preprocessing.LabelEncoder()
targets = enc.fit_transform(notes_df["instrument"])
notes_df['target'] = targets
notes_df['target']
# Make some nice SKLearn-style data
X = np.concatenate(notes_df["mfcc"])
y = np.concatenate(notes_df["target"].map(np.atleast_1d))
print(X.shape, y.shape)
# Split it into train and test
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
In [78]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(sklearn.metrics.classification_report(y_test, y_pred))
In [ ]: