In [ ]:
##################
# Classification #
##################
In [ ]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn import svm, linear_model, naive_bayes, neural_network, neighbors, ensemble
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.base import clone  # needed below to create independent copies of the vectorizer
import random, math
import numpy as np
import scipy.sparse as sp
from datetime import datetime
from collections import Counter
from itertools import combinations
In [ ]:
with open('mxl-list.txt', 'r') as f:
    dataset = [piece.strip() for piece in f.readlines()]
# filenames encode the composer as the first '-'/'_'-separated token
bach_data = [f for f in dataset if f.replace('-', '_').split('_')[0] == 'bach']
beethoven_data = [f for f in dataset if f.replace('-', '_').split('_')[0] == 'beethoven']
debussy_data = [f for f in dataset if f.replace('-', '_').split('_')[0] == 'debussy']
scarlatti_data = [f for f in dataset if f.replace('-', '_').split('_')[0] == 'scarlatti']
victoria_data = [f for f in dataset if f.replace('-', '_').split('_')[0] == 'victoria']
In [ ]:
with open('bach-chordsequence.txt', 'r') as f:
    BACH = [' '.join(piece.strip('[]\n').split(', ')) for piece in f.readlines()]
BACH = [(BACH[i], bach_data[i]) for i in range(len(BACH))]
with open('beethoven-chordsequence.txt', 'r') as f:
    BEETHOVEN = [' '.join(piece.strip('[]\n').split(', ')) for piece in f.readlines()]
BEETHOVEN = [(BEETHOVEN[i], beethoven_data[i]) for i in range(len(BEETHOVEN))]
with open('debussy-chordsequence.txt', 'r') as f:
    DEBUSSY = [' '.join(piece.strip('[]\n').split(', ')) for piece in f.readlines()]
DEBUSSY = [(DEBUSSY[i], debussy_data[i]) for i in range(len(DEBUSSY))]
with open('scarlatti-chordsequence.txt', 'r') as f:
    SCARLATTI = [' '.join(piece.strip('[]\n').split(', ')) for piece in f.readlines()]
SCARLATTI = [(SCARLATTI[i], scarlatti_data[i]) for i in range(len(SCARLATTI))]
with open('victoria-chordsequence.txt', 'r') as f:
    VICTORIA = [' '.join(piece.strip('[]\n').split(', ')) for piece in f.readlines()]
VICTORIA = [(VICTORIA[i], victoria_data[i]) for i in range(len(VICTORIA))]
In [ ]:
def find_ngrams(input_list, N=4):
    return [' '.join(input_list[i:i+N]) for i in range(len(input_list)-N+1)]

def ngrams_by_composer(composer):
    for i in range(1, 5):
        ngrams = []
        for piece in composer:
            ngrams += find_ngrams(piece[0].split(' '), i)
        print(len(ngrams), '{}-grams total;'.format(i), len(set(ngrams)), 'unique')
    print('-')

def show_ngrams(composer_data, composer_name):
    print(composer_name, ':', len(composer_data), 'pieces')
    ngrams_by_composer(composer_data)
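In [ ]:
# Quick sanity check of find_ngrams on a made-up token list (these chord
# symbols are illustrative only; the real sequences use the file encodings):
print(find_ngrams(['C', 'G', 'Am', 'F'], N=2))
# expected: ['C G', 'G Am', 'Am F']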
In [ ]:
show_ngrams(BACH, 'bach')
show_ngrams(BEETHOVEN, 'beethoven')
show_ngrams(DEBUSSY, 'debussy')
show_ngrams(SCARLATTI, 'scarlatti')
show_ngrams(VICTORIA, 'victoria')
show_ngrams(BACH+BEETHOVEN+DEBUSSY+SCARLATTI+VICTORIA, 'all composers')
In [ ]:
def build_Xy(composers, size=1):
    if size >= 1:  # use all rows
        indices = [range(len(composer)) for composer in composers]
    else:  # sample a fraction `size` of each composer's pieces
        indices = [random.sample(range(len(composer)), math.ceil(size*len(composer))) for composer in composers]
    # label each piece with its composer's position in the input list
    y = []
    for i in range(len(composers)):
        y += [i for n in range(len(indices[i]))]
    X = []
    for i in range(len(composers)):
        X += [composers[i][j] for j in indices[i]]
    return X, np.array(y, dtype='int16')
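In [ ]:
# Illustrative check of build_Xy on two tiny made-up "composers"; labels
# come from list position, so the first composer's pieces get class 0:
toy_X, toy_y = build_Xy([[('a b', 'p1'), ('c d', 'p2')], [('e f', 'p3')]])
print(toy_X)  # [('a b', 'p1'), ('c d', 'p2'), ('e f', 'p3')]
print(toy_y)  # [0 0 1]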
In [ ]:
def crossvalidate(X_tuple, y, classifiers, vectorizer, NGRAMRANGE, K=10):
    for clf in classifiers:
        clf.cm_sum = np.zeros([len(set(y)), len(set(y))], dtype='int16')
        clf.accuracies, clf.fones, clf.misclassified, clf.runningtime = [], [], [], []
        clf.fones_micro, clf.fones_macro = [], []
        clf.name = str(clf).split('(')[0]
    X = np.array([piece[0] for piece in X_tuple])
    filenames = np.array([piece[1] for piece in X_tuple])
    kf = KFold(n_splits=K, shuffle=True)
    for train_index, test_index in kf.split(y):  # KFold only needs the sample count, so splitting on y works
        X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]
        vct = vectorizer.set_params(lowercase=False, token_pattern=r"(?u)\b\w+\b", ngram_range=NGRAMRANGE)
        X_train_tfidf = vct.fit_transform(X_train)
        # X_test_tfidf = vct.transform(X_test) would give the same result: transform()
        # reuses the idf weights learned at fit time and scores each document
        # independently, so transforming the pieces one by one is equivalent.
        X_test_tfidf = sp.vstack([vct.transform(np.array([piece])) for piece in X_test])
        for clf in classifiers:
            t = datetime.now()
            clf.fit(X_train_tfidf, y_train)
            y_pred = clf.predict(X_test_tfidf)
            clf.runningtime.append((datetime.now() - t).total_seconds())
            # pass labels explicitly so a fold missing a class still yields a full-size matrix
            clf.cm_sum += confusion_matrix(y_test, y_pred, labels=sorted(set(y)))
            clf.misclassified.append(test_index[np.where(y_test != y_pred)])  # http://stackoverflow.com/a/25570632
            clf.accuracies.append(accuracy_score(y_test, y_pred))
            clf.fones.append(f1_score(y_test, y_pred, average='weighted'))
            clf.fones_micro.append(f1_score(y_test, y_pred, average='micro'))
            clf.fones_macro.append(f1_score(y_test, y_pred, average='macro'))
    result = dict()
    for clf in classifiers:
        clf.misclassified = np.sort(np.hstack(clf.misclassified))
        result[clf.name] = [clf.cm_sum, clf.accuracies, clf.fones, clf.misclassified,
                            filenames[clf.misclassified], clf.runningtime, clf.fones_micro, clf.fones_macro]
    return result
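In [ ]:
# Quick check (illustrative, made-up token strings) that transforming test
# pieces one at a time matches a single batch transform: the idf weights are
# fixed at fit time, so the stacked rows should agree.
_v = TfidfVectorizer(sublinear_tf=True, lowercase=False, token_pattern=r"(?u)\b\w+\b")
_v.fit(np.array(['1 2 3 2', '2 3 4']))
_test = np.array(['1 3 3', '4 4 2'])
_batch = _v.transform(_test)
_stacked = sp.vstack([_v.transform(np.array([p])) for p in _test])
print(np.allclose(_batch.toarray(), _stacked.toarray()))  # expect True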
In [ ]:
def benchmark_classifiers(composers, NGRAMRANGES, classifiers, vectorizer, n=1, retrieve_title=True):
    misclassified_list = []
    for NGRAMRANGE in NGRAMRANGES:
        print('n-gram range', NGRAMRANGE)
        X, y = build_Xy(composers, size=n)
        cv_result = crossvalidate(X, y, classifiers, vectorizer, NGRAMRANGE)
        for clf, results in cv_result.items():
            print(clf)
            cm = results[0]
            print(cm)
            acc = results[1]  # accuracy is available, but the two f-measures below are reported instead
            # print('accuracy', round(np.mean(acc)*100,2), '({})'.format(round(np.std(acc, ddof=1)*100,2)))
            fones = results[2]  # weighted-average f1, likewise not reported
            # print('f1', round(np.mean(fones)*100,2), '({})'.format(round(np.std(fones, ddof=1)*100,2)), fones)
            misclassified = results[3]
            misclassified_filenames = results[4]
            misclassified_list += list(misclassified_filenames)
            # print('misclassified', [(misclassified[i], misclassified_filenames[i]) for i in range(len(misclassified))])
            runningtime = results[5]
            # print('running time', np.sum(runningtime))
            fones_micro = results[6]
            fones_macro = results[7]
            print('micro-averaged f-score (std) & macro-averaged f-score (std)')
            print(round(np.mean(fones_micro), 4), '({})'.format(round(np.std(fones_micro, ddof=1), 4)),
                  '&', round(np.mean(fones_macro), 4), '({})'.format(round(np.std(fones_macro, ddof=1), 4)))
        print('-----')
    return misclassified_list
In [ ]:
COMPOSERS = [BACH, BEETHOVEN, DEBUSSY, SCARLATTI, VICTORIA]
NGRAMRANGES = [(1,1),(2,2),(3,3),(4,4),(1,2),(3,4),(1,4)]
CLASSIFIERS = [svm.LinearSVC(penalty='l2', C=5, loss='hinge'),
               linear_model.LogisticRegression(penalty='l2', C=100, tol=1, multi_class='multinomial', solver='sag'),
               neighbors.KNeighborsClassifier(weights='distance'),
               naive_bayes.MultinomialNB(alpha=0.00001, fit_prior=False),
               neural_network.MLPClassifier(solver='lbfgs', hidden_layer_sizes=(10,))]
In [ ]:
# Compare different methods of vectorizing:
# sublinear tf-idf, binary presence/absence, and raw term counts
In [ ]:
VECTORIZER = TfidfVectorizer(sublinear_tf=True)
benchmark_classifiers(COMPOSERS, NGRAMRANGES, CLASSIFIERS, VECTORIZER)
In [ ]:
VECTORIZER = CountVectorizer(binary=True)
benchmark_classifiers(COMPOSERS, NGRAMRANGES, CLASSIFIERS, VECTORIZER)
In [ ]:
VECTORIZER = CountVectorizer()
benchmark_classifiers(COMPOSERS, NGRAMRANGES, CLASSIFIERS, VECTORIZER)
In [ ]:
# Test pairwise classification on every pair of composers
In [ ]:
NGRAMRANGES = [(1,2)]
VECTORIZER = TfidfVectorizer(sublinear_tf=True)
for indices in combinations(range(5), 2):
    print('composer indices', list(indices))  # COMPOSERS = [BACH, BEETHOVEN, DEBUSSY, SCARLATTI, VICTORIA]
    benchmark_classifiers([COMPOSERS[i] for i in indices], NGRAMRANGES, CLASSIFIERS, VECTORIZER)
In [ ]:
# Identify the often-misclassified files:
# run the experiment 100 times with the best classifier (linear SVM),
# then find pieces that are misclassified more than 50% of the time.
COMPOSERS = [BACH, BEETHOVEN, DEBUSSY, SCARLATTI, VICTORIA]
CLASSIFIERS = [svm.LinearSVC(penalty='l2', C=5, loss='hinge')]
appendix = []
for i in range(100):
    appendix += benchmark_classifiers(COMPOSERS, NGRAMRANGES, CLASSIFIERS, VECTORIZER)
Counter(appendix).most_common()
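In [ ]:
# Keep only pieces misclassified in more than half of the 100 runs (each piece
# appears in a test fold exactly once per run, so counts are out of 100):
[(name, count) for name, count in Counter(appendix).most_common() if count > 50]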
In [ ]:
# Use both chord sequences and duration data
In [ ]:
flatten = lambda l: [item for sublist in l for item in sublist]
# Duration files per composer: BD2 = Bach, BD = Beethoven, DD = Debussy,
# SD = Scarlatti, VD = Victoria
with open('bach-durations.txt', 'r') as f:
    BD2 = [line.strip() for line in f.readlines()]
with open('beethoven-durations.txt', 'r') as f:
    BD = [line.strip() for line in f.readlines()]
with open('debussy-durations.txt', 'r') as f:
    DD = [line.strip() for line in f.readlines()]
with open('scarlatti-durations.txt', 'r') as f:
    SD = [line.strip() for line in f.readlines()]
with open('victoria-durations.txt', 'r') as f:
    VD = [line.strip() for line in f.readlines()]
# each piece is a ';'-separated sequence of 'type|length' tokens
BD2_TYPELENGTH = [piece.split(';') for piece in BD2]
BD_TYPELENGTH = [piece.split(';') for piece in BD]
DD_TYPELENGTH = [piece.split(';') for piece in DD]
SD_TYPELENGTH = [piece.split(';') for piece in SD]
VD_TYPELENGTH = [piece.split(';') for piece in VD]
typelengths = list(set(flatten(BD2_TYPELENGTH+BD_TYPELENGTH+DD_TYPELENGTH+SD_TYPELENGTH+VD_TYPELENGTH)))
# map each distinct token to an integer id, offset by 300 (presumably to keep
# the duration vocabulary disjoint from other token vocabularies)
typelength_dict = {typelengths[i]: str(i+300) for i in range(len(typelengths))}
BD2_T = [(' '.join([typelength_dict[dur] for dur in piece]), 'temp') for piece in BD2_TYPELENGTH]
BD_T = [(' '.join([typelength_dict[dur] for dur in piece]), 'temp') for piece in BD_TYPELENGTH]
DD_T = [(' '.join([typelength_dict[dur] for dur in piece]), 'temp') for piece in DD_TYPELENGTH]
SD_T = [(' '.join([typelength_dict[dur] for dur in piece]), 'temp') for piece in SD_TYPELENGTH]
VD_T = [(' '.join([typelength_dict[dur] for dur in piece]), 'temp') for piece in VD_TYPELENGTH]
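In [ ]:
# Illustrative check of the duration encoding on made-up tokens (the
# 'type|length' format is assumed from the split('|') in the next cell):
toy_tokens = ['Chord|1.0', 'Note|0.5', 'Chord|1.0']
toy_dict = {t: str(i + 300) for i, t in enumerate(sorted(set(toy_tokens)))}
print(' '.join(toy_dict[t] for t in toy_tokens))  # e.g. '300 301 300'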
In [ ]:
# print the most common durations by composer, regardless of element type (chord/note/rest)
# BD_LENGTHONLY = [[string.split('|')[1] for string in piece.split(';')] for piece in BD]
# SD_LENGTHONLY = [[string.split('|')[1] for string in piece.split(';')] for piece in SD]
# BD2_LENGTHONLY = [[string.split('|')[1] for string in piece.split(';')] for piece in BD2]
# DD_LENGTHONLY = [[string.split('|')[1] for string in piece.split(';')] for piece in DD]
# VD_LENGTHONLY = [[string.split('|')[1] for string in piece.split(';')] for piece in VD]
# lengths = list(set(flatten(BD2_LENGTHONLY+BD_LENGTHONLY+DD_LENGTHONLY+SD_LENGTHONLY+VD_LENGTHONLY)))
# length_dict = {lengths[i]: str(i+200) for i in range(len(lengths))}
# BD_L = [(' '.join([length_dict[dur] for dur in piece]),'temp') for piece in BD_LENGTHONLY]
# BD2_L = [(' '.join([length_dict[dur] for dur in piece]),'temp') for piece in BD2_LENGTHONLY]
# SD_L = [(' '.join([length_dict[dur] for dur in piece]),'temp') for piece in SD_LENGTHONLY]
# DD_L = [(' '.join([length_dict[dur] for dur in piece]),'temp') for piece in DD_LENGTHONLY]
# VD_L = [(' '.join([length_dict[dur] for dur in piece]),'temp') for piece in VD_LENGTHONLY]
# duration_all = flatten(BD2_LENGTHONLY+BD_LENGTHONLY+DD_LENGTHONLY+SD_LENGTHONLY+VD_LENGTHONLY)
# duration_bach = flatten(BD2_LENGTHONLY)
# duration_beethoven = flatten(BD_LENGTHONLY)
# duration_debussy = flatten(DD_LENGTHONLY)
# duration_scarlatti = flatten(SD_LENGTHONLY)
# duration_victoria = flatten(VD_LENGTHONLY)
# for l in [duration_all,duration_bach,duration_beethoven,duration_debussy,duration_scarlatti,duration_victoria]:
# for key, value in Counter(l).most_common(10):
# print(key, '&', round(100*value/len(l),2))
# print('')
In [ ]:
def crossvalidate_twofeaturesets(X_tuple1, X_tuple2, y, classifiers, vectorizer, range1, range2, K=10):
    for clf in classifiers:
        clf.cm_sum = np.zeros([len(set(y)), len(set(y))], dtype='int16')
        clf.accuracies, clf.fones, clf.misclassified, clf.runningtime = [], [], [], []
        clf.fones_micro, clf.fones_macro = [], []
        clf.name = str(clf).split('(')[0]
    X1 = np.array([piece[0] for piece in X_tuple1])
    X2 = np.array([piece[0] for piece in X_tuple2])
    filenames = np.array([piece[1] for piece in X_tuple2])
    kf = KFold(n_splits=K, shuffle=True)
    for train_index, test_index in kf.split(y):
        y_train, y_test = y[train_index], y[test_index]
        X_train_new, X_test_new = X1[train_index], X1[test_index]
        X_train, X_test = X2[train_index], X2[test_index]
        # Use two genuinely separate vectorizers, one per feature set: set_params()
        # mutates and returns the same object, so without clone() both names would
        # point at one vectorizer and range1 would be silently overwritten by range2
        vct1 = clone(vectorizer).set_params(ngram_range=range1)
        vct2 = clone(vectorizer).set_params(ngram_range=range2)
        X_train_new_tfidf = vct1.fit_transform(X_train_new)
        X_test_new_tfidf = sp.vstack([vct1.transform(np.array([piece])) for piece in X_test_new])
        X_train_tfidf = vct2.fit_transform(X_train)
        X_test_tfidf = sp.vstack([vct2.transform(np.array([piece])) for piece in X_test])
        # Merge the two feature sets by horizontal concatenation
        X_train_tfidf = sp.hstack((X_train_tfidf, X_train_new_tfidf))
        X_test_tfidf = sp.hstack((X_test_tfidf, X_test_new_tfidf))
        for clf in classifiers:
            t = datetime.now()
            clf.fit(X_train_tfidf, y_train)
            y_pred = clf.predict(X_test_tfidf)
            clf.runningtime.append((datetime.now() - t).total_seconds())
            clf.cm_sum += confusion_matrix(y_test, y_pred, labels=sorted(set(y)))
            clf.misclassified.append(test_index[np.where(y_test != y_pred)])  # http://stackoverflow.com/a/25570632
            clf.accuracies.append(accuracy_score(y_test, y_pred))
            clf.fones.append(f1_score(y_test, y_pred, average='weighted'))
            clf.fones_micro.append(f1_score(y_test, y_pred, average='micro'))
            clf.fones_macro.append(f1_score(y_test, y_pred, average='macro'))
    result = dict()
    for clf in classifiers:
        clf.misclassified = np.sort(np.hstack(clf.misclassified))
        result[clf.name] = [clf.cm_sum, clf.accuracies, clf.fones, clf.misclassified,
                            filenames[clf.misclassified], clf.runningtime, clf.fones_micro, clf.fones_macro]
    return result
In [ ]:
def benchmark_classifiers_twofeaturesets(composers1, composers2, range1, range2, classifiers, vectorizer, n=1, retrieve_title=True):
    misclassified_list = []
    print('duration n-gram range:', range1, 'chord n-gram range:', range2)
    # note: with n=1, build_Xy keeps pieces in order, so the rows of X1 and X2
    # line up; sampling (n < 1) would break that correspondence
    X1, y = build_Xy(composers1, size=n)
    X2, y = build_Xy(composers2, size=n)
    cv_result = crossvalidate_twofeaturesets(X1, X2, y, classifiers, vectorizer, range1, range2)
    for clf, results in cv_result.items():
        print(clf)
        cm = results[0]
        print(cm)
        acc = results[1]
        fones = results[2]
        misclassified = results[3]
        misclassified_filenames = results[4]
        misclassified_list += list(misclassified_filenames)
        # print('misclassified', [(misclassified[i], misclassified_filenames[i]) for i in range(len(misclassified))])
        runningtime = results[5]
        # print('running time', np.sum(runningtime))
        fones_micro = results[6]
        fones_macro = results[7]
        print('micro-averaged f-score (std) & macro-averaged f-score (std)')
        print(round(np.mean(fones_micro), 4), '({})'.format(round(np.std(fones_micro, ddof=1), 4)),
              '&', round(np.mean(fones_macro), 4), '({})'.format(round(np.std(fones_macro, ddof=1), 4)))
    print('-----')
    return misclassified_list
In [ ]:
CLASSIFIERS = [svm.LinearSVC(penalty='l2', C=5, loss='hinge'),
               linear_model.LogisticRegression(penalty='l2', C=100, tol=1, multi_class='multinomial', solver='sag'),
               neighbors.KNeighborsClassifier(weights='distance'),
               naive_bayes.MultinomialNB(alpha=0.00001, fit_prior=False),
               neural_network.MLPClassifier(solver='lbfgs', hidden_layer_sizes=(10,))]
VECTORIZER = TfidfVectorizer(sublinear_tf=True, lowercase=False, token_pattern=r"(?u)\b\w+\b")
COMPOSERS1 = [BD2_T, BD_T, DD_T, SD_T, VD_T]
COMPOSERS2 = [BACH, BEETHOVEN, DEBUSSY, SCARLATTI, VICTORIA]
In [ ]:
benchmark_classifiers_twofeaturesets(COMPOSERS1, COMPOSERS2, (1,1), (1,2), CLASSIFIERS, VECTORIZER)
In [ ]:
for indices in combinations(range(5), 2):
    print('composer indices', list(indices))
    benchmark_classifiers_twofeaturesets([COMPOSERS1[i] for i in indices], [COMPOSERS2[i] for i in indices], (1,1), (1,2), CLASSIFIERS, VECTORIZER)
In [ ]:
# Identify the often-misclassified files, using both feature sets:
# run the experiment 100 times with the best classifier (linear SVM),
# then find pieces that are misclassified more than 50% of the time.
COMPOSERS1 = [BD2_T, BD_T, DD_T, SD_T, VD_T]
COMPOSERS2 = [BACH, BEETHOVEN, DEBUSSY, SCARLATTI, VICTORIA]
CLASSIFIERS = [svm.LinearSVC(penalty='l2', C=5, loss='hinge')]
appendix = []
for i in range(100):
    appendix += benchmark_classifiers_twofeaturesets(COMPOSERS1, COMPOSERS2, (1,1), (1,2), CLASSIFIERS, VECTORIZER)
Counter(appendix).most_common()
In [ ]: