In [ ]:
##################
# Classification #
##################

In [ ]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn import svm, linear_model, naive_bayes, neural_network, neighbors, ensemble
from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
import random, math
import numpy as np
import scipy.sparse as sp
from datetime import datetime
from collections import Counter
from itertools import combinations

In [ ]:
with open('mxl-list.txt', 'r') as f:
    dataset = [piece.strip() for piece in f.readlines()]
    # filenames are assumed to start with the composer name, delimited by '-' or '_'
    bach_data = [f for f in dataset if f.replace('-', '_').split('_')[0] == 'bach']
    beethoven_data = [f for f in dataset if f.replace('-', '_').split('_')[0] == 'beethoven']
    debussy_data = [f for f in dataset if f.replace('-', '_').split('_')[0] == 'debussy']
    scarlatti_data = [f for f in dataset if f.replace('-', '_').split('_')[0] == 'scarlatti']
    victoria_data = [f for f in dataset if f.replace('-', '_').split('_')[0] == 'victoria']

In [ ]:
with open('bach-chordsequence.txt', 'r') as f:
    # each line is a bracketed, comma-separated chord sequence (the repr of a Python list);
    # the chord files are assumed to be line-aligned with mxl-list.txt, so each sequence
    # can be paired with its filename by position
    BACH = [' '.join(piece.strip('[]\n').split(', ')) for piece in f.readlines()]
    BACH = [(BACH[i], bach_data[i]) for i in range(len(BACH))]
with open('beethoven-chordsequence.txt', 'r') as f:
    BEETHOVEN = [' '.join(piece.strip('[]\n').split(', ')) for piece in f.readlines()]
    BEETHOVEN = [(BEETHOVEN[i], beethoven_data[i]) for i in range(len(BEETHOVEN))]
with open('debussy-chordsequence.txt', 'r') as f:
    DEBUSSY = [' '.join(piece.strip('[]\n').split(', ')) for piece in f.readlines()]
    DEBUSSY = [(DEBUSSY[i], debussy_data[i]) for i in range(len(DEBUSSY))]
with open('scarlatti-chordsequence.txt', 'r') as f:
    SCARLATTI = [' '.join(piece.strip('[]\n').split(', ')) for piece in f.readlines()]
    SCARLATTI = [(SCARLATTI[i], scarlatti_data[i]) for i in range(len(SCARLATTI))]
with open('victoria-chordsequence.txt', 'r') as f:
    VICTORIA = [' '.join(piece.strip('[]\n').split(', ')) for piece in f.readlines()]
    VICTORIA = [(VICTORIA[i], victoria_data[i]) for i in range(len(VICTORIA))]

In [ ]:
def find_ngrams(input_list, N=4):
    return [' '.join(input_list[i:i+N]) for i in range(len(input_list)-N+1)]

def ngrams_by_composer(composer): 
    for i in range(1,5):
        ngrams = []
        for piece in composer:
            ngrams += find_ngrams(piece[0].split(' '), i)
        print(len(ngrams), '{}-grams total;'.format(i), len(set(ngrams)), 'unique')
    print('-')

def show_ngrams(composer_data, composer_name):
    print(composer_name, ':', len(composer_data), 'pieces')
    ngrams_by_composer(composer_data)
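
In [ ]:
# A quick illustration of find_ngrams on a tiny hypothetical token sequence:
find_ngrams(['I', 'IV', 'V', 'I'], N=2)
# -> ['I IV', 'IV V', 'V I']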

In [ ]:
show_ngrams(BACH,'bach')
show_ngrams(BEETHOVEN,'beethoven')
show_ngrams(DEBUSSY,'debussy')
show_ngrams(SCARLATTI,'scarlatti')
show_ngrams(VICTORIA, 'victoria')
show_ngrams(BACH+BEETHOVEN+DEBUSSY+SCARLATTI+VICTORIA, 'all composers')

In [ ]:
def build_Xy(composers, size=1):
    if size >= 1: # use every piece
        indices = [range(len(composer)) for composer in composers]
    else:
        indices = [random.sample(range(len(composer)), math.ceil(size*len(composer))) for composer in composers]

    y = []
    for i in range(len(composers)):
        y += [i] * len(indices[i])
    
    X = []
    for i in range(len(composers)):
        X += [composers[i][j] for j in indices[i]]
    
    return X, np.array(y, dtype='int16')
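
In [ ]:
# Minimal sanity check of build_Xy on a hypothetical two-composer input:
# with size=1 every piece is kept, and the label vector concatenates one
# class index per composer.
X_demo, y_demo = build_Xy([[('I IV V', 'a.mxl'), ('V I', 'b.mxl')], [('ii V I', 'c.mxl')]])
print(y_demo)  # -> [0 0 1]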

In [ ]:
def crossvalidate(X_tuple, y, classifiers, vectorizer, NGRAMRANGE, K=10):    
    for clf in classifiers:
        clf.cm_sum = np.zeros([len(set(y)),len(set(y))], dtype='int16')
        clf.accuracies, clf.fones, clf.misclassified, clf.runningtime = [], [], [], []
        clf.fones_micro, clf.fones_macro = [], []
        clf.name = str(clf).split('(')[0]

    X = np.array([piece[0] for piece in X_tuple])
    filenames = np.array([piece[1] for piece in X_tuple])
    kf = KFold(n_splits=K, shuffle=True)  # plain (unstratified) K-fold; split() only uses the sample count, so passing y works
    for train_index, test_index in kf.split(y):
        X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]
        
        # relax the default token pattern (which drops single-character tokens)
        # and disable lowercasing so tokens pass through unchanged
        vct = vectorizer.set_params(lowercase=False, token_pattern=u"(?u)\\b\\w+\\b", ngram_range=NGRAMRANGE)
        X_train_tfidf = vct.fit_transform(X_train)
        # transform() scores each test piece independently, using only the idf
        # statistics learned from the training fold, so transforming the whole
        # fold at once already treats every piece separately (and is faster
        # than a piece-by-piece vstack)
        X_test_tfidf = vct.transform(X_test)
        
        for clf in classifiers:
            t = datetime.now()
            clf.fit(X_train_tfidf, y_train)
            y_pred = clf.predict(X_test_tfidf)
            clf.runningtime.append((datetime.now()-t).total_seconds())
            clf.cm_sum += confusion_matrix(y_test, y_pred, labels=np.unique(y))  # fix the label set so the matrix shape is stable even if a fold misses a class
            clf.misclassified.append(test_index[np.where(y_test != y_pred)]) # http://stackoverflow.com/a/25570632
            clf.accuracies.append(accuracy_score(y_test, y_pred))
            clf.fones.append(f1_score(y_test, y_pred, average='weighted'))
            clf.fones_micro.append(f1_score(y_test, y_pred, average='micro'))
            clf.fones_macro.append(f1_score(y_test, y_pred, average='macro'))

    result = dict()
    for clf in classifiers:
        clf.misclassified = np.sort(np.hstack(clf.misclassified))
        result[clf.name] = [clf.cm_sum, clf.accuracies, clf.fones, clf.misclassified, filenames[clf.misclassified], clf.runningtime, clf.fones_micro, clf.fones_macro]
    return result

In [ ]:
def benchmark_classifiers(composers, NGRAMRANGES, classifiers, vectorizer, n=1, retrieve_title=True):
    misclassified_list = []
    for NGRAMRANGE in NGRAMRANGES:
        print('n-gram range', NGRAMRANGE)
        X, y = build_Xy(composers, size=n)
        cv_result = crossvalidate(X, y, classifiers, vectorizer, NGRAMRANGE)
        for clf, results in cv_result.items():
            print(clf)
            cm = results[0]
            print(cm)
            acc = results[1] # accuracy: unused, since micro/macro F-scores are reported below
#             print('accuracy', round(np.mean(acc)*100,2), '({})'.format(round(np.std(acc, ddof=1)*100,2)))
            fones = results[2] # weighted-average F1: likewise unused
#             print('f1', round(np.mean(fones)*100,2), '({})'.format(round(np.std(fones, ddof=1)*100,2)), fones)
            misclassified = results[3]
            misclassified_filenames = results[4]
            misclassified_list += list(misclassified_filenames)
#             print('misclassified',[(misclassified[i], misclassified_filenames[i]) for i in range(len(misclassified))])
            runningtime = results[5]
#             print('running time', np.sum(runningtime))
            fones_micro = results[6]
            fones_macro = results[7]
            print('micro-averaged f-score (std) & macro-averaged f-score (std)')
            print(round(np.mean(fones_micro),4), '({})'.format(round(np.std(fones_micro, ddof=1),4)), '&', round(np.mean(fones_macro),4), '({})'.format(round(np.std(fones_macro, ddof=1),4)))
    print('-----')
    return misclassified_list

In [ ]:
COMPOSERS = [BACH, BEETHOVEN, DEBUSSY, SCARLATTI, VICTORIA]
NGRAMRANGES = [(1,1),(2,2),(3,3),(4,4),(1,2),(3,4),(1,4)]
CLASSIFIERS = [svm.LinearSVC(penalty='l2', C=5, loss='hinge'),
               linear_model.LogisticRegression(penalty='l2', C=100, tol=1, multi_class='multinomial', solver='sag'),
               neighbors.KNeighborsClassifier(weights='distance'),
               naive_bayes.MultinomialNB(alpha=0.00001, fit_prior=False),
               neural_network.MLPClassifier(solver='lbfgs',hidden_layer_sizes=(10,))]

In [ ]:
# Compare different methods of vectorizing

In [ ]:
VECTORIZER = TfidfVectorizer(sublinear_tf=True)  # tf-idf with log-scaled term frequencies
benchmark_classifiers(COMPOSERS,NGRAMRANGES,CLASSIFIERS,VECTORIZER)

In [ ]:
VECTORIZER = CountVectorizer(binary=True)  # binary presence/absence of each n-gram
benchmark_classifiers(COMPOSERS,NGRAMRANGES,CLASSIFIERS,VECTORIZER)

In [ ]:
VECTORIZER = CountVectorizer()  # raw n-gram counts
benchmark_classifiers(COMPOSERS,NGRAMRANGES,CLASSIFIERS,VECTORIZER)

In [ ]:
# Test pairwise classification

In [ ]:
NGRAMRANGES = [(1,2)]
VECTORIZER = TfidfVectorizer(sublinear_tf=True)
for indices in combinations(range(5),2):
    print('composer indices', list(indices)) # COMPOSERS = [BACH, BEETHOVEN, DEBUSSY, SCARLATTI, VICTORIA]
    benchmark_classifiers([COMPOSERS[i] for i in indices],NGRAMRANGES,CLASSIFIERS,VECTORIZER)

In [ ]:
# Identify the often-misclassified files:
# run the experiment 100 times with the best classifier (linear SVM),
# then find pieces that are misclassified more than 50% of the time
COMPOSERS = [BACH, BEETHOVEN, DEBUSSY, SCARLATTI, VICTORIA]
CLASSIFIERS = [svm.LinearSVC(penalty='l2', C=5, loss='hinge')]
appendix = []
for i in range(100):
    appendix += benchmark_classifiers(COMPOSERS,NGRAMRANGES,CLASSIFIERS,VECTORIZER)
Counter(appendix).most_common()
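
In [ ]:
# Keep only the pieces misclassified in more than half of the 100 runs
# (each run misclassifies a given piece at most once, since each piece
# appears in exactly one test fold and NGRAMRANGES is the single range (1,2)):
[(name, count) for name, count in Counter(appendix).most_common() if count > 50]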

In [ ]:
# Use both chord sequences and duration data

In [ ]:
flatten = lambda l: [item for sublist in l for item in sublist]
with open('bach-durations.txt', 'r') as f:
    BD2 = [line.strip() for line in f.readlines()]  # Bach ('BD' is already Beethoven below)
with open('beethoven-durations.txt', 'r') as f:
    BD = [line.strip() for line in f.readlines()]   # Beethoven
with open('debussy-durations.txt', 'r') as f:
    DD = [line.strip() for line in f.readlines()]   # Debussy
with open('scarlatti-durations.txt', 'r') as f:
    SD = [line.strip() for line in f.readlines()]   # Scarlatti
with open('victoria-durations.txt', 'r') as f:
    VD = [line.strip() for line in f.readlines()]   # Victoria
    
BD2_TYPELENGTH = [piece.split(';') for piece in BD2]
BD_TYPELENGTH = [piece.split(';') for piece in BD]
DD_TYPELENGTH = [piece.split(';') for piece in DD]
SD_TYPELENGTH = [piece.split(';') for piece in SD]
VD_TYPELENGTH = [piece.split(';') for piece in VD]

typelengths = list(set(flatten(BD2_TYPELENGTH+BD_TYPELENGTH+DD_TYPELENGTH+SD_TYPELENGTH+VD_TYPELENGTH)))
# map each distinct 'type|length' string to a numeric token; the +300 offset
# presumably keeps these duration tokens disjoint from the chord vocabulary
# when both feature sets go through the same word tokenizer
typelength_dict = {typelengths[i]: str(i+300) for i in range(len(typelengths))}
# 'temp' is a placeholder title; only the chord feature set carries real filenames
BD2_T = [(' '.join([typelength_dict[dur] for dur in piece]),'temp') for piece in BD2_TYPELENGTH]
BD_T = [(' '.join([typelength_dict[dur] for dur in piece]),'temp') for piece in BD_TYPELENGTH]
DD_T = [(' '.join([typelength_dict[dur] for dur in piece]),'temp') for piece in DD_TYPELENGTH]
SD_T = [(' '.join([typelength_dict[dur] for dur in piece]),'temp') for piece in SD_TYPELENGTH]
VD_T = [(' '.join([typelength_dict[dur] for dur in piece]),'temp') for piece in VD_TYPELENGTH]
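
In [ ]:
# Peek at the encoding: each piece is now a space-separated string of numeric
# duration tokens paired with a placeholder title, e.g. ('304 317 ...', 'temp');
# the exact token values depend on the (unordered) set of typelengths.
BD2_T[0]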

In [ ]:
# print most common duration by composer, regardless of element type (chord/note/rest)

# BD_LENGTHONLY = [[string.split('|')[1] for string in piece.split(';')] for piece in BD]
# SD_LENGTHONLY = [[string.split('|')[1] for string in piece.split(';')] for piece in SD]
# BD2_LENGTHONLY = [[string.split('|')[1] for string in piece.split(';')] for piece in BD2]
# DD_LENGTHONLY = [[string.split('|')[1] for string in piece.split(';')] for piece in DD]
# VD_LENGTHONLY = [[string.split('|')[1] for string in piece.split(';')] for piece in VD]
# lengths = list(set(flatten(BD2_LENGTHONLY+BD_LENGTHONLY+DD_LENGTHONLY+SD_LENGTHONLY+VD_LENGTHONLY)))
# length_dict = {lengths[i]: str(i+200) for i in range(len(lengths))}
# BD_L = [(' '.join([length_dict[dur] for dur in piece]),'temp') for piece in BD_LENGTHONLY]
# BD2_L = [(' '.join([length_dict[dur] for dur in piece]),'temp') for piece in BD2_LENGTHONLY]
# SD_L = [(' '.join([length_dict[dur] for dur in piece]),'temp') for piece in SD_LENGTHONLY]
# DD_L = [(' '.join([length_dict[dur] for dur in piece]),'temp') for piece in DD_LENGTHONLY]
# VD_L = [(' '.join([length_dict[dur] for dur in piece]),'temp') for piece in VD_LENGTHONLY]
# duration_all       = flatten(BD2_LENGTHONLY+BD_LENGTHONLY+DD_LENGTHONLY+SD_LENGTHONLY+VD_LENGTHONLY)
# duration_bach      = flatten(BD2_LENGTHONLY)
# duration_beethoven = flatten(BD_LENGTHONLY)
# duration_debussy   = flatten(DD_LENGTHONLY)
# duration_scarlatti = flatten(SD_LENGTHONLY)
# duration_victoria  = flatten(VD_LENGTHONLY)

# for l in [duration_all,duration_bach,duration_beethoven,duration_debussy,duration_scarlatti,duration_victoria]:
#     for key, value in Counter(l).most_common(10):
#         print(key, '&', round(100*value/len(l),2))
#     print('')

In [ ]:
def crossvalidate_twofeaturesets(X_tuple1, X_tuple2, y, classifiers, vectorizer, range1, range2, K=10):    
    for clf in classifiers:
        clf.cm_sum = np.zeros([len(set(y)),len(set(y))], dtype='int16')
        clf.accuracies, clf.fones, clf.misclassified, clf.runningtime = [], [], [], []
        clf.fones_micro, clf.fones_macro = [], []
        clf.name = str(clf).split('(')[0]

    X1 = np.array([piece[0] for piece in X_tuple1])
    X2 = np.array([piece[0] for piece in X_tuple2])
    filenames = np.array([piece[1] for piece in X_tuple2])
    kf = KFold(n_splits=K, shuffle=True)
    for train_index, test_index in kf.split(y):
        y_train, y_test = y[train_index], y[test_index]
        X_train_new, X_test_new = X1[train_index], X1[test_index]
        X_train, X_test = X2[train_index], X2[test_index]
        # set_params() returns self, so reusing one vectorizer object here would
        # silently fit the duration features with the chord n-gram range; clone()
        # gives two genuinely independent vectorizers, one per feature set
        vct1 = clone(vectorizer).set_params(ngram_range=range1)
        vct2 = clone(vectorizer).set_params(ngram_range=range2)

        X_train_new_tfidf = vct1.fit_transform(X_train_new)
        X_test_new_tfidf = vct1.transform(X_test_new)  # transform() scores each piece independently (see crossvalidate above)
        X_train_tfidf = vct2.fit_transform(X_train)
        X_test_tfidf = vct2.transform(X_test)
        
        X_train_tfidf = sp.hstack((X_train_tfidf, X_train_new_tfidf)) # Merge the two feature sets
        X_test_tfidf = sp.hstack((X_test_tfidf, X_test_new_tfidf))
        
        for clf in classifiers:
            t = datetime.now()
            clf.fit(X_train_tfidf, y_train)
            y_pred = clf.predict(X_test_tfidf)
            clf.runningtime.append((datetime.now()-t).total_seconds())
            clf.cm_sum += confusion_matrix(y_test, y_pred, labels=np.unique(y))  # fixed label set, as in crossvalidate
            clf.misclassified.append(test_index[np.where(y_test != y_pred)]) # http://stackoverflow.com/a/25570632
            clf.accuracies.append(accuracy_score(y_test, y_pred))
            clf.fones.append(f1_score(y_test, y_pred, average='weighted'))
            clf.fones_micro.append(f1_score(y_test, y_pred, average='micro'))
            clf.fones_macro.append(f1_score(y_test, y_pred, average='macro'))

    result = dict()
    for clf in classifiers:
        clf.misclassified = np.sort(np.hstack(clf.misclassified))
        result[clf.name] = [clf.cm_sum, clf.accuracies, clf.fones, clf.misclassified, filenames[clf.misclassified], clf.runningtime, clf.fones_micro, clf.fones_macro]
    return result
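
In [ ]:
# How the two feature blocks combine: sp.hstack pastes the duration and chord
# matrices side by side, so each piece's row is the concatenation of both
# feature vectors (toy sparse matrices, shown dense for readability):
A = sp.csr_matrix([[1, 0], [0, 2]])
B = sp.csr_matrix([[3], [4]])
sp.hstack((A, B)).toarray()  # -> [[1, 0, 3], [0, 2, 4]]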

In [ ]:
def benchmark_classifiers_twofeaturesets(composers1, composers2, range1, range2, classifiers, vectorizer, n=1, retrieve_title=True):
    misclassified_list = []
    print('duration n-gram range:', range1, 'chord n-gram range:', range2)
    X1, y = build_Xy(composers1, size=n)
    X2, y = build_Xy(composers2, size=n)  # with size=1 both calls keep every piece in the same order, so rows and labels stay aligned; subsampling (size<1) would break this
    cv_result = crossvalidate_twofeaturesets(X1, X2, y, classifiers, vectorizer, range1, range2)
    for clf, results in cv_result.items():
        print(clf)
        cm = results[0]
        print(cm)
        acc = results[1]
        fones = results[2]
        misclassified = results[3]
        misclassified_filenames = results[4]
        misclassified_list += list(misclassified_filenames)
#         print('misclassified',[(misclassified[i], misclassified_filenames[i]) for i in range(len(misclassified))])
        runningtime = results[5]
#         print('running time', np.sum(runningtime))
        fones_micro = results[6]
        fones_macro = results[7]
        print('micro-averaged f-score (std) & macro-averaged f-score (std)')
        print(round(np.mean(fones_micro),4), '({})'.format(round(np.std(fones_micro, ddof=1),4)), '&', round(np.mean(fones_macro),4), '({})'.format(round(np.std(fones_macro, ddof=1),4)))
    print('-----')
    return misclassified_list

In [ ]:
CLASSIFIERS = [svm.LinearSVC(penalty='l2', C=5, loss='hinge'),
               linear_model.LogisticRegression(penalty='l2', C=100, tol=1, multi_class='multinomial', solver='sag'),
               neighbors.KNeighborsClassifier(weights='distance'),
               naive_bayes.MultinomialNB(alpha=0.00001, fit_prior=False),
               neural_network.MLPClassifier(solver='lbfgs',hidden_layer_sizes=(10,))]
VECTORIZER = TfidfVectorizer(sublinear_tf=True, lowercase=False, token_pattern=u"(?u)\\b\\w+\\b")  # tokenizer settings baked in here; crossvalidate_twofeaturesets only overrides ngram_range

COMPOSERS1 = [BD2_T,BD_T,DD_T,SD_T,VD_T]
COMPOSERS2 = [BACH, BEETHOVEN, DEBUSSY, SCARLATTI, VICTORIA]

In [ ]:
benchmark_classifiers_twofeaturesets(COMPOSERS1, COMPOSERS2, (1,1), (1,2), CLASSIFIERS, VECTORIZER)

In [ ]:
for indices in combinations(range(5),2):
    print('composer indices', list(indices)) # indices into COMPOSERS1/COMPOSERS2
    benchmark_classifiers_twofeaturesets([COMPOSERS1[i] for i in indices],[COMPOSERS2[i] for i in indices],(1,1),(1,2),CLASSIFIERS,VECTORIZER)

In [ ]:
# Identify the often-misclassified files, using both feature sets:
# run the experiment 100 times with the best classifier (linear SVM),
# then find pieces that are misclassified more than 50% of the time
COMPOSERS1 = [BD2_T,BD_T,DD_T,SD_T,VD_T]
COMPOSERS2 = [BACH, BEETHOVEN, DEBUSSY, SCARLATTI, VICTORIA]
CLASSIFIERS = [svm.LinearSVC(penalty='l2', C=5, loss='hinge')]
appendix = []
for i in range(100):
    appendix += benchmark_classifiers_twofeaturesets(COMPOSERS1, COMPOSERS2, (1,1), (1,2), CLASSIFIERS, VECTORIZER)
Counter(appendix).most_common()
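
In [ ]:
# As before, keep only the pieces misclassified in more than half of the runs:
[(name, count) for name, count in Counter(appendix).most_common() if count > 50]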

In [ ]: