In [1]:
#################
# Preprocessing #
#################
# Scores by other composers from the Bach family were removed beforehand.
# Miscellaneous scores such as mass settings were also removed; the assumption is that, because
# different interpretations of the same piece (e.g. Ave Maria) exist, including these pieces might
# hurt prediction accuracy, which here is based mostly on the chord progression
# (more precisely, on a reduced version of the chord progression).

# In the shell, find and copy the MIDI files into the target data directory and convert them to MusicXML (.mxl):
'''
cd {TARGETDIR}
find {MIDIFILEDIR} \( -name "bach*.mid" -o -name "beethoven*.mid" -o -name "scarlatti*.mid" \) -type f -exec cp {} . \;
find . -type f -name "*.mid" -exec /Applications/MuseScore\ 2.app/Contents/MacOS/mscore {} --export-to {}.mxl \;
for f in *.mxl; do mv "$f" "${f%.mid.mxl}.mxl"; done
ls *.mxl > mxl_list.txt
'''


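In [ ]:
# For reference only: a rough Python equivalent of the shell snippet above, using subprocess to drive
# the same MuseScore 2 binary. This is an untested sketch; the install path and the assumption that the
# notebook's working directory is the target data directory come from the shell version above.
import glob, subprocess
from pathlib import Path

MSCORE = '/Applications/MuseScore 2.app/Contents/MacOS/mscore'

for mid in glob.glob('*.mid'):
    mxl = Path(mid).with_suffix('.mxl')            # e.g. bach_piece.mid -> bach_piece.mxl
    subprocess.run([MSCORE, mid, '--export-to', str(mxl)])

with open('mxl_list.txt', 'w') as f:
    f.write('\n'.join(sorted(glob.glob('*.mxl'))) + '\n')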

In [2]:
from music21 import *
from os import listdir
from os.path import isfile, getsize

In [3]:
# Timeout context manager that lets us skip files that take too long to parse.
# by Thomas Ahle: http://stackoverflow.com/a/22348885
import signal

class timeout:
    def __init__(self, seconds=1, error_message='Timeout'):
        self.seconds = seconds
        self.error_message = error_message
    def handle_timeout(self, signum, frame):
        raise TimeoutError(self.error_message)
    def __enter__(self):
        signal.signal(signal.SIGALRM, self.handle_timeout)
        signal.alarm(self.seconds)
    def __exit__(self, type, value, traceback):
        signal.alarm(0)
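
In [ ]:
# A minimal check of the timeout context manager above: the sleep exceeds the limit, so the SIGALRM
# handler raises TimeoutError, which we catch here (the one-second limit is just for illustration).
import time

try:
    with timeout(seconds=1):
        time.sleep(2)
except TimeoutError:
    print('timed out as expected')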

In [4]:
def parse(mxllist, composer):
    # select the files whose name starts with this composer's name (prefix before the first '-' or '_')
    composer_list = [f for f in mxllist if f.replace('-', '_').split('_')[0] == composer]
    for file in composer_list:
        if getsize(file) > 10000: # skip very small files, which likely contain few or no notes
            with timeout(seconds=6000):
                try:
                    s = converter.parse(file)
                    try:
                        k = s.flat.keySignature.sharps
                    except AttributeError:
                        # no explicit key signature; fall back to music21's key analysis
                        k = s.analyze('key').sharps
                    except:
                        with open('{}-parsed.txt'.format(composer), 'a') as output_file:
                            output_file.write('key could not be analyzed\n')
                        with open('{}-transposed.txt'.format(composer), 'a') as output_file:
                            output_file.write('key could not be analyzed\n')
                        continue
                    # transposing up (sharps*5) % 12 semitones normalizes every piece to C major / A minor
                    t = s.transpose((k*5)%12)
                except:
                    # parsing (or key analysis / transposition) failed or timed out; write a placeholder line to both output files
                    with open('{}-parsed.txt'.format(composer), 'a') as output_file:
                        output_file.write('timeout\n')
                    with open('{}-transposed.txt'.format(composer), 'a') as output_file:
                        output_file.write('timeout\n')
                    continue

            # freeze() pickles each stream to disk and returns the path of the pickle file
            fp_s = converter.freeze(s, fmt='pickle')
            fp_t = converter.freeze(t, fmt='pickle')

            with open('{}-parsed.txt'.format(composer), 'a') as output_file:
                output_file.write(fp_s+'\n')
            with open('{}-transposed.txt'.format(composer), 'a') as output_file:
                output_file.write(fp_t+'\n')
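
In [ ]:
# Sanity check of the transposition rule used in parse(): a key signature with k sharps (negative k
# for flats) has tonic (7*k) % 12, so shifting up (5*k) % 12 semitones always lands on C (pitch class 0).
for k in range(-7, 8):
    assert ((7*k) % 12 + (5*k) % 12) % 12 == 0
print('all key signatures normalize to C')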

In [ ]:
with open('mxl_list.txt', 'r') as f:
    mxllist = [line.strip() for line in f.readlines()]

parse(mxllist, 'bach')
parse(mxllist, 'beethoven')
parse(mxllist, 'debussy')
parse(mxllist, 'scarlatti')
parse(mxllist, 'victoria')

In [5]:
######################
# Feature Extraction #
######################

In [6]:
import itertools
from collections import Counter

flatten = lambda l: [item for sublist in l for item in sublist] # by Alex Martinelli & Guillaume Jacquenot: http://stackoverflow.com/a/952952
uniqify = lambda seq: list(set(seq))
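
In [ ]:
# Quick illustration of the two helpers: flatten concatenates nested lists; uniqify removes duplicates
# (without preserving order, since it goes through a set).
print(flatten([[1, 2], [3], [4, 5]]))   # [1, 2, 3, 4, 5]
print(sorted(uniqify([3, 1, 3, 2, 1]))) # [1, 2, 3]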

In [7]:
# Define known chord types as pitch-class intervals above the root
major, minor, suspended, augmented, diminished = [0,4,7], [0,3,7], [0,5,7], [0,4,8], [0,3,6]
major_sixth, minor_sixth = [0,4,7,9], [0,3,7,9]
dominant_seventh, major_seventh, minor_seventh = [0,4,7,10], [0,4,7,11], [0,3,7,10]
half_diminished_seventh, diminished_seventh = [0,3,6,10], [0,3,6,9]
major_ninth, dominant_ninth, dominant_minor_ninth, minor_ninth = [0,2,4,7,11], [0,2,4,7,10], [0,1,4,7,10], [0,2,3,7,10]

chord_types_list = [major, minor, suspended, augmented, diminished, major_sixth, minor_sixth,
                    dominant_seventh, major_seventh, minor_seventh, half_diminished_seventh,
                    diminished_seventh, major_ninth, dominant_ninth, dominant_minor_ninth, minor_ninth]
chord_types_string = ['major', 'minor', 'suspended', 'augmented', 'diminished', 'major_sixth', 'minor_sixth',
                      'dominant_seventh', 'major_seventh', 'minor_seventh', 'half_diminished_seventh',
                      'diminished_seventh', 'major_ninth', 'dominant_ninth', 'dominant_minor_ninth', 'minor_ninth']

roots = list(range(12)) # the 12 pitch classes (0 = C, ..., 11 = B)
# chord vocabulary: every chord type at every root, as sets of pitch classes (12 roots x 16 types = 192)
chord_orders = flatten([[{(n+r)%12 for n in v} for v in chord_types_list] for r in roots])
# some entries coincide (e.g. augmented triads and diminished sevenths repeat across roots); keep one copy of each distinct set
unique_orders = []
for i in range(192):
    if chord_orders[i] not in unique_orders:
        unique_orders.append(chord_orders[i])
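
In [ ]:
# The vocabulary has 12 roots x 16 chord types = 192 entries; unique_orders is smaller because some
# entries coincide (augmented triads and diminished sevenths give the same pitch-class set at several
# roots). An index i into chord_orders encodes root i // 16 and chord type i % 16.
print(len(chord_orders), len(unique_orders))
print(chord_orders[0], chord_types_string[0]) # {0, 4, 7} major, i.e. a C major triad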

In [8]:
def merge_chords(s):
    # Collapse everything sounding in each quarter-note window [i, i+1) into a single Chord.
    sf = s.flat
    chords_by_offset = []
    for i in range(int(sf.highestTime)):
        notes_in_window = sf.getElementsByOffset(i, i+1, includeEndBoundary=False,
                                                 mustFinishInSpan=False, mustBeginInSpan=False).notes
        chords_by_offset.append(chord.Chord(notes_in_window))
    return chords_by_offset

def find_neighbor_note(n, k):
    # find notes k steps away from n
    return (roots[n-6:]+roots[:(n+6)%12])[6+k], (roots[n-6:]+roots[:(n+6)%12])[6-k]

def find_note_distance(n1, n2):
    # smallest number of semitone steps between pitch classes n1 and n2 (0 to 6)
    return abs(6 - (roots[n1-6:]+roots[:(n1+6)%12]).index(n2))

def find_chord_distance(set1, set2):
    # cheapest way (in semitone steps, plus a penalty for added/deleted notes) to turn one pitch-class set into the other
    d1, d2 = set1.difference(set2), set2.difference(set1)
    if len(d1) < len(d2):
        longer, shorter = d2, list(d1)
    else:
        longer, shorter = d1, list(d2)
    distances = []
    for combination in itertools.combinations(longer, len(shorter)):
        for permutation in itertools.permutations(combination):
            dist_p = abs(len(d1)-len(d2))*3 # length difference means notes need to be added/deleted. weighted by 3
            for i in range(len(shorter)):
                dist_p += find_note_distance(shorter[i], permutation[i])
            distances.append(dist_p)
    return min(distances)
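
In [ ]:
# Small check of the distance helpers: pitch classes are compared on a circle, so C to G is 5 semitone
# steps (not 7), and the chord distance is the cheapest way to move the differing notes onto each other.
print(find_note_distance(0, 7))                    # C -> G: 5
print(find_chord_distance({0, 4, 7}, {2, 7, 11}))  # C major vs. G major: B->C (1) + D->E (2) = 3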

In [9]:
CACHE = dict()

def find_closest_chord(c, cache=CACHE):
    if len(c) == 0:
        return -1 # use -1 for rest (chords are 0 to 191)
    
    # retrieve from existing knowledge
    o_str, o, p = str(c.normalOrder), set(c.normalOrder), c.pitchClasses
    if o in chord_orders:
        return chord_orders.index(o)
        # The root implied by this index sometimes differs from c.findRoot(), which might be more reliable;
        # such cases are rare, though, so this is good enough for now.
    if o_str in cache:
        return cache[o_str]
    
    # find closest chord from scratch
    chord_distances = dict()
    most_common_note = Counter(c.pitchClasses).most_common(1)[0][0]

    for i in range(192): # 192 = 12 roots x 16 chord types
        d = find_chord_distance(o, chord_orders[i])
        # favor candidates whose root (i // 16) is the most common pitch class of the observed chord
        if i // 16 == most_common_note:
            d -= 1
        if chord_distances.get(d) is None:
            chord_distances[d] = []
        chord_distances[d].append(i)

    # if several chords tie for the minimum distance, take the first one (this tie-break could be refined)
    closest_chord = chord_distances[min(chord_distances.keys())][0]
    
    cache[o_str] = closest_chord
    return closest_chord
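
In [ ]:
# Decoding the index returned by find_closest_chord: i // 16 is the root pitch class and i % 16 the
# chord type. An A minor triad is an exact match in the vocabulary, so no distance search is needed.
c = chord.Chord(['A3', 'C4', 'E4'])
i = find_closest_chord(c)
print(i, i // 16, chord_types_string[i % 16]) # expected: root 9 (A), 'minor'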

In [10]:
def extract_features(parsed_list, idx):
    # Map each quarter-note window of the piece to the index of its closest known chord (-1 for rest).
    s = converter.thaw(parsed_list[idx])
    chords_by_offset = merge_chords(s)

    chord_sequence = []
    for i in range(len(chords_by_offset)):
        chord_sequence.append(find_closest_chord(chords_by_offset[i], CACHE))
    return chord_sequence

In [ ]:
with open('bach-parsed.txt', 'r') as f:
    FILES_BACH = [line.strip() for line in f.readlines()]
with open('beethoven-parsed.txt', 'r') as f:
    FILES_BEETHOVEN = [line.strip() for line in f.readlines()]
with open('debussy-parsed.txt', 'r') as f:
    FILES_DEBUSSY = [line.strip() for line in f.readlines()]
with open('scarlatti-parsed.txt', 'r') as f:
    FILES_SCARLATTI = [line.strip() for line in f.readlines()]
with open('victoria-parsed.txt', 'r') as f:
    FILES_VICTORIA = [line.strip() for line in f.readlines()]
    
for composer, files in [('bach', FILES_BACH), ('beethoven', FILES_BEETHOVEN), ('debussy', FILES_DEBUSSY),
                        ('scarlatti', FILES_SCARLATTI), ('victoria', FILES_VICTORIA)]:
    with open('{}-chordsequence.txt'.format(composer), 'a') as f:
        for i in range(len(files)):
            f.write(str(extract_features(files, i))+'\n')

In [ ]:
# Additional feature set: extract the durations of chords, notes, and rests
def find_length_add_to_list(cnr, out_list):
    try:
        out_list.append(cnr.duration.fullName)
    except:
        # fall back to the raw quarter-length value if no descriptive full name is available
        out_list.append(str(cnr.duration.quarterLength))

def extract_cnr_duration(piece):
    # Serialize the duration of every chord, note, and rest in the piece as 'type|duration', joined by ';'.
    s = converter.thaw(piece).flat
    chords, notes, rests = [], [], []
    for c in s.getElementsByClass(chord.Chord):
        find_length_add_to_list(c, chords)
    for n in s.getElementsByClass(note.Note):
        find_length_add_to_list(n, notes)
    for r in s.getElementsByClass(note.Rest):
        find_length_add_to_list(r, rests)
    elements = ['chord|'+d for d in chords] + ['note|'+d for d in notes] + ['rest|'+d for d in rests]
    return ';'.join(elements)
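
In [ ]:
# A quick look at the duration strings that end up in the 'type|duration' elements: the helper prefers
# the human-readable duration name and falls back to the numeric quarter length. Values here are
# illustrative only.
lengths = []
find_length_add_to_list(note.Note('C4', quarterLength=1.5), lengths)
find_length_add_to_list(note.Rest(quarterLength=1.0), lengths)
print(lengths) # e.g. ['Dotted Quarter', 'Quarter']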

In [ ]:
for composer, files in [('bach', FILES_BACH), ('beethoven', FILES_BEETHOVEN), ('debussy', FILES_DEBUSSY),
                        ('scarlatti', FILES_SCARLATTI), ('victoria', FILES_VICTORIA)]:
    with open('{}-durations.txt'.format(composer), 'a') as f:
        for piece in files:
            f.write(extract_cnr_duration(piece)+'\n')