In [1]:
import re
import glob

# remove octaves (comma, capitalization), dashes, apostrophes, left round bracket
def clean_abc(text):
    text = text.lower().strip()
    text = re.sub("[,\-\(']", '', text)
    text = text.replace("\\", '')
    text = re.sub('x:1', '', text)
    text = re.sub('k:d', '', text)
    text = re.sub('[ |]', '', text)
    return text

In [2]:
text = ""
for line in open('input.txt'):
    text = text + "" + line.rstrip()
text = clean_abc(text)
melodies = text.split('%')[1:]

In [3]:
abc_dict = {}
for filename in glob.glob('*.abc'):
    text = ""
    for line in open(filename):
        text = text + "" + line.rstrip()    
    text = clean_abc(text)
    abc_dict[filename] = text
print len(abc_dict)


1

In [41]:
def isnote(string):
    return string.isalpha()
    
def good_start(string):
    return not(string.isdigit()) and not(string == '/')

# Add all melodies of a given length into a given dictionary
def hash_melodies(string, length):
    if len(string) < length:
        return    
    
    substrings = []

    
    cumulative = {}
    start = {}
    found = 0    
    # For each character index
    for index in xrange(0, len(string)):        
        # Grab the character
        character = string[index]
        
        # If it's a note
        if isnote(character):
            # Add a starting index, since notes can start
            start[index] = found
            
            # Increment found and record where it was found
            found = found + 1
            cumulative[found] = index
            
        # Otherwise, check if its a valid start and add if so
        elif good_start(character):
            start[index] = found
                
    # For each start position, find how far you need to go (if possible) to find a cumulative gain of length
    # Note that if a start is a note itself, it won't be counted due to ordering above
    for start_position in start:
        if start[start_position] + length in cumulative:
            substrings.append(string[start_position:cumulative[start[start_position] + length]+1])
            
    return set(substrings)

In [42]:
song_substrings = []
heuristic_length = 10
for melody in melodies:
    song_substrings.append(hash_melodies(melody, heuristic_length))
    
my_song_substrings = None
for song in abc_dict:
    my_song_substrings = hash_melodies(abc_dict[song], heuristic_length)
print my_song_substrings


set(['m:6/8d2dd2dd3d2de2e', ':6/8d2dd2dd3d2de2ee', 'a2fd3d2f/2g/2abaf', 'agffdfg2fed', 'ee2dc2de2ad2ef', 'd3d2a/2g/2fadf2ed', 'fgabagffdf', 'fd3d2f/2g/2abafd', 'abbagfgaag', 'e3efga2fg2ef2g', 'df2ed2aadfe3e', 'd2dd3d2de2ee2dc', 'gfgaagfgfg', 'cd2dd2dd3d2de2e', 'ba2fd3d2f/2g/2aba', 'dfabagabfg', 'd2ef2ff3fede2b', 'adf2ed2aadfe', 'agfgfge2gfg', 'bagfededcd', 'ga3fddadfag', 'dfg2fedefdb', 'fede2ba2fd3d2f', 'gfgaagfgab', 'fga2fg2ef2ga3f', 'afdfabagab', 'fg2ef2ga3fdda', 'f2ff3fede2ba2f', 'd3d2de2ee2dc2de', 'badfagabba', 'fedefdba2aa', 'fgaagfgaba', 'ed2aadfe3efg', 'f/2g/2abafdfab', 'ededcd2dd2dd', 'g/2fadf2ed2aad', 'abagffdfg2f', 'd2a/2g/2fadf2ed2a', 'e2gfgaagfga', 'ddadfagfba', 'dfagabbagf', 'e2ba2fd3d2f/2g/2ab', 'fededcd2dd2d', 'abagabfgag', 'g/2abafdfaba', 'agabfgagec', 'g/2abadfagab', 'cd3d2a/2g/2fadf2e', 'ga2fg2ef2ga3fd', 'adfagfbagf', 'de2ee2dc2de2ad', 'gabagffdfg', 'f3fede2ba2fd3d', 'ff3fede2ba2fd', 'adfagabbag', 'fd3d2f/2g/2abadf', 'bagabfgage', 'agfededcd2d', 'gaagfgfge2g', 'd2de2ee2dc2de2a', 'f2ga3fddadfa', 'fge2gfgaagf', 'ecd3d2a/2g/2fadf', 'a/2g/2fadf2ed2aa', 'gabbagfgaa', 'fg2fedefdba', 'bagffdfg2fe', 'gaagfgabag', 'aagfgfge2gf', 'bfgagecd3d2a', 'ef2ga3fddadf', 'd2f/2g/2abafdfa', 'ede2ba2fd3d2f/2g', 'fdfg2fedefd', 'ge2gfgaagfg', 'fgagecd3d2a/2g', 'dadfagfbag', 'agecd3d2a/2g/2fa', 'de2ad2ef2ff3fe', 'agabbagfga', 'a2fg2ef2ga3fdd', 'gfgfge2gfga', 'abafdfabag', 'e2dc2de2ad2ef2f', 'gfge2gfgaag', 'd2dd2dd3d2de2ee', 'fdfabagabf', 'fbagfededc', 'e2ad2ef2ff3fed', 'f2ed2aadfe3ef', 'a2fd3d2f/2g/2abad', 'd3d2f/2g/2abadfa', 'fgaagfgfge', 'g2fedefdba2a', 'dedcd2dd2dd3d', 'efga2fg2ef2ga', 'ef2ff3fede2ba', 'edcd2dd2dd3d2d', 'gfededcd2dd', 'edefdba2aa2c', 'gffdfg2fede', 'gagecd3d2a/2g/2f', 'fgfge2gfgaa', 'fagfbagfed', 'a3fddadfagf', 'dc2de2ad2ef2ff', 'defdba2aa2cd', 'agfgaagfgf', 'bafdfabaga', 'gabfgagecd', 'ffdfg2fedef', 'adfe3efga2fg', 'agfgabagff', 'fe3efga2fg2ef', 'bbagfgaagf', 'f/2g/2abadfaga', 'fadf2ed2aadf', 'd2aadfe3efga', 'dfagfbagfe', 'd3d2f/2g/2abafdf', 'dd3d2de2ee2dc2d', 'c2de2ad2ef2ff3f', 'e2ee2dc2de2ad2e', 'g2ef2ga3fddad', 'ad2ef2ff3fede', 'fabagabfga', 'dfe3efga2fg2e', 'dcd2dd2dd3d2de', 'de2ba2fd3d2f/2g/2a', 'aadfe3efga2f', 'abfgagecd3d', 'aagfgabagf', 'gfgabagffd', 'd2f/2g/2abadfag', 'gfbagfeded', 'dd2dd3d2de2ee2d', 'fagabbagfg', 'abadfagabb', 'efdba2aa2cd3d', 'agfbagfede', 'gecd3d2a/2g/2fad', 'fddadfagfb', 'bagfgaagfg'])

In [44]:
def plagerism_check(song, song_substrings):
    compare = None
    compare_list = []
    for compare_song in song_substrings:
        overlap = compare_song.intersection(song)
        if compare == None:
            compare = overlap
        else:
            compare = compare.union(overlap)
        compare_list.append(overlap)
    return compare, compare_list

In [45]:
similarity, similiarity_by_song = plagerism_check(my_song_substrings, song_substrings)

In [50]:
for index in range(0, len(similiarity_by_song)):
    overlap = similiarity_by_song[index]
    if len(overlap) > 1:
#         for part in overlap:
#             print part
#         print '-----------'        
        print index, len(overlap)
        print '----------------'
        print melodies[index]
        print '----------------'


9 15
----------------
f/2e/2d2adfed3a2b=cdccge=c3e2^cd2adfed3a2gfdfe=ced3d2f/2g/2abafdfa3=c3g=cgeceg3b3adafdfa3g3fdfeced3d2f/2g/2abafdfabafdfgage=cegage=ceabafdfabagabfgagecd3d2
----------------
19 17
----------------
adeffedbdbafddfab2aceee2adeffedbdbafddfab2aaddd2efgaagfgabbagfgaagfgfge2gfgaagfgabbagfgaefgfddd2
----------------
28 12
----------------
f/2e/2d2abdad3a2b=cdccge=c3e2cd2abdad3a2gfdfe=cededd2f/2g/2abafdfa3=c3g=cgeceg3b3adafdff3a2gfdfecededdfgabafdfabafdfgage=cegage=ceabafdfabagabfgagecd3d2
----------------
84 8
----------------
f/2g/2a2aa2fedefdba2aagfgfgefga2aa2fedefdba2aa2gfddd2d/2e/2fgaagfgabbagfgaagfgfgefga3f3edefdba2aa2gfddd2
----------------
96 4
----------------
adedf2ad2fecag2bf2ae2fgfededf2ad2fecabgfedcd3d2f/2g/2a2fd2fa2aagfg2ec2ea2ggfef2dg2ea2fbagfededcd3dfgagffedadfagfgfeedcacegfefedgfeagfbagfededcd3d2
----------------