In [29]:
import cPickle as pickle
import os
import re
import sqlite3

import nltk
import numpy as np
from nltk.stem import PorterStemmer

In [3]:
# Root of the Million Song Dataset. Defaults to the location used when this
# notebook was written; set the MSD_DIR environment variable to point at a
# different copy without editing the notebook.
MSD_DIR = os.environ.get('MSD_DIR', u'/q/boar/boar-p9/MillionSong/')
# Sub-directories of the dataset root.
MSD_LFM_ROOT = os.path.join(MSD_DIR, 'Lastfm')
MSD_ADD = os.path.join(MSD_DIR, 'AdditionalFiles')

In [4]:
# Track metadata database; resolved relative to the current working directory.
md_dbfile = 'track_metadata.db'
# Last.fm files that live under the Last.fm sub-directory of the dataset:
# the tag database and the text file listing unique tags.
uniq_tag_f = os.path.join(MSD_LFM_ROOT, 'unique_tags.txt')
tags_dbfile = os.path.join(MSD_LFM_ROOT, 'lastfm_tags.db')

In [5]:
# Shamelessly borrowed from elsewhere (NOTE: the source reference is missing here)
def getVocab(dbc):
    """Return every tag stored in the ``tags`` table as a list.

    Parameters
    ----------
    dbc : sqlite3.Connection
        Open connection to a database that has a ``tags`` table with a
        ``tag`` column.

    Returns
    -------
    list
        The tag values in the order the query yields them.
    """
    cur = dbc.cursor()
    cur.execute('''SELECT tag FROM tags''')
    # Each row is a 1-tuple; unpack it so the list holds bare tag values.
    return [term for (term,) in cur]

def getTrackRows(dbc):
    """Map each track id in the ``tids`` table to its 1-based position.

    The position is simply the order in which the query returns the rows,
    counted from 1. If a track id appears more than once, the last position
    wins.

    Parameters
    ----------
    dbc : sqlite3.Connection
        Open connection to a database that has a ``tids`` table with a
        ``tid`` column.

    Returns
    -------
    dict
        ``{track_id: position}``.
    """
    cursor = dbc.cursor()
    cursor.execute('''SELECT tid FROM tids''')
    return dict((track, position)
                for position, (track,) in enumerate(cursor, 1))

In [6]:
# Load the tag vocabulary and the track -> row-number mapping from the tag
# database. Used as a context manager, a sqlite3 connection only commits or
# rolls back the transaction; it does not close the connection. Close it
# explicitly once both lookups are in memory.
dbc = sqlite3.connect(tags_dbfile)
try:
    vocab = getVocab(dbc)
    tid = getTrackRows(dbc)
finally:
    dbc.close()

In [7]:
def tid_to_dir(base_dir, tid, ext='.h5'):
    """Build the path of a track's file under ``base_dir``.

    Characters 2, 3 and 4 of the track id (zero-based) each become one
    directory level, and the file name is the full id followed by ``ext``.

    Parameters
    ----------
    base_dir : str
        Directory the sharded tree hangs off.
    tid : str
        Track id.
    ext : str, optional
        File extension appended to the id (default ``'.h5'``).

    Returns
    -------
    str
        ``base_dir/<c2>/<c3>/<c4>/<tid><ext>``.
    """
    shard = '/'.join(tid[2:5])
    filename = tid + ext
    return os.path.join(base_dir, shard, filename)

def sanitize(tag):
    """Reduce a raw tag to a compact canonical key.

    Steps, in order:
      1. lower-case the tag and tokenize it with ``nltk.word_tokenize``;
      2. stem every token with the module-level ``stemmer`` and join the
         stems with single spaces;
      3. replace each ``&`` or `` n `` (an "n" between spaces) with ``and``;
      4. drop every run of non-word characters and underscores.

    Relies on the module-level ``stemmer`` being defined before the call.
    """
    tokens = nltk.word_tokenize(tag.lower())
    stemmed = ' '.join([stemmer.stem(piece) for piece in tokens])
    with_and = re.sub('(&| n )', 'and', stemmed)
    return re.sub(r'(\W|_)+', '', with_and)

In [8]:
# Raw tags that are excluded from the vocabulary. The match against this
# tuple is exact and case-sensitive.
# NOTE(review): the tail of this list appears to have been truncated in this
# copy of the notebook; restore any missing entries from the original.
filtered_tags = (# favorite/like/love/blabla
                 'favorites', 'Favorite', 'Favourites', 'favourite', 'favorite songs', 'Favourite Songs', 'favorite song',
                 'songs i love', 'lovedbybeyondwithin', 'Love it', 'love at first listen', 'fav', 'my favorite', 'top 40',
                 'songs I absolutely love', 'favs', 'My Favorites', 'Favorite Artists', 'All time favourites',
                 'personal favourites', 'favouritestreamable', 'favorite tracks', 'Favorite Bands', 'like it',
                 'I love this song', 'rex ferric faves', 'love to death', 'my gang 09', 'My Favourites',
                 'BeatbabeBop selection', 'I Like It', 'newbest', 'top', 'IIIIIIIIII AMAZING TRACK :D IIIIIIIIII',
                 'best songs of the 80s', 'LOVE LOVE LOVE', 'i love it', 'most loved',
                 'favorite by this group', 'amayzes loved', 'DJPMan-loved-tracks', 'best of 2008', 'loved',
                 'Makes Me Smile', '77davez-all-tracks', 'My pop music', 'best songs ever', 'favorite by this singer',
                 'I like', 'my music', 'Soundtrack Of My Life', 'UK top 40', 'Like',
                 'malloy2000 playlist - top songs - classical to metal', 'loved tracks',
                 'top artists', 'all time favorites', 'best songs of the 00s', 'favourite tracks', 'Solomusika-Loved',
                 'all time faves', 'british i like', 'Jills Station', 'de todo mio favoritos', 'Faves', 'Fave',
                 'acclaimed music top 3000', 'top 2000', 'leapsandloved', 'Radiotsar approved',

                 # great/awesome/blabla
                 'kick ass', 'wonderful', 'excellent', 'Great Lyricists', 'badass', 'awesomeness', 'great song', 'Awesome',
                 'cool', 'amazing', 'good', 'nice', 'sweet', 'best', 'FUCKING AWESOME', 'lovely', 'Good Stuff', 'brilliant',
                 'feel good', 'perfect', 'all the best', 'cute', 'the best', '<3', 'interesting', 'feelgood', 'pretty',
                 'i feel good', 'good shit', 'good music', 'good song', 'great songs', 'yeah', 'best song ever', 'wow',
                 'worship', 'makes me happy', 'ok', 'damned good', 'underrated', 'Perfection', 'super',
                 # rating
                 '1', '3', '4', '5', '4 Stars', '3 stars', '4 Star', '3 star', '3-star',
                 # year
                 '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004',
                 '2005', '2006', '2007', '2008', '2009', '2010', '00s', '10s', '1950s', '1960s', '1970s', '1980s', '1990s',
                 '2000s', '20th Century', '21st century', "50's", '50s', "60's", '60s', '60s Gold', "70's", '70s', "80's",
                 '80s', '80s Pop', '80s rock', "90's", '90s', '90s Rock',
                 # descriptive
                 'songwriter', 'singer-songwriter', 'cover', 'covers', 'seen live', 'heard on Pandora',
                 'title is a full sentence', 'Retro', 'Miscellaneous', 'collection', 'billboard number ones', 'ost',
                 'cover song', 'singer songwriter', 'new', 'download', 'over 5 minutes long', 'Soundtracks',
                 'under two minutes', 'albums I own', 'cover songs', 'Radio', 'heard on last-fm', 'Soundtrack',
                 # I don't know what you are talking about
                 'buy', 'lol', 'us', 'other', '2giveme5', 'i am a party girl here is my soundtrack', 'names', 'Tag',
                 'check out', 'f', 'test', 'out of our heads', 'me', 'I want back to the 80s', '9 lbs hammer', 'yes',
                 'streamable track wants', 'aitch', 'slgdmbestof', 'gotanygoodmusic', 'Brems tagg radio', 'gh 3',
                 'Sousaphonic AOTM 201102', 'fH Projex', 'GH10', 'Ion B radio', 'ik ben', 'quarkzangsun v1',
                 )

In [9]:
# Maps each sanitized tag to the list of raw tags that collapse onto it.
stag_to_tag = dict()
# Module-level stemmer used by sanitize().
stemmer = PorterStemmer()

# we only pick the tags with >= 1000 counts, otherwise it's just too noisy
# e.g. "writing papers to pay for the college you have gotten into" has 13 counts
with open(uniq_tag_f, 'rb') as f:
    for line in f:
        try:
            # Each line is expected to be "<tag>\t<count>"; anything else
            # fails the unpacking or int() and is reported below.
            tag, count = line.strip().split('\t', 2)
            if int(count) >= 1000:
                if tag not in filtered_tags:
                    stag = sanitize(tag)
                    # Several raw spellings can share one sanitized form;
                    # keep all of them instead of overwriting.
                    stag_to_tag.setdefault(stag, []).append(tag)
            else:
                # since the file is ordered by count
                break
        except ValueError as e:
            print('The following line raises the error: %s' % e)
            # there is one line with no tag information, but with less than 1000 counts
            print(line)

In [10]:
# Sorted list of the sanitized tags (the keys of stag_to_tag); a tag's position
# in this list is what later cells use as its numeric id.
tags = sorted(stag_to_tag)

In [11]:


In [2]:
import json

# Replace stag_to_tag with the mapping stored in stag_to_tag.json
# (presumably written by an earlier run of this notebook - confirm).
with open('stag_to_tag.json') as json_file:
    stag_to_tag = json.load(json_file)

In [3]:

{u'131': [u'131'],
 u'acidjazz': [u'acid jazz'],
 u'acordgenial': [u'acordes geniales'],
 u'acoust': [u'acoustic'],
 u'acoustguitar': [u'acoustic guitar'],
 u'acoustrock': [u'Acoustic Rock'],
 u'addict': [u'addictive'],
 u'adrienwayn': [u'adrien wayne'],
 u'adultaltern': [u'Adult Alternative'],
 u'adultcontemporari': [u'adult contemporary'],
 u'africa': [u'africa'],
 u'african': [u'african'],
 u'aggress': [u'aggressive'],
 u'albumrock': [u'album rock'],
 u'allboutguitar': [u'allboutguitar'],
 u'altcountri': [u'Alt-country', u'alt country'],
 u'altern': [u'alternative'],
 u'alternmetal': [u'alternative metal'],
 u'alternpop': [u'alternative pop'],
 u'alternpunk': [u'Alternative  Punk'],
 u'alternrock': [u'alternative rock', u'Alternate Rock'],
 u'altrock': [u'alt rock'],
 u'ambient': [u'ambient'],
 u'american': [u'american'],
 u'americana': [u'americana'],
 u'angri': [u'angry'],
 u'anim': [u'animals'],
 u'annymix': [u'annymix'],
 u'anthem': [u'anthem'],
 u'arenarock': [u'arena rock'],
 u'artrock': [u'art rock'],
 u'asubtluseofvocalharmoni': [u'a subtle use of vocal harmony'],
 u'atmospher': [u'atmospheric', u'Atmosphere'],
 u'attitud': [u'attitude'],
 u'audioas': [u'audioase'],
 u'australian': [u'australian'],
 u'avantgard': [u'Avant-Garde', u'avantgarde'],
 u'avocalcentraesthet': [u'a vocal-centric aesthetic'],
 u'awesomguitarjam': [u'Awesome Guitar Jams'],
 u'ballad': [u'Ballad', u'ballads'],
 u'basicbass': [u'Basically Bass'],
 u'bass': [u'bass'],
 u'beat': [u'beats', u'beat'],
 u'beauti': [u'beautiful'],
 u'bebop': [u'bebop'],
 u'berlin': [u'Berlin'],
 u'bigband': [u'Big Band'],
 u'bigbeat': [u'big beat'],
 u'bittersweet': [u'Bittersweet'],
 u'black': [u'Black'],
 u'blackmetal': [u'black metal'],
 u'blingtacular': [u'blingtacular'],
 u'blue': [u'blues', u'blue'],
 u'bluegrass': [u'bluegrass'],
 u'bluerock': [u'blues rock'],
 u'bluesrock': [u'blues-rock'],
 u'bossanova': [u'Bossa Nova'],
 u'bounci': [u'bouncy'],
 u'brain2brain': [u'brain2brain'],
 u'brasil': [u'brasil'],
 u'brazil': [u'brazil'],
 u'brazilian': [u'brazilian'],
 u'break': [u'breaks'],
 u'breakbeat': [u'breakbeat'],
 u'bremtaggradio': [u'Brems Tagg radio'],
 u'british': [u'british'],
 u'britishrock': [u'british rock'],
 u'britpop': [u'britpop', u'brit pop'],
 u'britrock': [u'brit rock', u'Britrock'],
 u'brutal': [u'brutal'],
 u'brutaldeathmetal': [u'Brutal Death Metal'],
 u'california': [u'California'],
 u'calm': [u'calm', u'Calming'],
 u'canada': [u'canada'],
 u'canadian': [u'Canadian'],
 u'catchi': [u'catchy'],
 u'celestnostalgia': [u'celeste nostalgia'],
 u'celtic': [u'celtic'],
 u'chanson': [u'chanson'],
 u'chansonfrancais': [u'chanson francaise'],
 u'chill': [u'chill', u'chilled'],
 u'chillout': [u'chillout', u'chill out'],
 u'christian': [u'christian'],
 u'christianrock': [u'christian rock'],
 u'christma': [u'christmas'],
 u'cinemat': [u'cinematic'],
 u'classic': [u'classic', u'Classical', u'classics'],
 u'classicblue': [u'Classic Blues'],
 u'classiccountri': [u'classic country'],
 u'classicmetal': [u'classic metal'],
 u'classicrock': [u'classic rock'],
 u'classicsoul': [u'classic soul'],
 u'closeharmoni': [u'close harmony'],
 u'club': [u'club'],
 u'comedi': [u'comedy'],
 u'contemporarichristian': [u'contemporary christian'],
 u'contemporariclassic': [u'contemporary classical'],
 u'contemporarigospeltag': [u'Contemporary Gospel Tag'],
 u'contemporarijazz': [u'contemporary jazz'],
 u'cooljazz': [u'cool jazz'],
 u'countri': [u'country'],
 u'countrirock': [u'country rock'],
 u'crazi': [u'crazy'],
 u'crossov': [u'crossover'],
 u'crow': [u'crowe'],
 u'danc': [u'dance', u'dancing'],
 u'danceabl': [u'danceable'],
 u'dancehal': [u'dancehall'],
 u'dancemania': [u'Dancemania'],
 u'dancepop': [u'dance-pop'],
 u'dancparti': [u'dance party'],
 u'dancpop': [u'dance pop'],
 u'dark': [u'dark'],
 u'darkambient': [u'dark ambient'],
 u'darkelectro': [u'dark electro'],
 u'darkwav': [u'darkwave'],
 u'death': [u'death'],
 u'deathcor': [u'deathcore'],
 u'deathmetal': [u'death metal'],
 u'deep': [u'deep'],
 u'deephous': [u'deep house'],
 u'deltablue': [u'delta blues'],
 u'depress': [u'depressing'],
 u'detroit': [u'detroit'],
 u'deutsch': [u'deutsch'],
 u'deutschrock': [u'Deutschrock'],
 u'disco': [u'Disco'],
 u'diva': [u'diva'],
 u'dj': [u'dj'],
 u'dnb': [u'dnb'],
 u'doom': [u'Doom'],
 u'doommetal': [u'doom metal'],
 u'doowop': [u'doo wop'],
 u'downbeat': [u'downbeat'],
 u'downtempo': [u'downtempo'],
 u'dramat': [u'dramatic'],
 u'dream': [u'Dream'],
 u'dreami': [u'Dreamy'],
 u'dreampop': [u'dream pop'],
 u'drive': [u'Driving'],
 u'drjazzmrfunkmus': [u'drjazzmrfunkmusic'],
 u'drone': [u'drone'],
 u'drug': [u'drugs'],
 u'drum': [u'drums'],
 u'drumandbass': [u'Drum and bass', u'Drum n Bass'],
 u'dub': [u'dub'],
 u'dubstep': [u'dubstep'],
 u'dutch': [u'dutch'],
 u'easi': [u'easy'],
 u'easilisten': [u'easy listening'],
 u'eastcoastrap': [u'east coast rap'],
 u'ebm': [u'ebm'],
 u'eddi': [u'eddie'],
 u'eighti': [u'eighties'],
 u'electro': [u'electro'],
 u'electroclash': [u'Electroclash'],
 u'electrohous': [u'electro house'],
 u'electron': [u'electronic'],
 u'electronica': [u'electronica'],
 u'electronicad': [u'ElectronicaDance'],
 u'electropop': [u'electropop', u'Electro Pop', u'electro-pop'],
 u'emo': [u'emo'],
 u'emocor': [u'emocore'],
 u'emot': [u'emotional', u'emotive'],
 u'emus': [u'emusic'],
 u'energet': [u'Energetic'],
 u'energi': [u'energy'],
 u'england': [u'england'],
 u'english': [u'english'],
 u'epic': [u'epic'],
 u'espanol': [u'Espanol'],
 u'essenti': [u'essentials'],
 u'ether': [u'ethereal'],
 u'ethnic': [u'ethnic'],
 u'eurod': [u'eurodance'],
 u'european': [u'european'],
 u'exceltune': [u'Excellent tune'],
 u'experiment': [u'experimental'],
 u'experimentrock': [u'Experimental Rock'],
 u'extensvamp': [u'extensive vamping'],
 u'fast': [u'fast'],
 u'femal': [u'female'],
 u'femalartist': [u'female artists'],
 u'femalfrontmetal': [u'Female fronted metal'],
 u'femalsinger': [u'female singers'],
 u'femalvocal': [u'female vocals', u'female vocal'],
 u'femalvocalist': [u'female vocalists', u'female vocalist'],
 u'femalvoic': [u'Female Voices'],
 u'finland': [u'finland'],
 u'finnish': [u'finnish'],
 u'finnishmetal': [u'finnish metal'],
 u'flamenco': [u'Flamenco'],
 u'flute': [u'flute'],
 u'folk': [u'folk'],
 u'folkmetal': [u'folk metal'],
 u'folkrock': [u'folk rock', u'folk-rock'],
 u'fon': [u'fon'],
 u'franc': [u'france'],
 u'francai': [u'francais'],
 u'freedom': [u'freedom'],
 u'french': [u'french'],
 u'friendsofthekingofrummelpop': [u'friendsofthekingofrummelpop'],
 u'fun': [u'fun'],
 u'funk': [u'funk'],
 u'funki': [u'funky'],
 u'funni': [u'funny'],
 u'fusion': [u'Fusion'],
 u'gangstarap': [u'Gangsta Rap'],
 u'garag': [u'garage'],
 u'garagrock': [u'Garage Rock'],
 u'geil': [u'geil'],
 u'geniu': [u'genius'],
 u'german': [u'german'],
 u'germani': [u'germany'],
 u'girlpower': [u'girl power'],
 u'gitarrenunterricht': [u'Gitarrenunterricht'],
 u'glam': [u'glam'],
 u'glammetal': [u'Glam Metal'],
 u'glamrock': [u'glam rock'],
 u'glitch': [u'glitch'],
 u'goa': [u'goa'],
 u'goldenoldi': [u'golden oldies'],
 u'goodbeat': [u'good beat'],
 u'goodmood': [u'good mood'],
 u'gorgeou': [u'gorgeous'],
 u'gospel': [u'gospel'],
 u'goth': [u'goth'],
 u'gothic': [u'Gothic'],
 u'gothicmetal': [u'Gothic Metal'],
 u'gothicrock': [u'Gothic Rock'],
 u'gothrock': [u'goth rock'],
 u'great': [u'great'],
 u'greatlyric': [u'great lyrics'],
 u'grindcor': [u'grindcore'],
 u'groov': [u'groove'],
 u'groovi': [u'groovy'],
 u'grung': [u'Grunge'],
 u'guiltipleasur': [u'Guilty Pleasures', u'guilty pleasure'],
 u'guitar': [u'guitar'],
 u'guitarhero': [u'Guitar Hero'],
 u'guitarsolo': [u'Guitar Solo'],
 u'guitarvirtuoso': [u'guitar virtuoso'],
 u'gutelaun': [u'gute laune'],
 u'hairmetal': [u'hair metal'],
 u'halftonsinglclub': [u'halftoned singles club'],
 u'halloween': [u'halloween'],
 u'handclap': [u'handclaps'],
 u'happi': [u'happy'],
 u'hard': [u'Hard'],
 u'hardcor': [u'hardcore'],
 u'hardcorpunk': [u'hardcore punk'],
 u'hardrock': [u'hard rock'],
 u'harmonica': [u'harmonica'],
 u'haunt': [u'haunting'],
 u'heartbreak': [u'heartbreak'],
 u'heavi': [u'heavy'],
 u'heavimetal': [u'heavy metal'],
 u'highschool': [u'High School'],
 u'hiphop': [u'Hip-Hop', u'hip hop', u'hiphop'],
 u'horn': [u'horns'],
 u'hot': [u'hot'],
 u'hous': [u'House'],
 u'humor': [u'humor'],
 u'hypnot': [u'hypnotic'],
 u'idm': [u'idm'],
 u'indi': [u'indie'],
 u'indietronica': [u'indietronica'],
 u'indifolk': [u'indie folk'],
 u'indipop': [u'indie pop'],
 u'indirock': [u'indie rock'],
 u'industri': [u'industrial'],
 u'industrimetal': [u'industrial metal'],
 u'industrirock': [u'industrial rock'],
 u'inspir': [u'inspirational', u'inspiring'],
 u'instrument': [u'instrumental'],
 u'instrumentjazztag': [u'Instrumental Jazz Tag'],
 u'instrumentrock': [u'instrumental rock'],
 u'intens': [u'intense'],
 u'irish': [u'irish'],
 u'italian': [u'italian'],
 u'italiana': [u'italiana'],
 u'jam': [u'Jam'],
 u'jamaica': [u'jamaica'],
 u'japanes': [u'japanese'],
 u'jazz': [u'jazz'],
 u'jazzfunk': [u'jazz funk'],
 u'jazzfusion': [u'jazz fusion'],
 u'jazzi': [u'jazzy'],
 u'jazzinstrument': [u'jazz instrumental'],
 u'jazzpiano': [u'jazz piano'],
 u'jazzrock': [u'Jazz Rock'],
 u'jazzvocal': [u'jazz vocal'],
 u'karlsruh': [u'Karlsruhe'],
 u'lacrimaindark': [u'lacrimaindarkness'],
 u'latenight': [u'late night'],
 u'latin': [u'latin'],
 u'latinjazz': [u'latin jazz'],
 u'latino': [u'latino'],
 u'latinpop': [u'latin pop'],
 u'latinrock': [u'Latin Rock'],
 u'legend': [u'legend'],
 u'life': [u'life'],
 u'light': [u'light'],
 u'lined': [u'linedance'],
 u'live': [u'live'],
 u'lofi': [u'Lo-Fi'],
 u'london': [u'london'],
 u'loneliafterdusk': [u'loneliness after dusk'],
 u'loud': [u'loud'],
 u'loung': [u'lounge'],
 u'love': [u'Love'],
 u'lovesong': [u'love songs', u'love song', u'lovesongs'],
 u'ls': [u'ls'],
 u'lush': [u'lush'],
 u'lyric': [u'lyrics'],
 u'magic': [u'magic'],
 u'majorkeytonal': [u'major key tonality'],
 u'male': [u'male'],
 u'malesing': [u'malesinger'],
 u'malevocal': [u'male vocals', u'male vocal'],
 u'malevocalist': [u'male vocalists', u'male vocalist'],
 u'masterpiec': [u'Masterpiece'],
 u'medit': [u'Meditation'],
 u'melanchol': [u'melancholic'],
 u'melancholi': [u'melancholy'],
 u'mellow': [u'Mellow'],
 u'melod': [u'melodic'],
 u'melodblackmetal': [u'melodic black metal'],
 u'meloddeathmetal': [u'Melodic Death Metal'],
 u'melodhardcor': [u'melodic hardcore'],
 u'melodmetal': [u'melodic metal'],
 u'melodrock': [u'melodic rock'],
 u'melodtranc': [u'melodic trance'],
 u'memori': [u'memories'],
 u'metal': [u'metal', u'metallis'],
 u'metalcor': [u'metalcore'],
 u'metrodowntempo': [u'metro downtempo'],
 u'metroelectronica': [u'metro electronica'],
 u'metrofunki': [u'metro funky'],
 u'metrojazz': [u'Metro Jazz'],
 u'mid': [u'mid'],
 u'mildrhythmicsyncop': [u'mild rhythmic syncopation'],
 u'minim': [u'minimal'],
 u'minimtechno': [u'minimal techno'],
 u'minorkeytonal': [u'minor key tonality'],
 u'mod': [u'mod'],
 u'modernrock': [u'modern rock'],
 u'moodi': [u'moody'],
 u'morn': [u'morning'],
 u'motown': [u'motown'],
 u'mpb': [u'mpb'],
 u'music': [u'music'],
 u'musicspirit': [u'musicspirit'],
 u'musictofallasleepto': [u'music to fall asleep to'],
 u'neosoul': [u'Neo-Soul', u'Neo Soul'],
 u'newage': [u'new age'],
 u'newromant': [u'new romantic'],
 u'newwave': [u'new wave'],
 u'newyork': [u'new york'],
 u'night': [u'night'],
 u'ninjatune': [u'ninja tune'],
 u'nois': [u'noise'],
 u'noisrock': [u'noise rock'],
 u'northernsoul': [u'northern soul'],
 u'norwegian': [u'norwegian'],
 u'nostalg': [u'nostalgic'],
 u'nostalgia': [u'nostalgia'],
 u'nujazz': [u'nu jazz', u'nu-jazz'],
 u'numet': [u'Nu-metal'],
 u'numetal': [u'Nu Metal'],
 u'nyc': [u'NYC'],
 u'oi': [u'Oi'],
 u'oldfavorit': [u'old favorites'],
 u'oldi': [u'oldies'],
 u'oldschool': [u'old school'],
 u'oldschoolsoul': [u'Old School soul'],
 u'oldskool': [u'old skool'],
 u'parti': [u'party'],
 u'partimusic': [u'party music'],
 u'peac': [u'peaceful'],
 u'piano': [u'piano'],
 u'pianorock': [u'piano rock'],
 u'play': [u'Playful'],
 u'poetri': [u'poetry'],
 u'polish': [u'polish'],
 u'polit': [u'political'],
 u'pop': [u'pop'],
 u'poplife': [u'Pop Life'],
 u'poppunk': [u'pop punk', u'Pop-punk'],
 u'poprock': [u'pop rock', u'Pop-Rock', u'poprock'],
 u'popular': [u'popular'],
 u'posit': [u'positive'],
 u'postgrung': [u'post-grunge'],
 u'posthardcor': [u'post-hardcore', u'post hardcore'],
 u'postpunk': [u'post-punk', u'Post punk'],
 u'postrock': [u'post-rock', u'post rock'],
 u'power': [u'powerful', u'power'],
 u'powerballad': [u'Power ballad'],
 u'powermetal': [u'Power metal'],
 u'powerpop': [u'power pop', u'powerpop'],
 u'prda': [u'prda'],
 u'prog': [u'prog'],
 u'progress': [u'Progressive'],
 u'progressdeathmetal': [u'progressive death metal'],
 u'progresshous': [u'Progressive House'],
 u'progressiv': [u'progressiv'],
 u'progressmetal': [u'Progressive metal'],
 u'progressrock': [u'Progressive rock'],
 u'progresstranc': [u'progressive trance'],
 u'progrock': [u'prog rock'],
 u'protopunk': [u'proto-punk'],
 u'psychedel': [u'psychedelic'],
 u'psychedelrock': [u'Psychedelic Rock'],
 u'psychil': [u'psychill'],
 u'psychobilli': [u'psychobilly'],
 u'psytranc': [u'psytrance'],
 u'punk': [u'punk'],
 u'punkfavorit': [u'Punk Favorites'],
 u'punkrock': [u'punk rock', u'punkrock'],
 u'q3': [u'q3'],
 u'quiet': [u'quiet'],
 u'quietstorm': [u'quiet storm'],
 u'quirki': [u'quirky'],
 u'rain': [u'rain'],
 u'rainiday': [u'Rainy Day'],
 u'randb': [u'r&b', u'r and b'],
 u'rap': [u'rap'],
 u'rapcor': [u'rapcore'],
 u'rave': [u'Rave'],
 u'rb': [u'RB'],
 u'reflect': [u'Reflective'],
 u'regga': [u'reggae'],
 u'relax': [u'relax', u'relaxing', u'relaxed'],
 u'remix': [u'remix'],
 u'repetitmelodphrase': [u'repetitive melodic phrasing'],
 u'rhythmandblue': [u'rhythm and blues'],
 u'rhythumandbluetag': [u'rhythum and blues tag'],
 u'rnb': [u'rnb'],
 u'rock': [u'rock'],
 u'rockabilli': [u'rockabilly'],
 u'rockandroll': [u'rock n roll', u'Rock and Roll'],
 u'rockballad': [u'rock ballad'],
 u'rockenespanol': [u'Rock en Espanol'],
 u'rockin': [u'rockin'],
 u'rockpop': [u'RockPop'],
 u'rockroll': [u'Rock  Roll'],
 u'rocksteadi': [u'rocksteady'],
 u'romanc': [u'romance'],
 u'romant': [u'romantic'],
 u'romantica': [u'Romantica'],
 u'root': [u'roots'],
 u'rootregga': [u'roots reggae'],
 u'sad': [u'sad'],
 u'salsa': [u'salsa'],
 u'samba': [u'samba'],
 u'sanfrancisco': [u'san francisco'],
 u'sax': [u'sax'],
 u'saxophon': [u'saxophone'],
 u'scandinavian': [u'scandinavian'],
 u'scottish': [u'Scottish'],
 u'screamo': [u'screamo'],
 u'seattl': [u'seattle'],
 u'sensual': [u'sensual'],
 u'sentiment': [u'Sentimental'],
 u'sex': [u'sex'],
 u'sexi': [u'sexy'],
 u'shoegaz': [u'shoegaze'],
 u'silentintens': [u'silent intensity'],
 u'singalong': [u'sing along', u'singalong'],
 u'singer': [u'singer'],
 u'sixti': [u'sixties'],
 u'ska': [u'ska'],
 u'skapunk': [u'ska punk'],
 u'sleek': [u'sleek'],
 u'sleep': [u'Sleep'],
 u'slgdm': [u'slgdm'],
 u'slordig': [u'slordig'],
 u'slow': [u'slow'],
 u'slowjam': [u'slow jams'],
 u'sludg': [u'Sludge'],
 u'smooth': [u'smooth'],
 u'smoothjazz': [u'Smooth Jazz'],
 u'soft': [u'soft'],
 u'softrock': [u'soft rock'],
 u'solx': [u'solx'],
 u'somafm': [u'somafm'],
 u'sommer': [u'Sommer'],
 u'sooth': [u'soothing'],
 u'soul': [u'soul', u'soulful'],
 u'soulandrnbclassic': [u'soul and rnb classics'],
 u'soultag': [u'soul tag'],
 u'soundstorm': [u'sound storm'],
 u'southernrock': [u'Southern Rock'],
 u'space': [u'space'],
 u'spacerock': [u'space rock'],
 u'spanish': [u'spanish'],
 u'spanishrock': [u'Spanish Rock'],
 u'special': [u'special'],
 u'speed': [u'speed'],
 u'speedmetal': [u'speed metal'],
 u'spiritu': [u'spiritual'],
 u'spokenword': [u'spoken word'],
 u'spring': [u'spring'],
 u'stoner': [u'stoner'],
 u'stonerrock': [u'Stoner Rock'],
 u'stonesoup': [u'stonesoup'],
 u'stream': [u'stream'],
 u'string': [u'strings'],
 u'summer': [u'summer'],
 u'sunday': [u'sunday'],
 u'sunni': [u'sunny'],
 u'suomi': [u'Suomi'],
 u'suomipop': [u'SuomiPop'],
 u'suomirock': [u'suomirock'],
 u'surf': [u'Surf'],
 u'sweden': [u'Sweden'],
 u'swedish': [u'swedish'],
 u'swedishmetal': [u'Swedish Metal'],
 u'swing': [u'swing'],
 u'symphonmetal': [u'symphonic metal'],
 u'symphonrock': [u'Symphonic Rock'],
 u'synth': [u'synth'],
 u'synthpop': [u'synthpop', u'synth pop'],
 u'tantotempotast': [u'tantotempotaste'],
 u'techhous': [u'tech house'],
 u'technicdeathmetal': [u'Technical Death Metal'],
 u'techno': [u'techno'],
 u'temazo': [u'temazo'],
 u'texa': [u'texas'],
 u'thrash': [u'thrash'],
 u'thrashmetal': [u'thrash metal'],
 u'torquemada': [u'Torquemada'],
 u'tranc': [u'trance'],
 u'triphop': [u'trip-hop', u'trip hop'],
 u'trippi': [u'trippy'],
 u'trumpet': [u'trumpet'],
 u'twee': [u'twee'],
 u'uk': [u'UK'],
 u'underground': [u'underground'],
 u'undergroundhiphop': [u'underground hip-hop', u'underground hip hop'],
 u'upbeat': [u'upbeat'],
 u'uplift': [u'Uplifting'],
 u'uplifttranc': [u'uplifting trance'],
 u'urban': [u'urban'],
 u'usa': [u'USA'],
 u'vikemetal': [u'viking metal'],
 u'violin': [u'violin'],
 u'vocal': [u'vocal', u'vocals', u'vocalization'],
 u'vocalhous': [u'vocal house'],
 u'vocaljazz': [u'vocal jazz'],
 u'vocaltranc': [u'vocal trance'],
 u'warm': [u'warm'],
 u'weird': [u'weird'],
 u'westcoast': [u'west coast'],
 u'winter': [u'winter'],
 u'work': [u'work'],
 u'workout': [u'Workout'],
 u'world': [u'world'],
 u'worldfusion': [u'world fusion'],
 u'worldmusic': [u'World Music'],
 u'xma': [u'xmas']}

In [12]:
# Sanitized tag -> its zero-based position in `tags`.
voc_to_num = {tag: index for index, tag in enumerate(tags)}

In [14]:
def getArtistTracks(cur, aid):
    """Yield the track id of every row in ``songs`` belonging to one artist.

    Parameters
    ----------
    cur : sqlite3.Cursor
        Cursor on a database that has a ``songs`` table with ``track_id``
        and ``artist_id`` columns. The rows are read lazily from this
        cursor, so it must not be reused for another query until the
        generator is exhausted.
    aid : str
        Artist id to look up.

    Yields
    ------
    track_id values, one per matching row.
    """
    # Bind the artist id as a parameter instead of formatting it into the
    # SQL text, so ids containing quotes cannot break or alter the query.
    cur.execute("SELECT track_id FROM songs WHERE artist_id = ?", (aid,))
    # Iterate the cursor that was passed in (not a global one).
    for (track, ) in cur:
        yield track
def getValidTrackTags(cur, track, tid, vocab, voc_to_num):
    """Collect the in-vocabulary tags of one track as ``{tag_number: value}``.

    Parameters
    ----------
    cur : sqlite3.Cursor
        Cursor on a database that has a ``tid_tag`` table with ``tid``,
        ``tag`` and ``val`` columns.
    track : str
        Track id; must be a key of ``tid``.
    tid : dict
        Track id -> value stored in the ``tid`` column of ``tid_tag``.
    vocab : sequence
        Raw tag strings, indexed by ``tag - 1`` (the ``tag`` column is
        treated as 1-based).
    voc_to_num : dict
        Sanitized tag -> tag number.

    Returns
    -------
    dict
        Tag number -> value. Raw tags that sanitize to the same key have
        their values summed, capped at 100. Tags whose sanitized form is not
        in ``voc_to_num`` are ignored.
    """
    # Bind the row id as a parameter rather than formatting it into the SQL.
    cur.execute("SELECT tag, val FROM tid_tag WHERE tid = ? AND val > 0",
                (tid[track],))
    out = {}
    for (tag, val) in cur:
        stag = sanitize(vocab[tag-1])
        if stag not in voc_to_num:
            # Filtered-out or low-count tag: not part of the vocabulary.
            continue
        num = voc_to_num[stag]
        if num in out:
            # Another spelling of the same tag was already seen: accumulate
            # the values, capping the total at 100.
            out[num] = min(100, out[num] + float(val))
        else:
            out[num] = float(val)
    return out

def numberize(infile, outfile, cur_md, cur_td, tid, vocab, voc_to_num):
    """Write the numeric tag representation of every track by the listed artists.

    ``infile`` holds one artist id per line. For each artist, the tracks are
    looked up through ``cur_md``; each track present in ``tid`` has its tags
    fetched through ``cur_td``. Tracks with at least one valid tag produce one
    output line of the form ``<track>\\t<num>:<val> <num>:<val> ...`` with
    values printed to one decimal place.

    Parameters
    ----------
    infile : str
        Path of the artist-id list to read.
    outfile : str
        Path of the file to write (overwritten).
    cur_md : sqlite3.Cursor
        Cursor passed to ``getArtistTracks``.
    cur_td : sqlite3.Cursor
        Cursor passed to ``getValidTrackTags``.
    tid, vocab, voc_to_num
        Lookup tables forwarded to ``getValidTrackTags``.
    """
    with open(infile, 'rb') as fr, open(outfile, 'wb') as fw:
        for line in fr:
            aid = line.strip()
            for track in getArtistTracks(cur_md, aid):
                if track not in tid:
                    # No row for this track in the tag lookup; skip it.
                    continue
                out = getValidTrackTags(cur_td, track, tid, vocab, voc_to_num)
                if len(out) != 0:
                    fw.write('%s\t%s\n' % (track, ' '.join('%d:%.1f' % pair for pair in out.items())))

In [15]:
# turn the whole MSD tags to numbers
# A sqlite3 connection used as a context manager only commits or rolls back;
# it never closes the connection. Close both connections explicitly.
conn_md = sqlite3.connect(md_dbfile)
try:
    conn_td = sqlite3.connect(tags_dbfile)
    try:
        cur_md = conn_md.cursor()
        cur_td = conn_td.cursor()
        # artists_train.txt and artists_test.txt can be obtained from
        # (NOTE(review): the source reference is missing here)
        numberize('artists_train.txt', 'tracks_tag_train.num', cur_md, cur_td, tid, vocab, voc_to_num)
        numberize('artists_test.txt', 'tracks_tag_test.num', cur_md, cur_td, tid, vocab, voc_to_num)
    finally:
        conn_td.close()
finally:
    conn_md.close()

In [36]:
def densify_and_save(infile, ncol):
    """Expand each sparse tag line of ``infile`` into a dense row and save it.

    Every line has the form ``<track>\\t<k>:<v> <k>:<v> ...``. For each line
    a length-``ncol`` ``int16`` vector is built with ``row[k] = v`` and saved
    with ``np.save`` to ``vq_hist/<c2>/<c3>/<c4>/<track>_BoT`` (``np.save``
    appends ``.npy``), where ``c2..c4`` are characters 2-4 of the track id.

    Parameters
    ----------
    infile : str
        Path of the sparse ``.num`` file to read.
    ncol : int
        Length of the dense vector; every key must be smaller than this.

    Raises
    ------
    AssertionError
        If the target directory for a track does not already exist.
    """
    # The file holds plain text, so read it in text mode.
    with open(infile, 'r') as fr:
        for line in fr:
            fields = line.split('\t', 2)
            track = fields[0].strip()
            tdir = os.path.join('vq_hist', '/'.join(track[2:5]))
            # this folder should already exist
            assert os.path.exists(tdir)
            pairs = fields[-1].strip().split()
            # The row is int16, so fractional values are truncated on assignment.
            row = np.zeros((ncol, ), dtype=np.int16)
            for pair in pairs:
                key, val = pair.split(':')
                row[int(key)] = float(val)
            np.save(os.path.join(tdir, track + '_BoT'), row)

In [37]:
# Densify both splits; each dense row has one column per sanitized tag.
for num_file in ('tracks_tag_train.num', 'tracks_tag_test.num'):
    densify_and_save(num_file, len(tags))