In [1]:
import os, sys, time
import tarfile
import pickle as pkl

In [2]:
# Root folder for every dataset this notebook touches (relative to the notebook's working directory).
data_dir = 'data'

# Folder holding the per-letter MSD archives ('A.tar.gz', 'B.tar.gz', ...).
msd_dir = os.path.join(data_dir, 'msd/')

# Pickled dict mapping each song ID to the list of its track IDs.
fmap = os.path.join(data_dir, 'aotm-2011/songID2TrackID.pkl')

Data loading

Load song_id --> track_id mapping: a song may correspond to multiple tracks.


In [3]:
# Load the song ID -> [track ID, ...] mapping.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load mapping files that come from a trusted source.
# The context manager guarantees the file handle is closed (the bare
# `open(...)` previously passed to pkl.load was never closed explicitly).
with open(fmap, 'rb') as fd:
    song2TrackID = pkl.load(fd)

In [4]:
# Preview: the ten smallest song IDs (in sorted order) with their track lists.
dict((song_id, song2TrackID[song_id]) for song_id in sorted(song2TrackID)[:10])


Out[4]:
{'SOAAAFI12A6D4F9C66': ['TRZEXLQ128F1491D17'],
 'SOAAAMT12AB018C9C4': ['TRYIOYF12903CD4E73'],
 'SOAAASR12AB018A516': ['TRRTOHC12903CDD2EA'],
 'SOAABHX12AAF3B40E7': ['TRZARKN128F92DE096'],
 'SOAABLG12A6D4F73D2': ['TRFUCDA128F1455C96'],
 'SOAABMP12A6D4F7633': ['TRQVPBD128F1458060'],
 'SOAABMR12A6D4F70E4': ['TRVMASA128F149BB53'],
 'SOAABQL12A67020E76': ['TRJOVXI128E0791CFA'],
 'SOAABRB12A58A792A3': ['TRZGOQN128F935F425'],
 'SOAACBE12A6D4F7A54': ['TRHRWOM128F1466747']}

In [5]:
# Merge the per-song track lists into one set (dropping duplicates),
# then sort so that tracks are ordered by ID.
trackIDs = sorted(set().union(*song2TrackID.values()))

In [6]:
# Number of distinct track IDs referenced by the mapping.
len(trackIDs)


Out[6]:
119953

In [7]:
# Show the first ten track IDs of the sorted list.
trackIDs[:10]


Out[7]:
['TRAAABD128F429CF47',
 'TRAAAHJ128F931194C',
 'TRAAAHZ128E0799171',
 'TRAAANK128F428B515',
 'TRAAAUR128F428B1FA',
 'TRAAAYL128F4271A5B',
 'TRAABHB12903CAFC2F',
 'TRAABIG128F9356C56',
 'TRAABJS128F9325C99',
 'TRAABPG128F14774DD']

In [8]:
%%script false
# TOO slow!
tar = None
flag = None
cnt = 0
for trackID in trackIDs:
    cnt += 1
    sys.stdout.write("\r%d / %d" % (cnt, len(trackIDs)))
    sys.stdout.flush()
    
    ftrack = trackID[2] + '/' + trackID[3] + '/' + trackID[4] + '/' + trackID + '.h5'
    
    # practical solution: generate bash commands
    # workflow: extract .tar.gz -> move files-of-interest -> remove all extracted files; repeat this.
    #fnew = 'files/' + trackID + '.h5'
    #if os.path.exists(os.path.join(data_dir, fnew)): continue
    #print('mv', ftrack, fnew)
    
    fnew = os.path.join(data_dir, 'files/' + trackID + '.h5')
    ftar = os.path.join(msd_dir, trackID[2] + '.tar.gz')
    
    if os.path.exists(fnew): continue
    
    if flag is None or flag != trackID[2]:
        flag = trackID[2]
        if tar is not None:
            tar.close()    
        tar = tarfile.open(name=ftar, mode='r:gz')
        
    if tar is None:
        tar = tarfile.open(name=ftar, mode='r:gz')
    
    fdr = tar.extractfile(ftrack)
    with open(fnew, 'wb') as fdw:
        fdw.write(fdr.read())
    fdr.close()

In [16]:
# practical solution: generate bash commands
# workflow: extract .tar.gz -> move files-of-interest -> remove all extracted files; repeat this.
#
# The generated script names the archives and the 'files/' target with paths
# relative to its own location, so it is assumed to be run from the directory
# it is written to (NOTE(review): confirm), e.g.
#     cd <data_dir>/msd && bash extract_aotm2011.sh

trackIDs = sorted(trackIDs)  # the per-archive grouping below relies on sorted order
cnt = 0
flag = None  # letter (trackID[2]) of the archive the script has currently extracted
fscript = os.path.join(data_dir, 'msd/extract_aotm2011.sh')
script_dir = os.path.dirname(fscript)
with open(fscript, 'w') as fd:
    # `mv` below fails if the destination folder does not exist yet.
    fd.write('mkdir -p files\n')

    for trackID in trackIDs:
        cnt += 1
        if cnt % 100 == 0:
            sys.stdout.write("\r%d / %d" % (cnt, len(trackIDs))); sys.stdout.flush()

        fnew = 'files/' + trackID + '.h5'
        # `fnew` is relative to the script's directory, so test for it there
        # rather than relative to the notebook's working directory.
        if os.path.exists(os.path.join(script_dir, fnew)): continue
        # Path of the track inside the extracted tree: <c2>/<c3>/<c4>/<trackID>.h5
        ftrack = trackID[2] + '/' + trackID[3] + '/' + trackID[4] + '/' + trackID + '.h5'

        if flag is None or flag != trackID[2]:
            if flag is not None:
                # Done with the previous archive: drop its extracted tree.
                fd.write('rm -rf %s/ \n' % flag)
            flag = trackID[2]
            fd.write('echo "extracting %s"\n' % flag)
            fd.write('tar xzf %s.tar.gz\n' % flag)

        fd.write('mv %s %s\n' % (ftrack, fnew))

    # Clean up the tree of the last archive too; inside the loop `rm -rf` is
    # only emitted when switching to the next archive.
    if flag is not None:
        fd.write('rm -rf %s/ \n' % flag)


119900 / 119953

In [ ]: