In [1]:
import os, sys, time
import tarfile
import pickle as pkl
In [2]:
# Root directory for all data used by this notebook; the other paths hang off it.
data_dir = 'data'

# msd_dir: directory where the extraction cells look for the `<letter>.tar.gz` archives.
# fmap:    pickle file holding the song ID -> track ID(s) mapping.
msd_dir, fmap = [
    os.path.join(data_dir, relative_path)
    for relative_path in ('msd/', 'aotm-2011/songID2TrackID.pkl')
]
Load the `song_id --> track_id` mapping: a song may correspond to multiple tracks.
In [3]:
# Load the pickled song ID -> track ID(s) mapping. The context manager closes the
# file handle as soon as unpickling finishes instead of leaving it open.
# NOTE: unpickling can execute arbitrary code -- only load mapping files that come
# from a trusted source.
with open(fmap, 'rb') as fd:
    song2TrackID = pkl.load(fd)
In [4]:
# Peek at the mapping: show the entries for the first 10 keys in sorted order.
dict((song_id, song2TrackID[song_id]) for song_id in sorted(song2TrackID)[:10])
Out[4]:
In [5]:
# Flatten every per-song collection of track IDs into one set (dropping duplicates),
# then sort it so later cells see a deterministic order.
trackIDs = sorted(set().union(*song2TrackID.values()))
In [6]:
# Number of distinct track IDs referenced by the mapping.
len(trackIDs)
Out[6]:
In [7]:
# Preview the first 10 track IDs of the sorted list.
trackIDs[:10]
Out[7]:
In [8]:
%%script false
# TOO slow!
tar = None
flag = None
cnt = 0
for trackID in trackIDs:
cnt += 1
sys.stdout.write("\r%d / %d" % (cnt, len(trackIDs)))
sys.stdout.flush()
ftrack = trackID[2] + '/' + trackID[3] + '/' + trackID[4] + '/' + trackID + '.h5'
# practical solution: generate bash commands
# workflow: extract .tar.gz -> move files-of-interest -> remove all extracted files; repeat this.
#fnew = 'files/' + trackID + '.h5'
#if os.path.exists(os.path.join(data_dir, fnew)): continue
#print('mv', ftrack, fnew)
fnew = os.path.join(data_dir, 'files/' + trackID + '.h5')
ftar = os.path.join(msd_dir, trackID[2] + '.tar.gz')
if os.path.exists(fnew): continue
if flag is None or flag != trackID[2]:
flag = trackID[2]
if tar is not None:
tar.close()
tar = tarfile.open(name=ftar, mode='r:gz')
if tar is None:
tar = tarfile.open(name=ftar, mode='r:gz')
fdr = tar.extractfile(ftrack)
with open(fnew, 'wb') as fdw:
fdw.write(fdr.read())
fdr.close()
In [16]:
# practical solution: generate bash commands
# workflow: extract .tar.gz -> move files-of-interest -> remove all extracted files; repeat this.
#
# Every path written into the script is relative (`<letter>.tar.gz`, `<letter>/...`,
# `files/...`), so the script has to be run from the directory it is written to,
# e.g. `cd data/msd && bash extract_aotm2011.sh`.
trackIDs = sorted(trackIDs)  # sorted order keeps tracks with equal leading characters adjacent
cnt = 0
flag = None  # trackID[2] of the archive whose commands are currently being emitted
fscript = os.path.join(data_dir, 'msd/extract_aotm2011.sh')
script_dir = os.path.dirname(fscript)
with open(fscript, 'w') as fd:
    # The `mv` targets live in files/; make sure it exists before the first move.
    fd.write('mkdir -p files\n')
    for trackID in trackIDs:
        cnt += 1
        if cnt % 100 == 0:
            sys.stdout.write("\r%d / %d" % (cnt, len(trackIDs)))
            sys.stdout.flush()

        fnew = 'files/' + trackID + '.h5'
        # Skip tracks the script has already moved into place. The check is made
        # relative to the script's directory because that is where `fnew` resolves
        # when the script runs.
        # NOTE(review): assumes the script is executed from `script_dir` -- confirm.
        if os.path.exists(os.path.join(script_dir, fnew)):
            continue

        # Path of the track inside the extracted archive tree.
        ftrack = trackID[2] + '/' + trackID[3] + '/' + trackID[4] + '/' + trackID + '.h5'

        if flag is None or flag != trackID[2]:
            if flag is not None:
                # Finished with the previous archive: delete its extracted tree.
                fd.write('rm -rf %s/ \n' % flag)
            flag = trackID[2]
            fd.write('echo "extracting %s"\n' % flag)
            fd.write('tar xzf %s.tar.gz\n' % flag)
        fd.write('mv %s %s\n' % (ftrack, fnew))

    # Clean up the tree of the LAST archive too; inside the loop a removal is only
    # emitted when the archive character changes, so the final one would be left behind.
    if flag is not None:
        fd.write('rm -rf %s/ \n' % flag)
In [ ]: