In [1]:
%pylab inline
# pylab supplies np (numpy) and the bare plotting names (hist, figure,
# imshow, colorbar, cm) used in the cells below
import cPickle as pickle
import itertools
import json
import math
import operator
import os

import scipy.sparse

import hdf5_getters
import HartiganOnline, VectorQuantizer
from joblib import Parallel, delayed
In [2]:
MSD_DIR = u'/q/boar/boar-p9/MillionSong/'
MSD_DATA_ROOT = os.path.join(MSD_DIR, 'data')
MSD_LFM_ROOT = os.path.join(MSD_DIR, 'Lastfm')
MSD_ADD = os.path.join(MSD_DIR, 'AdditionalFiles')
In [3]:
# get all the tracks with non-NaN hotttnesss
def get_all_song_hotttnesss(msd_dir, ext='.h5'):
    track_to_hotttnesss = dict()
    msd_data_root = os.path.join(msd_dir, 'data')
    # unique_tracks.txt lines: track_ID<SEP>song_ID<SEP>artist<SEP>title
    with open(os.path.join(msd_dir, 'AdditionalFiles', 'unique_tracks.txt'), 'rb') as f:
        for (count, line) in enumerate(f):
            track_ID, _, _, _ = line.strip().split('<SEP>')
            # MSD layout: data/<char 3>/<char 4>/<char 5>/<track_ID>.h5
            track_path = os.path.join(msd_data_root, '/'.join(track_ID[2:5]), track_ID + ext)
            h5 = hdf5_getters.open_h5_file_read(track_path)
            hotttnesss = hdf5_getters.get_song_hotttnesss(h5)
            h5.close()
            if not math.isnan(hotttnesss):
                track_to_hotttnesss[track_ID] = hotttnesss
            if not count % 1000:
                print "%7d tracks processed" % count
    return track_to_hotttnesss
In [4]:
if os.path.exists('track_to_hotttnesss.json'):
    with open('track_to_hotttnesss.json', 'rb') as f:
        track_to_hotttnesss = json.load(f)
else:
    track_to_hotttnesss = get_all_song_hotttnesss(MSD_DIR)
    with open('track_to_hotttnesss.json', 'wb') as f:
        json.dump(track_to_hotttnesss, f)
In [5]:
# see some track-hotttnesss pairs, sampled across the sorted list
track_to_hotttnesss_ordered = sorted(track_to_hotttnesss.iteritems(),
                                     key=operator.itemgetter(1), reverse=True)
for i in xrange(0, 50000, 1000):
    track_ID, hotttnesss = track_to_hotttnesss_ordered[i]
    out = !grep "$track_ID" "$MSD_ADD"/unique_tracks.txt
    print out[0].strip().split('<SEP>')[2:4], 'Hotttnesss:', hotttnesss
In [6]:
# and see how the hotttnesss scores are distributed
hist(track_to_hotttnesss.values(), bins=20)
pass  # suppress the text repr of the return value
In [7]:
def get_tracks(filename):
    tracks = list()
    with open(filename, 'rb') as f:
        for line in f:
            tracks.append(line.split('\t')[0].strip())
    return tracks
In [8]:
# these 2 files are created in processLastfmTags.ipynb
train_tracks = get_tracks('tracks_tag_train.num')
test_tracks = get_tracks('tracks_tag_test.num')
In [9]:
train_track_to_hotttnesss = dict((track, track_to_hotttnesss[track])
                                 for track in train_tracks if track in track_to_hotttnesss)
In [10]:
hist(train_track_to_hotttnesss.values(), bins=20)
pass
In [11]:
# randomly select 24000 non-zero-hotttnesss tracks and 1000 zero-hotttnesss tracks
# from the training split to build the codebook on
np.random.seed(98765)
tracks_nzhotttnesss = np.random.choice([t for t in train_track_to_hotttnesss
                                        if train_track_to_hotttnesss[t] != 0.0],
                                       size=24000, replace=False)
tracks_zhotttnesss = np.random.choice([t for t in train_track_to_hotttnesss
                                       if train_track_to_hotttnesss[t] == 0.0],
                                      size=1000, replace=False)
tracks_VQ = np.hstack((tracks_nzhotttnesss, tracks_zhotttnesss))
In [12]:
def data_generator(msd_data_root, tracks, shuffle=True, ext='.h5'):
    # stream per-track timbre (MFCC-like) feature matrices, optionally
    # shuffling both the track order and the frame order in place
    if shuffle:
        np.random.shuffle(tracks)
    for track_ID in tracks:
        track_path = os.path.join(msd_data_root, '/'.join(track_ID[2:5]), track_ID + ext)
        h5 = hdf5_getters.open_h5_file_read(track_path)
        mfcc = hdf5_getters.get_segments_timbre(h5)
        h5.close()
        if shuffle:
            np.random.shuffle(mfcc)
        yield mfcc
In [13]:
def build_codewords(msd_data_root, tracks, cluster=None, n_clusters=2, max_iter=10, random_state=None):
    if type(random_state) is int:
        np.random.seed(random_state)
    elif random_state is not None:
        np.random.setstate(random_state)
    if cluster is None:
        cluster = HartiganOnline.HartiganOnline(n_clusters=n_clusters)
    for i in xrange(max_iter):
        print 'Iteration %d: passing through the data...' % (i + 1)
        for d in data_generator(msd_data_root, tracks):
            cluster.partial_fit(d)
    return cluster
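HartiganOnline is a local module not shown in this notebook. As a reference point, here is a minimal sketch of the online Hartigan-style k-means update it is assumed to implement, using the scikit-learn-style partial_fit / cluster_centers_ interface the cell above relies on; the real module may differ in details:

class HartiganSketch(object):
    """Minimal online Hartigan k-means sketch (assumed behavior, not the real module)."""
    def __init__(self, n_clusters=2, n_features=12):
        self.n_clusters = n_clusters
        self.cluster_centers_ = np.zeros((n_clusters, n_features))
        self.counts_ = np.zeros(n_clusters)

    def partial_fit(self, X):
        for x in X:
            # pick the cluster whose cost increases least when x joins it:
            # (n_j / (n_j + 1)) * ||x - c_j||^2
            d2 = ((x - self.cluster_centers_) ** 2).sum(axis=1)
            j = np.argmin(self.counts_ / (self.counts_ + 1.0) * d2)
            # move the winning center toward x with step size 1 / (n_j + 1)
            self.counts_[j] += 1
            self.cluster_centers_[j] += (x - self.cluster_centers_[j]) / self.counts_[j]
        return self

The n_j / (n_j + 1) weighting makes empty clusters free to claim points, so all K centers get seeded during the first pass before refinement takes over.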
In [ ]:
# learn a K-entry codebook over the timbre features (3 passes through the data)
K = 512
cluster = build_codewords(MSD_DATA_ROOT, tracks_VQ, n_clusters=K, max_iter=3, random_state=98765)
In [17]:
figure(figsize=(22, 4))
imshow(cluster.cluster_centers_.T, cmap=cm.PuOr_r, aspect='auto', interpolation='nearest')
colorbar()
Out[17]: [image: heatmap of the learned codebook, 12 timbre dimensions by K=512 codewords]
In [ ]:
with open('Codebook_K%d_Hartigan.cPickle' % K, 'wb') as f:
    pickle.dump(cluster, f)
In [36]:
with open('Codebook_K%d_Hartigan.cPickle' % K, 'rb') as f:
    cluster = pickle.load(f)
vq = VectorQuantizer.VectorQuantizer(clusterer=cluster)
# wire the quantizer straight to the learned codebook
vq.center_norms_ = 0.5 * (vq.clusterer.cluster_centers_ ** 2).sum(axis=1)
vq.components_ = vq.clusterer.cluster_centers_
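VectorQuantizer is likewise a local module. The two attributes set above suggest that transform() scores each frame against the codebook via the linear expansion of squared distance, then hard-assigns it to the nearest codeword, so that summing over frames yields a codeword histogram. A minimal sketch of that assumed behavior (vq_transform_sketch is illustrative, not the module's API):

def vq_transform_sketch(X, components, center_norms):
    # X.dot(c) - 0.5 * ||c||^2 differs from -0.5 * ||x - c||^2 only by a
    # per-row constant, so the row-wise argmax picks the nearest codeword
    scores = X.dot(components.T) - center_norms
    onehot = np.zeros((X.shape[0], components.shape[0]), dtype=np.int16)
    onehot[np.arange(X.shape[0]), scores.argmax(axis=1)] = 1
    return onehot

This would also explain the 0.5 factor baked into center_norms_ above.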
In [136]:
def quantize_and_save(vq, K, msd_data_root, track_ID):
    track_path = os.path.join(msd_data_root, '/'.join(track_ID[2:5]), track_ID + '.h5')
    h5 = hdf5_getters.open_h5_file_read(track_path)
    mfcc = hdf5_getters.get_segments_timbre(h5)
    h5.close()
    # histogram of codeword assignments over all frames of the track
    vq_hist = vq.transform(mfcc).sum(axis=0).astype(np.int16)
    tdir = os.path.join('vq_hist', '/'.join(track_ID[2:5]))
    if not os.path.exists(tdir):
        os.makedirs(tdir)
    np.save(os.path.join(tdir, track_ID + '_K%d' % K), vq_hist)
In [ ]:
# quantize all train and test tracks in parallel
n_jobs = 5
Parallel(n_jobs=n_jobs)(delayed(quantize_and_save)(vq, K, MSD_DATA_ROOT, track_ID)
                        for track_ID in itertools.chain(train_tracks, test_tracks))
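Each track's histogram can later be read back from the mirrored directory layout; a usage sketch (the track ID below is just a placeholder, and np.save appends the .npy extension):

track_ID = 'TRXXXXX12903CXXXXX'  # placeholder ID for illustration
vq_hist = np.load(os.path.join('vq_hist', '/'.join(track_ID[2:5]),
                               track_ID + '_K%d.npy' % K))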