In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os, sys, time
import pickle as pkl
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, f1_score, make_scorer, label_ranking_loss
from scipy.sparse import lil_matrix, issparse
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
sys.path.append('src')
from TopPushMLC import TopPushMLC
from evaluate import evaluatePrecision, evalPred
In [3]:
data_dir = 'data'
faotm = os.path.join(data_dir, 'aotm-2011/aotm-2011-subset.pkl')
fmap = os.path.join(data_dir, 'aotm-2011/songID2TrackID.pkl')
ftag = os.path.join(data_dir, 'msd/msd_tagtraum_cd2c.cls')
In [4]:
fxtrain = os.path.join(data_dir, 'aotm-2011/XTrain_tag.pkl')
fytrain = os.path.join(data_dir, 'aotm-2011/YTrain.pkl')
fxtest = os.path.join(data_dir, 'aotm-2011/XTest_tag.pkl')
fytest = os.path.join(data_dir, 'aotm-2011/YTest.pkl')
Load playlists.
In [5]:
playlists = pkl.load(open(faotm, 'rb'))
In [6]:
print('#Playlists: %d' % len(playlists))
In [7]:
playlists[0]
Out[7]:
In [8]:
#print('#Songs: %d' % len({songID for p in playlists for songID in p['filtered_lists'][0]}))
In [9]:
#lengths = [len(p['filtered_lists'][0]) for p in playlists]
lengths = [len(sl) for sl in playlists]
plt.hist(lengths, bins=20)
print('Average playlist length: %.1f' % np.mean(lengths))
Load the song_id --> track_id mapping: a song may correspond to multiple tracks.
In [10]:
song2TrackID = pkl.load(open(fmap, 'rb'))
In [11]:
{ k : song2TrackID[k] for k in list(song2TrackID.keys())[:10] }
Out[11]:
Load song tags and build the track_id --> tag mapping.
In [12]:
track2Tags = dict()
In [13]:
with open(ftag) as f:
    for line in f:
        if line[0] == '#': continue
        tid, tag = line.strip().split('\t')
        #print(tid, tag)
        track2Tags[tid] = tag
In [14]:
print('#(Track, Tag): %d' % len(track2Tags))
In [15]:
{ k : track2Tags[k] for k in list(track2Tags.keys())[:10] }
Out[15]:
Use the subset of playlists in which the first song (i.e. the seed song) has at least one tag.
In [16]:
subset_ix = []
In [17]:
seedSong2Tag = { }
for ix in range(len(playlists)):
    # the list of song IDs in the playlist
    #songIDs = playlists[ix]['filtered_lists'][0]
    songIDs = playlists[ix]
    # seed song
    seedSongID = songIDs[0]
    seedTrackIDs = song2TrackID[seedSongID]
    # a song can have multiple tracks, make sure that at least one track for the song has a tag
    flag = [trackID in track2Tags for trackID in seedTrackIDs]
    if not np.any(flag):
        continue
    #seedSong2Tag[playlists[ix]['mix_id']] = [track2Tags[seedTrackIDs[i]] for i in range(len(flag)) if flag[i] is True]
    seedSong2Tag[playlists[ix][0]] = [track2Tags[seedTrackIDs[i]] for i in range(len(flag)) if flag[i]]
    subset_ix.append(ix)
In [18]:
#seedSong2Tag
In [19]:
playlists_subset = [playlists[ix] for ix in subset_ix]
In [20]:
print('#Playlists used: %d' % len(subset_ix))
In [21]:
playlists_subset[0]
Out[21]:
The set of unique songs: in multilabel learning, each song in this set corresponds to one label.
In [22]:
song_set = sorted({songID for p in playlists_subset for songID in p})
In [23]:
print('#Songs used: %d' % len(song_set))
In [24]:
print(song_set[:10])
Most playlists contain fewer than 10 songs; the most common playlist length is 2 songs.
In [25]:
playlist_lengths = [len(p) for p in playlists_subset]
plt.hist(playlist_lengths, bins=20)
print('Average playlist length: %.1f' % np.mean(playlist_lengths))
Indicator of tags: tag --> index mapping.
In [26]:
# the set of unique tags
tag_set = sorted(set(track2Tags.values()))
In [27]:
print('#Tags: %d' % len(tag_set))
In [28]:
tag_indicator = { tag: ix for ix, tag in enumerate(tag_set) }
In [29]:
tag_indicator
Out[29]:
Build features (one-hot encoding of tags) for a song given its song_id.
In [30]:
def gen_features(song_id, song2TrackID=song2TrackID, tag_indicator=tag_indicator):
    """
    Generate a one-hot feature vector (over tags) for a given song ID.
    """
    features = np.zeros(len(tag_set), dtype=float)
    trackIDs = song2TrackID[song_id]
    cnt = 0
    for trackID in trackIDs:
        if trackID in track2Tags:
            cnt += 1
            tag = track2Tags[trackID]
            tag_ix = tag_indicator[tag]
            features[tag_ix] = 1
    # the song must have at least one tag, otherwise its feature vector is useless
    assert cnt >= 1
    return features
In [31]:
def gen_feature_map(song_id, seed):
    """
    Generate the feature map for a given (label, query) pair
    """
    #return gen_features(song_id) - gen_features(seed) # feature map
    return gen_features(seed) # a trivial feature map
In [32]:
def gen_training_set(playlists=playlists_subset, song_set=song_set):
    """
    Create the labelled dataset.
    Input:
        - playlists: the playlists to create features for
        - song_set: the songs that define the label space
    Output:
        - (Feature, Label) pair (X, Y) with one row per playlist:
          X holds the feature map of each playlist's seed song,
          Y indicates, for each song in song_set, whether it appears in the respective playlist.
    """
    N = len(playlists)
    D = len(tag_set)
    K = len(song_set)
    X = np.zeros((N, D), dtype=float)
    #Y = np.zeros((N, K), dtype=np.int)
    #Y = coo_matrix(([0], ([0],[0])), shape=(N, K), dtype=np.int8).tolil()
    Y = lil_matrix((N, K), dtype=np.int8)
    for i in range(N):
        playlist = playlists[i]
        seed = playlist[0]
        X[i, :] = gen_feature_map(None, seed)
        Y[i, :] = [int(sid in playlist) for sid in song_set]
    return X, Y.tocsr()
In [33]:
gen_feature_map(song_set[100], playlists_subset[0][0])
Out[33]:
Train a logistic regression model for each label (one-vs-rest, i.e. binary relevance).
In [34]:
if np.all([os.path.exists(fname) for fname in [fxtrain, fytrain, fxtest, fytest]]):
    X_train = pkl.load(open(fxtrain, 'rb'))
    Y_train = pkl.load(open(fytrain, 'rb'))
    X_test = pkl.load(open(fxtest, 'rb'))
    Y_test = pkl.load(open(fytest, 'rb'))
else:
    X, Y = gen_training_set()
    # by fixing the random seed, the same playlists end up in the test set each time
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=31)
    pkl.dump(X_train, open(fxtrain, 'wb'))
    pkl.dump(Y_train, open(fytrain, 'wb'))
    pkl.dump(X_test, open(fxtest, 'wb'))
    pkl.dump(Y_test, open(fytest, 'wb'))
In [35]:
X_train.shape
Out[35]:
In [36]:
Y_train.shape
Out[36]:
In [37]:
#clf = OneVsRestClassifier(LogisticRegression(verbose=1))
#clf.fit(X_train, Y_train)
#pkl.dump(clf, open(os.path.join(data_dir, 'aotm-2011/br-base.pkl'), 'wb'))
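The binary-relevance baseline loaded further below (br-base.pkl) was produced by the commented-out cell above. As a minimal sketch, here is the same training step guarded by a file check, so the model is only re-fit when the pickle is missing; the fclf path variable is introduced just for this sketch, and fitting one logistic regression per song label can be slow.
In [ ]:
fclf = os.path.join(data_dir, 'aotm-2011/br-base.pkl')
if not os.path.exists(fclf):
    # binary relevance: one independent logistic regression per label (song)
    clf = OneVsRestClassifier(LogisticRegression(verbose=1))
    clf.fit(X_train, Y_train)
    pkl.dump(clf, open(fclf, 'wb'))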
In [69]:
def print_results(predictor, X_train, Y_train, X_test, Y_test):
    """
    Compute and print performance results.
    """
    p3_train = []
    p5_train = []
    pk_train = []
    p10_train = []
    p3_test = []
    p5_test = []
    pk_test = []
    p10_test = []
    rankloss_train = []
    rankloss_test = []
    N_train = X_train.shape[0]
    batch_size = 500
    N_batch_train = int((N_train-1) / batch_size) + 1
    for i in range(N_batch_train):
        ix0 = i * batch_size
        ix1 = min((i+1) * batch_size, N_train)
        preds = predictor.decision_function(X_train[ix0:ix1])
        evaldict = evaluatePrecision(Y_train[ix0:ix1].toarray(), preds, verbose=-1)
        size = ix1 - ix0
        p3_train.append(evaldict['Precision@3'][0] * size)
        p5_train.append(evaldict['Precision@5'][0] * size)
        pk_train.append(evaldict['Precision@K'][0] * size)
        p10_train.append(evaldict['Precision@10'][0] * size)
        #rankloss_train.append(evalPred1(Y_train[i].toarray()[0], pred, metricType='Ranking'))
        sys.stdout.write('\r%d / %d' % (i+1, N_batch_train)); sys.stdout.flush()
    print()
    N_test = X_test.shape[0]
    N_batch_test = int((N_test-1) / batch_size) + 1
    for i in range(N_batch_test):
        ix0 = i * batch_size
        ix1 = min((i+1) * batch_size, N_test)
        preds = predictor.decision_function(X_test[ix0:ix1])
        evaldict = evaluatePrecision(Y_test[ix0:ix1].toarray(), preds, verbose=-1)
        size = ix1 - ix0
        p3_test.append(evaldict['Precision@3'][0] * size)
        p5_test.append(evaldict['Precision@5'][0] * size)
        pk_test.append(evaldict['Precision@K'][0] * size)
        p10_test.append(evaldict['Precision@10'][0] * size)
        #rankloss_test.append(evalPred1(Y_test[i].toarray()[0], pred, metricType='Ranking'))
        sys.stdout.write('\r%d / %d' % (i+1, N_batch_test)); sys.stdout.flush()
    print()
    print('Training set:')
    print('Precision@3:', np.sum(p3_train) / N_train)
    print('Precision@5:', np.sum(p5_train) / N_train)
    print('Precision@k:', np.sum(pk_train) / N_train)
    print('Precision@10:', np.sum(p10_train) / N_train)
    print()
    print('Test set:')
    print('Precision@3:', np.sum(p3_test) / N_test)
    print('Precision@5:', np.sum(p5_test) / N_test)
    print('Precision@k:', np.sum(pk_test) / N_test)
    print('Precision@10:', np.sum(p10_test) / N_test)
    #print()
    #print('Training set:')
    #print('RankingLoss: %.1f, %.1f' % (np.mean(rankloss_train), np.std(rankloss_train) / N_train))
    #print()
    #print('Test set:')
    #print('RankingLoss: %.1f, %.1f' % (np.mean(rankloss_test), np.std(rankloss_test) / N_test))
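For reference, a toy sketch of the Precision@k metric reported above: the fraction of the k highest-scoring labels that are true positives. This assumes evaluatePrecision in the local evaluate module follows that usual definition; the helper below is illustrative only, not the module's actual code.
In [ ]:
def precision_at_k(y_true, scores, k):
    # fraction of the k top-scoring labels that are positive (usual definition, assumed here)
    topk = np.argsort(-scores)[:k]
    return np.mean(y_true[topk])

# toy example: 3 of the 5 highest-scoring labels are positives, so P@5 = 0.6
y_toy = np.array([1, 0, 1, 0, 1, 0, 0, 1])
s_toy = np.array([0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2])
precision_at_k(y_toy, s_toy, 5)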
In [100]:
def print_dataset_info(X_train, Y_train, X_test, Y_test):
    N_train, D = X_train.shape
    K = Y_train.shape[1]
    N_test = X_test.shape[0]
    print('%-45s %s' % ('Number of training examples:', '{:,}'.format(N_train)))
    print('%-45s %s' % ('Number of test examples:', '{:,}'.format(N_test)))
    print('%-45s %s' % ('Number of features:', '{:,}'.format(D)))
    print('%-45s %s' % ('Number of labels:', '{:,}'.format(K)))
    avgK_train = np.mean(np.sum(Y_train, axis=1))
    avgK_test = np.mean(np.sum(Y_test, axis=1))
    print('%-45s %.3f (%.2f%%)' % ('Average number of positive labels (train):', avgK_train, 100 * avgK_train / K))
    print('%-45s %.3f (%.2f%%)' % ('Average number of positive labels (test):', avgK_test, 100 * avgK_test / K))
    #print('%-45s %.4f%%' % ('Average label occurrence (train):', np.mean(np.sum(Y_train, axis=0)) / N_train))
    #print('%-45s %.4f%%' % ('Average label occurrence (test):', np.mean(np.sum(Y_test, axis=0)) / N_test))
    print('%-45s %.3f%%' % ('Sparsity (percent) (train):', 100 * np.sum(Y_train) / np.prod(Y_train.shape)))
    print('%-45s %.3f%%' % ('Sparsity (percent) (test):', 100 * np.sum(Y_test) / np.prod(Y_test.shape)))
In [101]:
print_dataset_info(X_train, Y_train, X_test, Y_test)
In [70]:
clf = pkl.load(open(os.path.join(data_dir, 'aotm-2011/br-base.pkl'), 'rb'))
print_results(clf, X_train, Y_train, X_test, Y_test)
In [83]:
y1 = clf.decision_function(X_train[:1])[0]
In [84]:
Y_train[0].toarray().nonzero()
Out[84]:
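As a quick sanity check on the two cells above, one can compare the classifier's highest-scoring labels against the songs that actually appear in the first training playlist; a small illustrative snippet using the y1 scores computed above:
In [ ]:
top10 = np.argsort(-y1)[:10]                  # labels ranked highest by the classifier
true_ix = Y_train[0].toarray().nonzero()[1]   # labels (songs) present in the playlist
print('Top-10 scored labels:', top10)
print('True labels:', true_ix)
print('Overlap:', len(set(top10) & set(true_ix)))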
In [98]:
clf2 = TopPushMLC(C=1, r=1)
clf2.fit_SGD(X_train, Y_train, batch_size=500, n_epochs=10, learning_rate=0.01)
In [99]:
print_results(clf2, X_train, Y_train, X_test, Y_test)
In [ ]: