In [123]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os, sys, time
import pickle as pkl
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, f1_score, make_scorer, label_ranking_loss
from scipy.sparse import lil_matrix, issparse
import matplotlib.pyplot as plt
import seaborn as sns
In [156]:
sys.path.append('src')
from TopPushMLC import TopPushMLC
from evaluate import evaluatePrecision, evalPred, avgPrecisionK
from BinaryRelevance import BinaryRelevance
In [3]:
data_dir = 'data'
faotm = os.path.join(data_dir, 'aotm-2011/aotm-2011-subset.pkl')
#fmap = os.path.join(data_dir, 'aotm-2011/songID2TrackID.pkl')
ffeature = os.path.join(data_dir, 'msd/songID2Features.pkl')
In [16]:
fx = os.path.join(data_dir, 'aotm-2011/X_audio.pkl')
fy = os.path.join(data_dir, 'aotm-2011/Y_audio.pkl')
fxtrain = os.path.join(data_dir, 'aotm-2011/X_train_audio.pkl')
fytrain = os.path.join(data_dir, 'aotm-2011/Y_train_audio.pkl')
fxtest = os.path.join(data_dir, 'aotm-2011/X_test_audio.pkl')
fytest = os.path.join(data_dir, 'aotm-2011/Y_test_audio.pkl')
Load playlists.
In [5]:
playlists = pkl.load(open(faotm, 'rb'))
In [6]:
print('#Playlists: %d' % len(playlists))
In [7]:
playlists[0]
Out[7]:
In [8]:
#print('#Songs: %d' % len({songID for p in playlists for songID in p['filtered_lists'][0]}))
In [9]:
#lengths = [len(p['filtered_lists'][0]) for p in playlists]
lengths = [len(sl) for sl in playlists]
plt.hist(lengths, bins=20)
print('Average playlist length: %.1f' % np.mean(lengths))
Load the song_id --> track_id mapping: a song may correspond to multiple tracks.
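The mapping file is not loaded in this run (the cells below are commented out); purely as an illustration, it is assumed to be a plain dict from one song ID to the list of its track IDs, e.g. {'&lt;songID&gt;': ['&lt;trackID_1&gt;', '&lt;trackID_2&gt;'], ...} (placeholder IDs, not real ones).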
In [ ]:
#song2TrackID = pkl.load(open(fmap, 'rb'))
In [ ]:
#{ k : song2TrackID[k] for k in list(song2TrackID.keys())[:10] }
Load the song_id --> feature-array mapping: each song is mapped to the audio features of one of its corresponding tracks in the MSD.
In [10]:
song2Features = pkl.load(open(ffeature, 'rb'))
The set of songs, which serves as the label set in this formulation.
In [11]:
song_set = sorted(song2Features.keys())
In [12]:
len(song_set)
Out[12]:
In [13]:
label_indices = {songID: ix for ix, songID in enumerate(song_set)}
In [14]:
list(label_indices.items())[:10]
Out[14]:
In [101]:
def gen_training_set(playlists=playlists, label_indices=label_indices, features=song2Features):
    """
    Create the labelled dataset from the given playlists.
    Input:
        - playlists: the playlists to create examples for
        - label_indices: a dictionary mapping a songID to the index of the corresponding label
        - features: a dictionary mapping a songID to its feature vector
    Output:
        - (Feature, Label) pair (X, Y) with one row per playlist:
          X holds the feature vector of each playlist's seed (first) song,
          Y holds indicators of whether each song appears in the respective playlist.
    """
    N = len(playlists)
    K = len(label_indices)
    X = []
    Y = lil_matrix((N, K), dtype=np.int8)
    for i in range(N):
        if (i + 1) % 10 == 0:
            sys.stdout.write('\r%d / %d' % (i + 1, N))
            sys.stdout.flush()
        playlist = playlists[i]
        seed = playlist[0]  # the first song is the seed/query song
        X.append(features[seed])
        # ignore songs that are not in the label set
        indices = [label_indices[s] for s in playlist if s in label_indices]
        Y[i, indices] = 1
    return np.array(X), Y.tocsr()
In [100]:
test_dict = {1: 0, 2: 1, 3: 2}
[test_dict[s] for s in [1, 2, 5] if s in test_dict]
Out[100]:
Train a logistic regression model for each label.
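As a reference for this per-label (binary relevance) setup, the sketch below shows how it could be done directly with scikit-learn's OneVsRestClassifier and LogisticRegression (already imported above). It uses the X_train / Y_train arrays produced in the next cell and is only an illustration; it is not the BinaryRelevance / TopPushMLC models evaluated later, and C=1.0 is an arbitrary choice.
In [ ]:
# Minimal sketch of binary relevance with scikit-learn (illustrative only;
# the models used below are tuned via GridSearchCV instead).
ovr = OneVsRestClassifier(LogisticRegression(C=1.0))
ovr.fit(X_train, Y_train)               # one logistic regression per song (label)
scores = ovr.decision_function(X_test)  # per-label scores used for ranking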
In [17]:
if np.all([os.path.exists(fname) for fname in [fxtrain, fytrain, fxtest, fytest]]):
    X_train = pkl.load(open(fxtrain, 'rb'))
    Y_train = pkl.load(open(fytrain, 'rb'))
    X_test = pkl.load(open(fxtest, 'rb'))
    Y_test = pkl.load(open(fytest, 'rb'))
else:
    X, Y = gen_training_set(playlists=playlists, label_indices=label_indices, features=song2Features)
    # by fixing the random seed, the same playlists end up in the test set each time
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=31)
    pkl.dump(X, open(fx, 'wb'))
    pkl.dump(Y, open(fy, 'wb'))
    pkl.dump(X_train, open(fxtrain, 'wb'))
    pkl.dump(Y_train, open(fytrain, 'wb'))
    pkl.dump(X_test, open(fxtest, 'wb'))
    pkl.dump(Y_test, open(fytest, 'wb'))
In [18]:
X_train.shape
Out[18]:
In [19]:
X_test.shape
Out[19]:
In [20]:
Y_train.shape
Out[20]:
In [21]:
Y_test.shape
Out[21]:
Inspect the shapes of the full dataset and the per-song popularity counts.
In [22]:
X.shape
Out[22]:
In [23]:
Y.shape
Out[23]:
In [84]:
songcnts = np.asarray(Y.sum(axis=0).tolist()[0])
In [85]:
np.max(songcnts)
Out[85]:
In [86]:
indices = np.argsort(songcnts)
In [87]:
indices = np.asarray(indices[::-1])
In [90]:
songcnts[indices[0]]
Out[90]:
In [89]:
songcnts[indices[-1]]
Out[89]:
In [92]:
size=1000
plt.plot(np.arange(size), songcnts[indices[:size]])
Out[92]:
In [95]:
type1_songs = np.asarray(song_set)[indices[:200]]
In [102]:
type1_song_features = {sid: song2Features[sid] for sid in type1_songs}
In [104]:
type1_song_label_indices = {sid: ix for ix, sid in enumerate(type1_songs)}
In [110]:
playlist_subset = [pl for pl in playlists if pl[0] in type1_song_label_indices]
In [136]:
X1, Y1 = gen_training_set(playlists=playlist_subset, label_indices=type1_song_label_indices, \
features=type1_song_features)
Filter out playlists containing only one song.
In [137]:
Y1 = Y1.toarray()
In [138]:
ind1 = Y1.sum(axis=1) > 1
In [139]:
X1, Y1 = X1[ind1], Y1[ind1]
In [140]:
Y1.shape
Out[140]:
Length histogram.
In [162]:
pd.Series(np.sum(Y1, axis=1)).hist()
Out[162]:
Popularity histogram.
In [163]:
pd.Series(np.sum(Y1, axis=0)).hist()
Out[163]:
In [141]:
X1_train, X1_test, Y1_train, Y1_test = train_test_split(X1, Y1, test_size=0.33, random_state=7)
In [142]:
X1_train_mean = np.mean(X1_train, axis=0).reshape((1, -1))
X1_train_std = np.std(X1_train, axis=0).reshape((1, -1)) + 10 ** (-6)
X1_train -= X1_train_mean
X1_train /= X1_train_std
X1_test -= X1_train_mean
X1_test /= X1_train_std
In [143]:
Y1_train.shape
Out[143]:
Train.
In [144]:
ranges = range(-6, 7)
#ranges = range(-6, 5)
parameters = [{'C': sorted([10**(e) for e in ranges] + [3 * 10**(e) for e in ranges]),
'r': [0.5, 1, 2, 4]}]
scorer = {'Prec': make_scorer(avgPrecisionK)}
In [146]:
clf1 = GridSearchCV(TopPushMLC(), parameters, scoring=scorer, cv=5, n_jobs=1, refit='Prec')
clf1.fit(X1_train, Y1_train)
Out[146]:
In [159]:
br1 = GridSearchCV(BinaryRelevance(), param_grid=[{'C': parameters[0]['C']}], scoring=scorer, \
cv=5, n_jobs=4, refit='Prec')
br1.fit(X1_train, Y1_train)
Out[159]:
In [148]:
print('TP1:')
evaluatePrecision(Y1_test, clf1.decision_function(X1_test))
Out[148]:
In [160]:
print('BR1:')
evaluatePrecision(Y1_test, br1.decision_function(X1_test))
Out[160]:
In [164]:
type2_songs = np.asarray(song_set)[indices[200:400]]
In [165]:
type2_song_features = {sid: song2Features[sid] for sid in type2_songs}
In [166]:
type2_song_label_indices = {sid: ix for ix, sid in enumerate(type2_songs)}
In [167]:
playlist_subset2 = [pl for pl in playlists if pl[0] in type2_song_label_indices]
In [168]:
X2, Y2 = gen_training_set(playlists=playlist_subset2, label_indices=type2_song_label_indices, \
features=type2_song_features)
Filter out playlists containing only one song.
In [169]:
Y2 = Y2.toarray()
In [170]:
ind2 = Y2.sum(axis=1) > 1
In [171]:
X2, Y2 = X2[ind2], Y2[ind2]
In [172]:
Y2.shape
Out[172]:
Length histogram.
In [173]:
pd.Series(np.sum(Y2, axis=1)).hist()
Out[173]:
Popularity histogram.
In [174]:
pd.Series(np.sum(Y2, axis=0)).hist()
Out[174]:
In [175]:
X2_train, X2_test, Y2_train, Y2_test = train_test_split(X2, Y2, test_size=0.33, random_state=7)
In [176]:
X2_train_mean = np.mean(X2_train, axis=0).reshape((1, -1))
X2_train_std = np.std(X2_train, axis=0).reshape((1, -1)) + 10 ** (-6)
X2_train -= X2_train_mean
X2_train /= X2_train_std
X2_test -= X2_train_mean
X2_test /= X2_train_std
Train.
In [177]:
clf2 = GridSearchCV(TopPushMLC(), parameters, scoring=scorer, cv=5, n_jobs=1, refit='Prec')
clf2.fit(X2_train, Y2_train)
Out[177]:
In [179]:
br2 = GridSearchCV(BinaryRelevance(), param_grid=[{'C': parameters[0]['C']}], scoring=scorer, \
cv=5, n_jobs=4, refit='Prec')
br2.fit(X2_train, Y2_train)
Out[179]:
In [178]:
print('TP2:')
evaluatePrecision(Y2_test, clf2.decision_function(X2_test))
Out[178]:
In [180]:
print('BR2:')
evaluatePrecision(Y2_test, br2.decision_function(X2_test))
Out[180]:
In [ ]:
def print_results(predictor, X_train, Y_train, X_test, Y_test, trainPerf=False):
    """
    Compute and print Precision@k results, processing the data in batches.
    """
    batch_size = 500
    njobs = 16
    p3_test = []
    p5_test = []
    pk_test = []
    p10_test = []
    #rankloss_test = []
    N_test = X_test.shape[0]
    N_batch_test = int((N_test - 1) / batch_size) + 1
    for i in range(N_batch_test):
        sys.stdout.write('\r%d / %d' % (i + 1, N_batch_test)); sys.stdout.flush()
        ix0 = i * batch_size
        ix1 = min((i + 1) * batch_size, N_test)
        preds = predictor.decision_function(X_test[ix0:ix1])
        evaldict = evaluatePrecision(Y_test[ix0:ix1].toarray(), preds, verbose=-1, n_jobs=njobs)
        size = ix1 - ix0
        # accumulate per-batch means weighted by batch size
        p3_test.append(evaldict['Precision@3'][0] * size)
        p5_test.append(evaldict['Precision@5'][0] * size)
        pk_test.append(evaldict['Precision@K'][0] * size)
        p10_test.append(evaldict['Precision@10'][0] * size)
        #rankloss_test.append(evalPred1(Y_test[i].toarray()[0], pred, metricType='Ranking'))
    print()
    print('Test set:')
    print('Precision@3:', (np.sum(p3_test) / N_test))
    print('Precision@5:', (np.sum(p5_test) / N_test))
    print('Precision@k:', (np.sum(pk_test) / N_test))
    print('Precision@10:', (np.sum(p10_test) / N_test))
    print()
    if trainPerf is True:
        p3_train = []
        p5_train = []
        pk_train = []
        p10_train = []
        #rankloss_train = []
        N_train = X_train.shape[0]
        N_batch_train = int((N_train - 1) / batch_size) + 1
        for i in range(N_batch_train):
            sys.stdout.write('\r%d / %d' % (i + 1, N_batch_train)); sys.stdout.flush()
            ix0 = i * batch_size
            ix1 = min((i + 1) * batch_size, N_train)
            preds = predictor.decision_function(X_train[ix0:ix1])
            evaldict = evaluatePrecision(Y_train[ix0:ix1].toarray(), preds, verbose=-1, n_jobs=njobs)
            size = ix1 - ix0
            p3_train.append(evaldict['Precision@3'][0] * size)
            p5_train.append(evaldict['Precision@5'][0] * size)
            pk_train.append(evaldict['Precision@K'][0] * size)
            p10_train.append(evaldict['Precision@10'][0] * size)
            #rankloss_train.append(evalPred1(Y_train[i].toarray()[0], pred, metricType='Ranking'))
        print()
        print('Training set:')
        print('Precision@3:', (np.sum(p3_train) / N_train))
        print('Precision@5:', (np.sum(p5_train) / N_train))
        print('Precision@k:', (np.sum(pk_train) / N_train))
        print('Precision@10:', (np.sum(p10_train) / N_train))
    #print()
    #print('Training set:')
    #print('RankingLoss: %.1f, %.1f' % (np.mean(rankloss_train), np.std(rankloss_train) / N_train))
    #print()
    #print('Test set:')
    #print('RankingLoss: %.1f, %.1f' % (np.mean(rankloss_test), np.std(rankloss_test) / N_test))
In [ ]:
def print_dataset_info(X_train, Y_train, X_test, Y_test):
    N_train, D = X_train.shape
    K = Y_train.shape[1]
    N_test = X_test.shape[0]
    print('%-45s %s' % ('Number of training examples:', '{:,}'.format(N_train)))
    print('%-45s %s' % ('Number of test examples:', '{:,}'.format(N_test)))
    print('%-45s %s' % ('Number of features:', '{:,}'.format(D)))
    print('%-45s %s' % ('Number of labels:', '{:,}'.format(K)))
    avgK_train = np.mean(np.sum(Y_train, axis=1))
    avgK_test = np.mean(np.sum(Y_test, axis=1))
    print('%-45s %.3f (%.3f%%)' % ('Average number of positive labels (train):', avgK_train, 100 * avgK_train / K))
    print('%-45s %.3f (%.3f%%)' % ('Average number of positive labels (test):', avgK_test, 100 * avgK_test / K))
    #print('%-45s %.4f%%' % ('Average label occurrence (train):', np.mean(np.sum(Y_train, axis=0)) / N_train))
    #print('%-45s %.4f%%' % ('Average label occurrence (test):', np.mean(np.sum(Y_test, axis=0)) / N_test))
    print('%-45s %.3f%%' % ('Sparsity (percent) (train):', 100 * np.sum(Y_train) / np.prod(Y_train.shape)))
    print('%-45s %.3f%%' % ('Sparsity (percent) (test):', 100 * np.sum(Y_test) / np.prod(Y_test.shape)))
In [ ]:
print_dataset_info(X_train, Y_train, X_test, Y_test)
In [ ]:
clf_ = TopPushMLC(C=10000, r=2)
clf_.fit_SGD(X_train, Y_train, batch_size=500, n_epochs=10, learning_rate=0.05)
In [ ]:
print_results(clf_, X_train, Y_train, X_test, Y_test)