In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os, sys
import gzip
import pickle as pkl
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, average_precision_score
from scipy.sparse import lil_matrix, issparse
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
sys.path.append('src')
from BinaryRelevance import BinaryRelevance
#from PClassificationMLC import PClassificationMLC
from PCMLC import PCMLC as PClassificationMLC
from evaluate import calc_F1, calc_precisionK, calc_rank, f1_score_nowarn
In [3]:
data_dir = 'data/aotm-2011'
#faotm = os.path.join(data_dir, 'aotm2011-subset.pkl')
faotm = os.path.join(data_dir, 'aotm2011-user-playlist.pkl')
ffeature = 'data/msd/songID2Features.pkl.gz'
fgenre = 'data/msd/song2genre.pkl'
Load playlists.
In [4]:
user_playlists = pkl.load(open(faotm, 'rb'))
In [5]:
print('#user :', len(user_playlists))
print('#playlist:', np.sum([len(user_playlists[u]) for u in user_playlists]))
In [6]:
pl_lengths = [len(pl) for u in user_playlists for pl in user_playlists[u]]
#plt.hist(pl_lengths, bins=100)
print('Average playlist length: %.1f' % np.mean(pl_lengths))
In [7]:
users = sorted(user_playlists.keys())
In [8]:
songs_user = {u: {sid for pl in user_playlists[u] for sid in pl} for u in users} # user: a set of songs
Compute the number of playlists per user, and the number of songs covered by the user's playlists.
In [9]:
udf = pd.DataFrame(index=users, columns=['#playlist', '#song'])
In [10]:
udf['#playlist'] = [len(user_playlists[u]) for u in users]
In [11]:
udf['#song'] = [len(songs_user[u]) for u in users]
In [12]:
ax = plt.subplot(111)
udf['#playlist'].hist(bins=200, ax=ax)
ax.set_yscale('log')
In [13]:
u_npl = sorted([(u, len(user_playlists[u])) for u in users], key=lambda x: x[1])
In [14]:
#u_npl
In [15]:
step = 1000 # sample 0.1%
subset = [u_npl[ix] for ix in np.arange(0, len(u_npl), step)]
In [16]:
subset
Out[16]:
In [17]:
uid_subset = [t[0] for t in subset]
In [18]:
#udf_subset = udf[ udf['#playlist'] == 10]
#udf_subset.head()
In [19]:
#uid_subset += udf_subset.index[[3,4]].tolist()
In [20]:
#udf_subset = udf[ udf['#playlist'] == 30]
#udf_subset.head()
In [21]:
#uid_subset += udf_subset.index[[1,2]].tolist()
In [22]:
#udf_subset = udf[ udf['#playlist'].isin(np.arange(95, 105))]
#udf_subset.sort_values(by='#playlist')
In [23]:
#uid_subset += udf_subset.index[[2,5]].tolist()
In [24]:
#udf.sort_values(by=['#playlist'], ascending=False).iloc[100:200]
In [25]:
#uid_subset
In [26]:
#udf[uid_subset] # tuple are used as multiindex in pandas
#udf[[uid_subset]]
The user whose playlists cover a proper number of playlists, e.g. 50.
In [27]:
playlists_subset = [pl for u in uid_subset for pl in user_playlists[u]]
In [28]:
len(playlists_subset)
Out[28]:
In [29]:
song_set = sorted({sid for u in uid_subset for sid in songs_user[u]})
In [30]:
len(song_set)
Out[30]:
Split songs (80/20 split) such that the distributions of song popularity (the number of occurrence in playlists) in training and dev set are similiar.
In [31]:
song_pl_mat = np.zeros((len(song_set), len(playlists_subset)))
songind = {sid: ix for ix, sid in enumerate(song_set)}
for j in range(len(playlists_subset)):
pl = playlists_subset[j]
ind = [songind[sid] for sid in pl]
song_pl_mat[ind, j] = 1
In [32]:
song_pop = np.sum(song_pl_mat, axis=1)
In [33]:
#plt.hist(song_pop, bins=20)
#print()
In [34]:
#np.arange(0, 100, 5)
In [35]:
sortix = np.argsort(song_pop)
step = 5 # 80/20 split
split_ix = np.arange(0, len(song_pop), step)
dev_ix = [sortix[ix] for ix in split_ix]
dev_song_set = [song_set[ix] for ix in dev_ix]
train_song_set = sorted(set(song_set) - set(dev_song_set))
Histogram of song popularity in training set.
In [36]:
train_song_pop = [song_pop[songind[sid]] for sid in train_song_set]
ax = plt.subplot(111)
ax.hist(train_song_pop, bins=30)
ax.set_yscale('log')
ax.set_xlim(0, song_pop.max()+1)
print(len(train_song_set))
Histogram of song popularity in dev set.
In [37]:
dev_song_pop = [song_pop[songind[sid]] for sid in dev_song_set]
ax = plt.subplot(111)
ax.hist(dev_song_pop, bins=30)
ax.set_yscale('log')
ax.set_xlim(0, song_pop.max()+1)
print(len(dev_song_set))
Split playlists (50/50 split) such that the distributions of playlist length (the number of songs in playlists) for each user in training and dev set are similiar.
In [38]:
train_playlists = []
dev_playlists = []
In [39]:
#np.arange(0, 10, 2)
In [40]:
for u in uid_subset:
u_playlists = user_playlists[u]
if len(u_playlists) < 2:
train_playlists.append((u, u_playlists[0]))
continue
sorted_pl = sorted(u_playlists, key=lambda pl: len(pl))
step = 2
sample_ix = np.arange(0, len(sorted_pl), step)
if len(sample_ix) <= len(sorted_pl) / 2:
dev_ix = sample_ix
else:
dev_ix = [ix for ix in range(len(sorted_pl)) if ix not in sample_ix]
dev_playlists += [(u, sorted_pl[ix]) for ix in dev_ix]
train_playlists += [(u, sorted_pl[ix]) for ix in range(len(sorted_pl)) if ix not in dev_ix]
In [41]:
xmax = np.max([len(pl) for pl in playlists_subset]) + 1
Histogram of playlist length in training set.
In [42]:
ax = plt.subplot(111)
ax.hist([len(t[1]) for t in train_playlists], bins=50)
ax.set_yscale('log')
ax.set_xlim(0, xmax)
print(len(train_playlists))
Histogram of playlist length in training set.
In [43]:
ax = plt.subplot(111)
ax.hist([len(t[1]) for t in dev_playlists], bins=50)
ax.set_yscale('log')
ax.set_xlim(0, xmax)
print(len(dev_playlists))
Split songs such that
In [44]:
num_unknown = len(playlists_subset) * len(dev_song_set) # the number of unknown in setting I
num_dev_song2 = num_unknown / len(dev_playlists)
In [45]:
#np.arange(0, 100, 2.5)
In [46]:
sortix = np.argsort(song_pop)
step = len(song_pop) / (num_dev_song2)
split_ix = np.arange(0, len(song_pop), step)
np.random.seed(123456789)
rounding_prob = step - int(step)
dev_ix = [sortix[ix] for ix in [int(x) if np.random.rand() < rounding_prob or int(x) == len(sortix)-1 \
else int(x)+1 for x in split_ix]] # avoid index out of bounds
dev_song_set2 = [song_set[ix] for ix in dev_ix]
In [47]:
print(num_unknown, len(dev_playlists) * len(dev_song_set2))
Histogram of song popularity in training set (all songs).
In [48]:
ax = plt.subplot(111)
ax.hist(song_pop, bins=30)
ax.set_yscale('log')
ax.set_xlim(0, song_pop.max()+1)
print(len(song_set))
Histogram of song popularity in dev set.
In [49]:
dev_song_pop2 = [song_pop[songind[sid]] for sid in dev_song_set2]
ax = plt.subplot(111)
ax.hist(dev_song_pop2, bins=30)
ax.set_yscale('log')
ax.set_xlim(0, song_pop.max()+1)
print(len(dev_song_set2))
Load song_id
--> feature array
mapping: map a song to the audio features of one of its corresponding tracks in MSD.
In [50]:
song2Features = pkl.load(gzip.open(ffeature, 'rb'))
In [51]:
song2genre = pkl.load(open(fgenre, 'rb'))
Check if all songs have genre info.
In [52]:
np.all([sid in song2genre for sid in song_set])
Out[52]:
Songs as rows, playlists as columns.
In [53]:
def gen_dataset_subset(playlists, song_set, features_MSD, song2genre):
"""
Create labelled dataset: rows are songs, columns are users.
Input:
- playlists: a set of playlists
- song_set: a set of songIDs
- features_MSD: dictionary that maps songIDs to features from MSD
- song2genre: dictionary that maps songIDs to genre
Output:
- (Feature, Label) pair (X, Y)
X: #songs by #features
Y: #songs by #users
"""
song_indices = {sid: ix for ix, sid in enumerate(song_set)}
N = len(song_set)
K = len(playlists)
genre_set = sorted({v for v in song2genre.values()})
genre_indices = {genre: ix for ix, genre in enumerate(genre_set)}
def onehot_genre(songID):
"""
One-hot encoding of genres.
Data imputation: one extra entry for songs without genre info.
Should try other options:
mean imputation, sampling from the distribution of genre popularity.
"""
num = len(genre_set) + 1
vec = np.zeros(num, dtype=np.float)
if songID in song2genre:
genre_ix = genre_indices[song2genre[songID]]
vec[genre_ix] = 1
else:
vec[-1] = 1
return vec
#X = np.array([features_MSD[sid] for sid in song_set]) # without using genre
X = np.array([np.concatenate([features_MSD[sid], onehot_genre(sid)], axis=-1) for sid in song_set])
Y = np.zeros((N, K), dtype=np.bool)
for k in range(K):
pl = playlists[k]
indices = [song_indices[sid] for sid in pl if sid in song_indices]
Y[indices, k] = True
return X, Y
In [54]:
def mean_normalised_reciprocal_rank(Y_true, Y_pred):
"""
Compute the mean of normalised reciprocal rank (reciprocal rank are normalised by the best possible ranks)
"""
normalised_reci_rank = []
npos = np.sum(Y_true, axis=0)
for k in range(Y_true.shape[1]):
ranks = calc_rank(Y_pred[:, k])[Y_true[:, k]]
if len(ranks) > 0:
ideal = np.sum([1./nk for nk in range(1, npos[k]+1)])
real = np.sum([1./r for r in ranks])
normalised_reci_rank.append(real / ideal) # normalise the reciprocal ranks by the best possible ranks
return np.mean(normalised_reci_rank)
In [447]:
def eval_pl(Y_true, Y_pred):
nzcol = np.nonzero(np.sum(Y_true, axis=0))[0] # columns with at least one True
print('Average over %d columns' % len(nzcol))
print('%-15s %.4f' % ('Mean P@K:', np.mean(calc_precisionK(Y_true.T, Y_pred.T))))
print('%-15s %.4f' % ('Mean AUC:', roc_auc_score(Y_true[:, nzcol], Y_pred[:, nzcol], average='macro')))
print('%-15s %.4f' % ('MAP:', average_precision_score(Y_true[:, nzcol], Y_pred[:, nzcol], average='macro')))
print('%-15s %.4f' % ('Mean NRR:', mean_normalised_reciprocal_rank(Y_true, Y_pred)))
In [437]:
playlists1 = [t[1] for t in train_playlists + dev_playlists]
In [438]:
X_train, Y_train = gen_dataset_subset(playlists=playlists1, song_set=train_song_set,
features_MSD=song2Features, song2genre=song2genre)
In [439]:
X_dev, Y_dev = gen_dataset_subset(playlists=playlists1, song_set=dev_song_set,
features_MSD=song2Features, song2genre=song2genre)
Feature normalisation.
In [440]:
X_train_mean = np.mean(X_train, axis=0).reshape((1, -1))
X_train_std = np.std(X_train, axis=0).reshape((1, -1)) + 10 ** (-6)
X_train -= X_train_mean
X_train /= X_train_std
X_dev -= X_train_mean
X_dev /= X_train_std
In [441]:
print('Train: %15s %15s' % (X_train.shape, Y_train.shape))
print('Dev : %15s %15s' % (X_dev.shape, Y_dev.shape))
In [442]:
print(np.mean(np.mean(X_train, axis=0)))
print(np.mean( np.std(X_train, axis=0)) - 1)
print(np.mean(np.mean(X_dev, axis=0)))
print(np.mean( np.std(X_dev, axis=0)) - 1)
In [443]:
#np.sum(Y_train, axis=0)
In [444]:
#np.sum(Y_dev, axis=0)
In [445]:
br = BinaryRelevance(C=1, n_jobs=4)
br.fit(X_train, Y_train)
Evaluation: normalise per playlist.
In [448]:
print('Dev set:')
eval_pl(Y_dev, br.predict(X_dev))
In [449]:
print('Training set:')
eval_pl(Y_train, br.predict(X_train))
In [450]:
#print('Mean macro-F1:', f1_score_nowarn(Y_dev.T, Y_br.T>=0, average='macro'))
In [451]:
#print('P, R, F1:',precision_recall_fscore_support(Y_dev.ravel(), (Y_br>=0).ravel(),average='binary', warn_for=()))
P@K, MAP, NDCG, AUC are good options.
In [452]:
%%script false
min_pos_score = []
for col in range(Y_dev.shape[1]):
val = Y_br[:,col][Y_dev[:,col]]
if len(val) > 0:
min_pos_score.append(np.min(val))
else:
min_pos_score.append(np.nan)
print(np.array(min_pos_score))
In [453]:
%%script false
max_neg_score = []
for col in range(Y_dev.shape[1]):
val = Y_br[:,col][np.logical_not(Y_dev[:,col])]
if len(val) > 0:
max_neg_score.append(np.max(val))
print(np.array(max_neg_score))
In [454]:
#print(np.array(min_pos_score)-np.array(max_neg_score))
P-Classification ~ P-norm push ranking.
In [455]:
pc = PClassificationMLC(C1=1, weighting='labels')
pc.fit(X_train, Y_train)
Evaluation: normalise per playlist.
In [456]:
print('Dev set:')
eval_pl(Y_dev, pc.predict(X_dev))
In [457]:
print('Training set:')
eval_pl(Y_train, pc.predict(X_train))
In [458]:
min_pos_score = []
for col in range(Y_test.shape[1]):
val = Y_pc[:,col][Y_test[:,col]]
if len(val) > 0:
min_pos_score.append(np.min(val))
else:
min_pos_score.append(np.nan)
#plt.hist((np.array(min_pos_score)))
#plt.hist((np.nan_to_num(min_pos_score)), bins=30)
#print(np.array(min_pos_score))
#print()
In [459]:
max_neg_score = []
for col in range(Y_test.shape[1]):
val = Y_pc[:,col][np.logical_not(Y_test[:,col])]
if len(val) > 0:
max_neg_score.append(np.max(val))
#plt.hist(np.array(max_neg_score), bins=30)
#print()
In [460]:
#plt.hist(np.nan_to_num(min_pos_score)-np.array(max_neg_score), bins=30)
#print()
In [538]:
nondev_song_set2 = sorted({sid for sid in song_set if sid not in dev_song_set2})
print(len(nondev_song_set2) + len(dev_song_set2), len(song_set))
In [539]:
playlists2 = [t[1] for t in train_playlists + dev_playlists]
In [542]:
X, Y = gen_dataset_subset(playlists=playlists2, song_set=nondev_song_set2 + dev_song_set2,
features_MSD=song2Features, song2genre=song2genre)
In [543]:
Y_train = Y.copy().astype(np.float) # note: np.nan is float
Y_train[len(nondev_song_set2):, len(train_playlists):] = np.nan
Y_dev = Y[-len(dev_song_set2):, len(train_playlists):]
In [544]:
#Y_train
In [545]:
print(np.sum(np.isnan(Y_train)), len(dev_playlists) * len(dev_song_set2))
In [546]:
X_train = X
Feature normalisation.
In [547]:
X_train_mean = np.mean(X_train, axis=0).reshape((1, -1))
X_train_std = np.std(X_train, axis=0).reshape((1, -1)) + 10 ** (-6)
X_train -= X_train_mean
X_train /= X_train_std
X_dev = X_train[len(nondev_song_set2):]
In [548]:
print('Train: %15s %15s' % (X_train.shape, Y_train.shape))
print('Dev : %15s %15s' % (X_dev.shape, Y_dev.shape))
In [549]:
print(np.mean(np.mean(X_train, axis=0)))
print(np.mean( np.std(X_train, axis=0)) - 1)
print(np.mean(np.mean(X_dev, axis=0)))
print(np.mean( np.std(X_dev, axis=0)) - 1)
In [550]:
#np.sum(Y_train, axis=0)
In [551]:
#np.sum(Y_dev, axis=0)
In [552]:
br2 = BinaryRelevance(C=1, n_jobs=4)
br2.fit(X_train, np.nan_to_num(Y_train))
In [553]:
col_start = -len(dev_playlists)
Evaluation: normalise per playlist.
In [554]:
print('Dev set:')
eval_pl(Y_dev, br2.predict(X_dev)[:, col_start:])
In [555]:
Y_train_gt = np.nan_to_num(Y_train).astype(np.bool)
Y_train_pred = br2.predict(X_train)
Y_train_pred[:, col_start:] = 0 # remove test region
print('Training set:')
eval_pl(Y_train_gt, Y_train_pred)
In [556]:
user_of_playlists2 = [t[0] for t in train_playlists + dev_playlists]
#user_of_playlists2
In [557]:
same_user_mat = np.zeros((len(playlists2), len(playlists2)), dtype=np.bool)
for i in range(len(playlists2)):
for j in range(i+1, len(playlists2)):
if user_of_playlists2[i] == user_of_playlists2[j]:
same_user_mat[i, j] = True
same_user_mat[j, i] = True
In [558]:
#same_user_mat
In [559]:
pla = PClassificationMLC(C1=1, C2=1, C3=10, weighting='both', similarMat=same_user_mat)
pla.fit(X_train, Y_train)
In [560]:
col_start = -len(dev_playlists)
Evaluation: normalise per playlist.
In [561]:
eval_pl(Y_dev, pla.predict(X_dev)[:, col_start:])
In [562]:
Y_train_gt = np.nan_to_num(Y_train).astype(np.bool)
Y_train_pred = pla.predict(X_train)
Y_train_pred[:, col_start:] = 0 # remove test region
eval_pl(Y_train_gt, Y_train_pred)
Check the if the regulariser is effective
In [563]:
%%script false
rows, cols = np.nonzero(same_user_mat)
for row, col in zip(rows, cols):
diff = pla.W[row] - pla.W[col]
print('%g' % np.sqrt(np.dot(pla.W[row], pla.W[row])))
print('%g' % np.sqrt(np.dot(pla.W[col], pla.W[col])))
print('%g' % np.sqrt(np.dot(diff, diff)))
print('------------------------------')
Compute matrix $M$ such that $M_{jk} = \sqrt{(w_j - w_k)^\top (w_j - w_k)}, \forall j, k$.
In [564]:
A = np.dot(pla.W, pla.W.T)
B = np.tile(np.diag(A), (A.shape[0], 1))
M = np.sqrt(-2 * A + (B + B.T))
Normalise $M$ by the vector with maximum norm in $W$.
In [565]:
#aa = np.arange(6).reshape(3, 2)
#np.einsum('ij,ij->i', aa, aa)
In [566]:
denorm = np.sqrt(np.einsum('ij,ij->i', pla.W, pla.W)) # compute the norm for each row in W
In [567]:
M1 = M / np.max(denorm)
In [568]:
plt.matshow(M1)
Out[568]:
In [569]:
user_of_playlists2
Out[569]:
In [530]:
rows, cols = np.nonzero(same_user_mat)
M2 = M1[rows, cols]
print(np.min(M2), np.max(M2), np.mean(M2), np.std(M2))
In [531]:
mat = same_user_mat.copy()
np.fill_diagonal(mat, 1) # remove the diagnoal from consideration
rows, cols = np.where(mat == 0)
M3 = M1[rows, cols]
print(np.min(M3), np.max(M3), np.mean(M3), np.std(M3))
Check performance per user
In [570]:
user_set = sorted(set(user_of_playlists2))
user_set
Out[570]:
In [571]:
#user_of_playlists2
In [572]:
Y_pla = pla.predict(X_dev)[:, col_start:]
dev_col_start = len(train_playlists)
for u in user_set:
uind = np.where(np.array(user_of_playlists2, dtype=np.object) == u)[0]
ntrain = len(uind)
if len(uind) < 2: continue # filtering out users with less than 2 playlists
uind -= dev_col_start
uind = uind[uind >= 0]
ntest = len(uind)
#print(uind)
if len(uind) < 1: continue
print('--------------------')
print('USER:', u)
print('#train: %d, #test: %d' % (ntrain, ntest))
eval_pl(Y_dev[:, uind], Y_pla[:, uind])
print()
In [ ]:
N, K = Y.shape
In [ ]:
Y_nan = Y.copy().astype(np.float)
np.random.seed(8967321)
rand_num = int(0.2 * N)
ones = 0
for k in range(K):
randix = np.random.permutation(np.arange(N))[:rand_num]
Y_nan[randix, k] = np.nan
ones += Y[randix, k].sum()
The number of NaN entries.
In [ ]:
np.sum(np.isnan(Y_nan))
Train: keep running util no overflow warning occurred.
In [ ]:
pc2 = PClassificationMLC(weighting=True, verticalWeighting=True)
pc2.fit(X, Y_nan)
Prediction: use the minimum of positive entry score of the same example as threshold.
Evaluation: use F1 on all unknown entries (as a 1D array).
In [ ]:
Y_pred2 = pc2.predict(X)
In [ ]:
pos_index = np.nan_to_num(Y_nan).astype(np.bool)
nan_index = np.isnan(Y_nan)
In [ ]:
ground_truths = Y[nan_index]
In [ ]:
thresholds = []
preds = []
for k in range(K):
val = Y_pred2[:, k][pos_index[:, k]]
th = np.min(val)
thresholds.append(th)
preds += (Y_pred2[nan_index[:,k], k] >= th).tolist()
In [ ]:
f1_score_nowarn(ground_truths, preds, average='binary')
In [ ]:
precision_recall_fscore_support(ground_truths, preds, average='binary', warn_for=None)
In [ ]:
Y_nan_part2 = Y.copy().astype(np.float)
np.random.seed(8967321)
rand_num = int(0.4 * N)
ones = 0
for k in range(int(K/2), K):
randix = np.random.permutation(np.arange(N))[:rand_num]
Y_nan_part2[randix, k] = np.nan
ones += Y[randix, k].sum()
The number of NaN entries.
In [ ]:
np.sum(np.isnan(Y_nan_part2))
In [ ]:
pc4 = PClassificationMLC(weighting=True, verticalWeighting=True)
pc4.fit(X, Y_nan_part2)
Prediction: use the minimum of positive entry score of the same example as threshold.
Evaluation: use F1 on all unknown entries (as a 1D array).
In [ ]:
Y_pred4 = pc4.predict(X)
In [ ]:
pos_index = np.nan_to_num(Y_nan_part2).astype(np.bool)
nan_index = np.isnan(Y_nan_part2)
In [ ]:
ground_truths = Y[nan_index]
In [ ]:
thresholds = []
preds = []
for k in range(int(K/2), K):
val = Y_pred4[:, k][pos_index[:, k]]
th = np.min(val)
#th = np.mean(val)
thresholds.append(th)
preds += (Y_pred4[nan_index[:,k], k] >= th).tolist()
In [ ]:
f1_score_nowarn(ground_truths, preds, average='binary')
In [ ]:
precision_recall_fscore_support(ground_truths, preds, average='binary', warn_for=None)
In [ ]:
2 * np.sum(np.logical_and(ground_truths, preds)) / (np.sum(ground_truths) + np.sum(preds))