A representative subset of AotM-2011 Playlists with MSD Audio Features



In [1]:

    
%matplotlib inline
%load_ext autoreload
%autoreload 2

import os, sys
import gzip
import pickle as pkl
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, average_precision_score
from scipy.sparse import lil_matrix, issparse
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns



In [2]:

    
sys.path.append('src')
from BinaryRelevance import BinaryRelevance
#from PClassificationMLC import PClassificationMLC
from PCMLC import PCMLC as PClassificationMLC
from evaluate import calc_F1, calc_precisionK, calc_rank, f1_score_nowarn



In [3]:

    
data_dir = 'data/aotm-2011'
#faotm = os.path.join(data_dir, 'aotm2011-subset.pkl')
faotm = os.path.join(data_dir, 'aotm2011-user-playlist.pkl')
ffeature = 'data/msd/songID2Features.pkl.gz'
fgenre = 'data/msd/song2genre.pkl'

Load playlists

Load playlists.



In [4]:

    
user_playlists = pkl.load(open(faotm, 'rb'))



In [5]:

    
print('#user    :', len(user_playlists))
print('#playlist:', np.sum([len(user_playlists[u]) for u in user_playlists]))









    



#user    : 14182
#playlist: 84710



In [6]:

    
pl_lengths = [len(pl) for u in user_playlists for pl in user_playlists[u]]
#plt.hist(pl_lengths, bins=100)
print('Average playlist length: %.1f' % np.mean(pl_lengths))









    



Average playlist length: 10.1



In [7]:

    
users = sorted(user_playlists.keys())



In [8]:

    
songs_user = {u: {sid for pl in user_playlists[u] for sid in pl} for u in users}  # user: a set of songs

Compute the number of playlists per user, and the number of songs covered by the user's playlists.



In [9]:

    
udf = pd.DataFrame(index=users, columns=['#playlist', '#song'])



In [10]:

    
udf['#playlist'] = [len(user_playlists[u]) for u in users]



In [11]:

    
udf['#song'] = [len(songs_user[u]) for u in users]



In [12]:

    
ax = plt.subplot(111)
udf['#playlist'].hist(bins=200, ax=ax)
ax.set_yscale('log')



In [13]:

    
u_npl = sorted([(u, len(user_playlists[u])) for u in users], key=lambda x: x[1])



In [14]:

    
#u_npl



In [15]:

    
step = 1000  # sample 0.1%
subset = [u_npl[ix] for ix in np.arange(0, len(u_npl), step)]



In [16]:

    
subset









    Out[16]:





[((914763600.0, 'Grey'), 1),
 ((1050674400.0, 'World Class Chris'), 1),
 ((1076418000.0, 'Garick'), 1),
 ((1076418000.0, 'Tom Jones'), 1),
 ((1078923600.0, 'Kurt87'), 1),
 ((1115992800.0, 'm12788'), 1),
 ((1190728800.0, 'EvaninCambridge'), 1),
 ((1058364000.0, 'Meg Usta'), 2),
 ((1092578400.0, 'XxPunk4PeacexX'), 2),
 ((1049205600.0, 'hobbitcore'), 3),
 ((1008421200.0, 'Chris Bowron'), 4),
 ((1079787600.0, 'tragic3rdwoman'), 5),
 ((1069246800.0, 'Axelsson'), 8),
 ((1012482000.0, 'lynn*'), 15),
 ((1009198800.0, 'jarød'), 62)]



In [17]:

    
uid_subset = [t[0] for t in subset]



In [18]:

    
#udf_subset = udf[ udf['#playlist'] == 10]
#udf_subset.head()



In [19]:

    
#uid_subset += udf_subset.index[[3,4]].tolist()



In [20]:

    
#udf_subset = udf[ udf['#playlist'] == 30]
#udf_subset.head()



In [21]:

    
#uid_subset += udf_subset.index[[1,2]].tolist()



In [22]:

    
#udf_subset = udf[ udf['#playlist'].isin(np.arange(95, 105))]
#udf_subset.sort_values(by='#playlist')



In [23]:

    
#uid_subset += udf_subset.index[[2,5]].tolist()



In [24]:

    
#udf.sort_values(by=['#playlist'], ascending=False).iloc[100:200]



In [25]:

    
#uid_subset



In [26]:

    
#udf[uid_subset]  # tuple are used as multiindex in pandas
#udf[[uid_subset]]

Subset of data

The user whose playlists cover a proper number of playlists, e.g. 50.



In [27]:

    
playlists_subset = [pl for u in uid_subset for pl in user_playlists[u]]



In [28]:

    
len(playlists_subset)









    Out[28]:





108



In [29]:

    
song_set = sorted({sid for u in uid_subset for sid in songs_user[u]})



In [30]:

    
len(song_set)









    Out[30]:





786

Split songs for setting I

Split songs (80/20 split) such that the distributions of song popularity (the number of occurrence in playlists) in training and dev set are similiar.



In [31]:

    
song_pl_mat = np.zeros((len(song_set), len(playlists_subset)))
songind = {sid: ix for ix, sid in enumerate(song_set)}
for j in range(len(playlists_subset)):
    pl = playlists_subset[j]
    ind = [songind[sid] for sid in pl]
    song_pl_mat[ind, j] = 1



In [32]:

    
song_pop = np.sum(song_pl_mat, axis=1)



In [33]:

    
#plt.hist(song_pop, bins=20)
#print()



In [34]:

    
#np.arange(0, 100, 5)



In [35]:

    
sortix = np.argsort(song_pop)
step = 5  # 80/20 split
split_ix = np.arange(0, len(song_pop), step)
dev_ix = [sortix[ix] for ix in split_ix]
dev_song_set = [song_set[ix] for ix in dev_ix]
train_song_set = sorted(set(song_set) - set(dev_song_set))

Histogram of song popularity in training set.



In [36]:

    
train_song_pop = [song_pop[songind[sid]] for sid in train_song_set]
ax = plt.subplot(111)
ax.hist(train_song_pop, bins=30)
ax.set_yscale('log')
ax.set_xlim(0, song_pop.max()+1)
print(len(train_song_set))

Histogram of song popularity in dev set.



In [37]:

    
dev_song_pop = [song_pop[songind[sid]] for sid in dev_song_set]
ax = plt.subplot(111)
ax.hist(dev_song_pop, bins=30)
ax.set_yscale('log')
ax.set_xlim(0, song_pop.max()+1)
print(len(dev_song_set))

Split playlists

Split playlists (50/50 split) such that the distributions of playlist length (the number of songs in playlists) for each user in training and dev set are similiar.



In [38]:

    
train_playlists = []
dev_playlists = []



In [39]:

    
#np.arange(0, 10, 2)



In [40]:

    
for u in uid_subset:
    u_playlists = user_playlists[u]
    if len(u_playlists) < 2: 
        train_playlists.append((u, u_playlists[0]))
        continue
    sorted_pl = sorted(u_playlists, key=lambda pl: len(pl))
    step = 2
    sample_ix = np.arange(0, len(sorted_pl), step)
    if len(sample_ix) <= len(sorted_pl) / 2:
        dev_ix = sample_ix
    else:
        dev_ix = [ix for ix in range(len(sorted_pl)) if ix not in sample_ix]
    dev_playlists += [(u, sorted_pl[ix]) for ix in dev_ix]
    train_playlists += [(u, sorted_pl[ix]) for ix in range(len(sorted_pl)) if ix not in dev_ix]



In [41]:

    
xmax = np.max([len(pl) for pl in playlists_subset]) + 1

Histogram of playlist length in training set.



In [42]:

    
ax = plt.subplot(111)
ax.hist([len(t[1]) for t in train_playlists], bins=50)
ax.set_yscale('log')
ax.set_xlim(0, xmax)
print(len(train_playlists))

Histogram of playlist length in training set.



In [43]:

    
ax = plt.subplot(111)
ax.hist([len(t[1]) for t in dev_playlists], bins=50)
ax.set_yscale('log')
ax.set_xlim(0, xmax)
print(len(dev_playlists))

Split songs for setting II

Split songs such that

the number unknown entries is the same as that in setting I.
the distributions of song popularity (the number of occurrence in playlists) in training and dev set are similiar.



In [44]:

    
num_unknown = len(playlists_subset) * len(dev_song_set)  # the number of unknown in setting I
num_dev_song2 = num_unknown / len(dev_playlists)



In [45]:

    
#np.arange(0, 100, 2.5)



In [46]:

    
sortix = np.argsort(song_pop)
step = len(song_pop) / (num_dev_song2)
split_ix = np.arange(0, len(song_pop), step)
np.random.seed(123456789)
rounding_prob = step - int(step)
dev_ix = [sortix[ix] for ix in [int(x) if np.random.rand() < rounding_prob or int(x) == len(sortix)-1 \
                                else int(x)+1 for x in split_ix]]  # avoid index out of bounds
dev_song_set2 = [song_set[ix] for ix in dev_ix]



In [47]:

    
print(num_unknown, len(dev_playlists) * len(dev_song_set2))









    



17064 17101

Histogram of song popularity in training set (all songs).



In [48]:

    
ax = plt.subplot(111)
ax.hist(song_pop, bins=30)
ax.set_yscale('log')
ax.set_xlim(0, song_pop.max()+1)
print(len(song_set))

Histogram of song popularity in dev set.



In [49]:

    
dev_song_pop2 = [song_pop[songind[sid]] for sid in dev_song_set2]
ax = plt.subplot(111)
ax.hist(dev_song_pop2, bins=30)
ax.set_yscale('log')
ax.set_xlim(0, song_pop.max()+1)
print(len(dev_song_set2))

Load song features

Load song_id --> feature array mapping: map a song to the audio features of one of its corresponding tracks in MSD.



In [50]:

    
song2Features = pkl.load(gzip.open(ffeature, 'rb'))

Load genres



In [51]:

    
song2genre = pkl.load(open(fgenre, 'rb'))

Check if all songs have genre info.



In [52]:

    
np.all([sid in song2genre for sid in song_set])









    Out[52]:





False

Create song-playlist matrix

Songs as rows, playlists as columns.



In [53]:

    
def gen_dataset_subset(playlists, song_set, features_MSD, song2genre):
    """
    Create labelled dataset: rows are songs, columns are users.
    
    Input:
        - playlists: a set of playlists
        - song_set: a set of songIDs
        - features_MSD: dictionary that maps songIDs to features from MSD
        - song2genre: dictionary that maps songIDs to genre
    Output:
        - (Feature, Label) pair (X, Y)
          X: #songs by #features
          Y: #songs by #users
    """ 
    song_indices = {sid: ix for ix, sid in enumerate(song_set)}
    N = len(song_set)
    K = len(playlists)
    
    genre_set = sorted({v for v in song2genre.values()})
    genre_indices = {genre: ix for ix, genre in enumerate(genre_set)}
    
    def onehot_genre(songID):
        """
        One-hot encoding of genres.
        Data imputation: one extra entry for songs without genre info.
        Should try other options: 
            mean imputation, sampling from the distribution of genre popularity.
        """
        num = len(genre_set) + 1
        vec = np.zeros(num, dtype=np.float)
        if songID in song2genre:
            genre_ix = genre_indices[song2genre[songID]]
            vec[genre_ix] = 1
        else:
            vec[-1] = 1
        return vec
    
    #X = np.array([features_MSD[sid] for sid in song_set])  # without using genre
    X = np.array([np.concatenate([features_MSD[sid], onehot_genre(sid)], axis=-1) for sid in song_set])
    Y = np.zeros((N, K), dtype=np.bool)
    
    for k in range(K):
        pl = playlists[k]
        indices = [song_indices[sid] for sid in pl if sid in song_indices]
        Y[indices, k] = True

    return X, Y



In [54]:

    
def mean_normalised_reciprocal_rank(Y_true, Y_pred):
    """
    Compute the mean of normalised reciprocal rank (reciprocal rank are normalised by the best possible ranks)
    """
    normalised_reci_rank = []
    npos = np.sum(Y_true, axis=0)
    for k in range(Y_true.shape[1]):
        ranks = calc_rank(Y_pred[:, k])[Y_true[:, k]]
        if len(ranks) > 0:
            ideal = np.sum([1./nk for nk in range(1, npos[k]+1)])
            real = np.sum([1./r for r in ranks])
            normalised_reci_rank.append(real / ideal)  # normalise the reciprocal ranks by the best possible ranks
    return np.mean(normalised_reci_rank)



In [447]:

    
def eval_pl(Y_true, Y_pred):
    nzcol = np.nonzero(np.sum(Y_true, axis=0))[0]  # columns with at least one True
    print('Average over %d columns' % len(nzcol))
    print('%-15s %.4f' % ('Mean P@K:', np.mean(calc_precisionK(Y_true.T, Y_pred.T))))
    print('%-15s %.4f' % ('Mean AUC:', roc_auc_score(Y_true[:, nzcol], Y_pred[:, nzcol], average='macro')))
    print('%-15s %.4f' % ('MAP:', average_precision_score(Y_true[:, nzcol], Y_pred[:, nzcol], average='macro')))
    print('%-15s %.4f' % ('Mean NRR:', mean_normalised_reciprocal_rank(Y_true, Y_pred)))

Setting I: hold a subset of songs, use all playlists



In [437]:

    
playlists1 = [t[1] for t in train_playlists + dev_playlists]



In [438]:

    
X_train, Y_train = gen_dataset_subset(playlists=playlists1, song_set=train_song_set, 
                                      features_MSD=song2Features, song2genre=song2genre)



In [439]:

    
X_dev, Y_dev = gen_dataset_subset(playlists=playlists1, song_set=dev_song_set, 
                                  features_MSD=song2Features, song2genre=song2genre)

Feature normalisation.



In [440]:

    
X_train_mean = np.mean(X_train, axis=0).reshape((1, -1))
X_train_std = np.std(X_train, axis=0).reshape((1, -1)) + 10 ** (-6)
X_train -= X_train_mean
X_train /= X_train_std
X_dev   -= X_train_mean
X_dev   /= X_train_std



In [441]:

    
print('Train: %15s %15s' % (X_train.shape, Y_train.shape))
print('Dev  : %15s %15s' % (X_dev.shape,   Y_dev.shape))









    



Train:      (628, 218)      (628, 108)
Dev  :      (158, 218)      (158, 108)



In [442]:

    
print(np.mean(np.mean(X_train, axis=0)))
print(np.mean( np.std(X_train, axis=0)) - 1)
print(np.mean(np.mean(X_dev, axis=0)))
print(np.mean( np.std(X_dev, axis=0)) - 1)









    



-7.44259713994e-17
-0.0459013902717
-0.0302534229084
0.0366010807539



In [443]:

    
#np.sum(Y_train, axis=0)



In [444]:

    
#np.sum(Y_dev, axis=0)

M1. BR - Independent logistic regression



In [445]:

    
br = BinaryRelevance(C=1, n_jobs=4)
br.fit(X_train, Y_train)

Evaluation: normalise per playlist.



In [448]:

    
print('Dev set:')
eval_pl(Y_dev, br.predict(X_dev))









    



Dev set:
Average over 93 columns
Mean P@K:       0.1718
Mean AUC:       0.5594
MAP:            0.0652
Mean NRR:       0.0791



In [449]:

    
print('Training set:')
eval_pl(Y_train, br.predict(X_train))









    



Training set:
Average over 108 columns
Mean P@K:       0.9965
Mean AUC:       1.0000
MAP:            0.9993
Mean NRR:       0.9998



In [450]:

    
#print('Mean macro-F1:', f1_score_nowarn(Y_dev.T, Y_br.T>=0, average='macro'))



In [451]:

    
#print('P, R, F1:',precision_recall_fscore_support(Y_dev.ravel(), (Y_br>=0).ravel(),average='binary', warn_for=()))

P@K, MAP, NDCG, AUC are good options.



In [452]:

    
%%script false
min_pos_score = []
for col in range(Y_dev.shape[1]):
    val = Y_br[:,col][Y_dev[:,col]]
    if len(val) > 0:
        min_pos_score.append(np.min(val))
    else:
        min_pos_score.append(np.nan)
print(np.array(min_pos_score))



In [453]:

    
%%script false
max_neg_score = []
for col in range(Y_dev.shape[1]):
    val = Y_br[:,col][np.logical_not(Y_dev[:,col])]
    if len(val) > 0:
        max_neg_score.append(np.max(val))
print(np.array(max_neg_score))



In [454]:

    
#print(np.array(min_pos_score)-np.array(max_neg_score))

M2. PC - Multilabel p-classification

P-Classification ~ P-norm push ranking.



In [455]:

    
pc = PClassificationMLC(C1=1, weighting='labels')
pc.fit(X_train, Y_train)









    



C: 1, 1, 1, p: 1, weighting: samples

Evaluation: normalise per playlist.



In [456]:

    
print('Dev set:')
eval_pl(Y_dev, pc.predict(X_dev))









    



Dev set:
Average over 93 columns
Mean P@K:       0.1731
Mean AUC:       0.6712
MAP:            0.0881
Mean NRR:       0.1018



In [457]:

    
print('Training set:')
eval_pl(Y_train, pc.predict(X_train))









    



Training set:
Average over 108 columns
Mean P@K:       0.2931
Mean AUC:       0.9269
MAP:            0.3109
Mean NRR:       0.4544



In [458]:

    
min_pos_score = []
for col in range(Y_test.shape[1]):
    val = Y_pc[:,col][Y_test[:,col]]
    if len(val) > 0:
        min_pos_score.append(np.min(val))
    else:
        min_pos_score.append(np.nan)
#plt.hist((np.array(min_pos_score)))
#plt.hist((np.nan_to_num(min_pos_score)), bins=30)
#print(np.array(min_pos_score))
#print()



In [459]:

    
max_neg_score = []
for col in range(Y_test.shape[1]):
    val = Y_pc[:,col][np.logical_not(Y_test[:,col])]
    if len(val) > 0:
        max_neg_score.append(np.max(val))
#plt.hist(np.array(max_neg_score), bins=30)
#print()



In [460]:

    
#plt.hist(np.nan_to_num(min_pos_score)-np.array(max_neg_score), bins=30)
#print()

Setting II: hold a subset of playlists, use all songs



In [538]:

    
nondev_song_set2 = sorted({sid for sid in song_set if sid not in dev_song_set2})
print(len(nondev_song_set2) + len(dev_song_set2), len(song_set))



In [539]:

    
playlists2 = [t[1] for t in train_playlists + dev_playlists]



In [542]:

    
X, Y = gen_dataset_subset(playlists=playlists2, song_set=nondev_song_set2 + dev_song_set2, 
                          features_MSD=song2Features, song2genre=song2genre)



In [543]:

    
Y_train = Y.copy().astype(np.float)  # note: np.nan is float
Y_train[len(nondev_song_set2):, len(train_playlists):] = np.nan
Y_dev = Y[-len(dev_song_set2):, len(train_playlists):]



In [544]:

    
#Y_train



In [545]:

    
print(np.sum(np.isnan(Y_train)), len(dev_playlists) * len(dev_song_set2))









    



17101 17101



In [546]:

    
X_train = X

Feature normalisation.



In [547]:

    
X_train_mean = np.mean(X_train, axis=0).reshape((1, -1))
X_train_std = np.std(X_train, axis=0).reshape((1, -1)) + 10 ** (-6)
X_train -= X_train_mean
X_train /= X_train_std
X_dev = X_train[len(nondev_song_set2):]



In [548]:

    
print('Train: %15s %15s' % (X_train.shape, Y_train.shape))
print('Dev  : %15s %15s' % (X_dev.shape,   Y_dev.shape))









    



Train:      (786, 218)      (786, 108)
Dev  :      (349, 218)       (349, 49)



In [549]:

    
print(np.mean(np.mean(X_train, axis=0)))
print(np.mean( np.std(X_train, axis=0)) - 1)
print(np.mean(np.mean(X_dev, axis=0)))
print(np.mean( np.std(X_dev, axis=0)) - 1)









    



3.43710489009e-17
-0.0458900154908
0.0029949467376
-0.060802933072



In [550]:

    
#np.sum(Y_train, axis=0)



In [551]:

    
#np.sum(Y_dev, axis=0)

M3. Independent logistic regression



In [552]:

    
br2 = BinaryRelevance(C=1, n_jobs=4)
br2.fit(X_train, np.nan_to_num(Y_train))









    



/home/dawei/apps/miniconda3/lib/python3.6/site-packages/sklearn/multiclass.py:76: UserWarning: Label not 60 is present in all training examples.
  str(classes[c]))



In [553]:

    
col_start = -len(dev_playlists)

Evaluation: normalise per playlist.



In [554]:

    
print('Dev set:')
eval_pl(Y_dev, br2.predict(X_dev)[:, col_start:])









    



Dev set:
Average over 49 columns
Mean P@K:       0.0560
Mean AUC:       0.5824
MAP:            0.0543
Mean NRR:       0.0740



In [555]:

    
Y_train_gt = np.nan_to_num(Y_train).astype(np.bool)
Y_train_pred = br2.predict(X_train)
Y_train_pred[:, col_start:] = 0   # remove test region
print('Training set:')
eval_pl(Y_train_gt, Y_train_pred)









    



Training set:
Average over 107 columns
Mean P@K:       0.9914
Mean AUC:       0.7756
MAP:            0.5527
Mean NRR:       0.5533

M4. Multilabel p-classification with some playlist fully observed



In [556]:

    
user_of_playlists2 = [t[0] for t in train_playlists + dev_playlists]
#user_of_playlists2



In [557]:

    
same_user_mat = np.zeros((len(playlists2), len(playlists2)), dtype=np.bool)
for i in range(len(playlists2)):
    for j in range(i+1, len(playlists2)):
        if user_of_playlists2[i] == user_of_playlists2[j]:
            same_user_mat[i, j] = True
            same_user_mat[j, i] = True



In [558]:

    
#same_user_mat



In [559]:

    
pla = PClassificationMLC(C1=1, C2=1, C3=10, weighting='both', similarMat=same_user_mat)
pla.fit(X_train, Y_train)









    



C: 1, 1, 10, p: 1, weighting: both



In [560]:

    
col_start = -len(dev_playlists)

Evaluation: normalise per playlist.



In [561]:

    
eval_pl(Y_dev, pla.predict(X_dev)[:, col_start:])









    



Average over 49 columns
Mean P@K:       0.0406
Mean AUC:       0.6130
MAP:            0.0527
Mean NRR:       0.0731



In [562]:

    
Y_train_gt = np.nan_to_num(Y_train).astype(np.bool)
Y_train_pred = pla.predict(X_train)
Y_train_pred[:, col_start:] = 0   # remove test region
eval_pl(Y_train_gt, Y_train_pred)









    



Average over 107 columns
Mean P@K:       0.5214
Mean AUC:       0.6775
MAP:            0.0718
Mean NRR:       0.1203

Check the if the regulariser is effective



In [563]:

    
%%script false
rows, cols = np.nonzero(same_user_mat)
for row, col in zip(rows, cols):
    diff = pla.W[row] - pla.W[col]
    print('%g' % np.sqrt(np.dot(pla.W[row], pla.W[row])))
    print('%g' % np.sqrt(np.dot(pla.W[col], pla.W[col])))
    print('%g' % np.sqrt(np.dot(diff, diff)))
    print('------------------------------')

Compute matrix $M$ such that $M_{jk} = \sqrt{(w_j - w_k)^\top (w_j - w_k)}, \forall j, k$.



In [564]:

    
A = np.dot(pla.W, pla.W.T)
B = np.tile(np.diag(A), (A.shape[0], 1))
M = np.sqrt(-2 * A + (B + B.T))

Normalise $M$ by the vector with maximum norm in $W$.



In [565]:

    
#aa = np.arange(6).reshape(3, 2)
#np.einsum('ij,ij->i', aa, aa)



In [566]:

    
denorm = np.sqrt(np.einsum('ij,ij->i', pla.W, pla.W))  # compute the norm for each row in W



In [567]:

    
M1 = M / np.max(denorm)



In [568]:

    
plt.matshow(M1)









    Out[568]:





<matplotlib.image.AxesImage at 0x7f66f8ee7f28>



In [569]:

    
user_of_playlists2









    Out[569]:





[(914763600.0, 'Grey'),
 (1050674400.0, 'World Class Chris'),
 (1076418000.0, 'Garick'),
 (1076418000.0, 'Tom Jones'),
 (1078923600.0, 'Kurt87'),
 (1115992800.0, 'm12788'),
 (1190728800.0, 'EvaninCambridge'),
 (1058364000.0, 'Meg Usta'),
 (1092578400.0, 'XxPunk4PeacexX'),
 (1049205600.0, 'hobbitcore'),
 (1049205600.0, 'hobbitcore'),
 (1008421200.0, 'Chris Bowron'),
 (1008421200.0, 'Chris Bowron'),
 (1079787600.0, 'tragic3rdwoman'),
 (1079787600.0, 'tragic3rdwoman'),
 (1079787600.0, 'tragic3rdwoman'),
 (1069246800.0, 'Axelsson'),
 (1069246800.0, 'Axelsson'),
 (1069246800.0, 'Axelsson'),
 (1069246800.0, 'Axelsson'),
 (1012482000.0, 'lynn*'),
 (1012482000.0, 'lynn*'),
 (1012482000.0, 'lynn*'),
 (1012482000.0, 'lynn*'),
 (1012482000.0, 'lynn*'),
 (1012482000.0, 'lynn*'),
 (1012482000.0, 'lynn*'),
 (1012482000.0, 'lynn*'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1058364000.0, 'Meg Usta'),
 (1092578400.0, 'XxPunk4PeacexX'),
 (1049205600.0, 'hobbitcore'),
 (1008421200.0, 'Chris Bowron'),
 (1008421200.0, 'Chris Bowron'),
 (1079787600.0, 'tragic3rdwoman'),
 (1079787600.0, 'tragic3rdwoman'),
 (1069246800.0, 'Axelsson'),
 (1069246800.0, 'Axelsson'),
 (1069246800.0, 'Axelsson'),
 (1069246800.0, 'Axelsson'),
 (1012482000.0, 'lynn*'),
 (1012482000.0, 'lynn*'),
 (1012482000.0, 'lynn*'),
 (1012482000.0, 'lynn*'),
 (1012482000.0, 'lynn*'),
 (1012482000.0, 'lynn*'),
 (1012482000.0, 'lynn*'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød'),
 (1009198800.0, 'jarød')]



In [530]:

    
rows, cols = np.nonzero(same_user_mat)
M2 = M1[rows, cols]
print(np.min(M2), np.max(M2), np.mean(M2), np.std(M2))









    



0.036170196335 1.21916828901 0.0910317432502 0.06538921066



In [531]:

    
mat = same_user_mat.copy()
np.fill_diagonal(mat, 1)   # remove the diagnoal from consideration
rows, cols = np.where(mat == 0)
M3 = M1[rows, cols]
print(np.min(M3), np.max(M3), np.mean(M3), np.std(M3))









    



0.108495434792 1.38378472385 0.439785901404 0.190869133289

Check performance per user



In [570]:

    
user_set = sorted(set(user_of_playlists2))
user_set









    Out[570]:





[(914763600.0, 'Grey'),
 (1008421200.0, 'Chris Bowron'),
 (1009198800.0, 'jarød'),
 (1012482000.0, 'lynn*'),
 (1049205600.0, 'hobbitcore'),
 (1050674400.0, 'World Class Chris'),
 (1058364000.0, 'Meg Usta'),
 (1069246800.0, 'Axelsson'),
 (1076418000.0, 'Garick'),
 (1076418000.0, 'Tom Jones'),
 (1078923600.0, 'Kurt87'),
 (1079787600.0, 'tragic3rdwoman'),
 (1092578400.0, 'XxPunk4PeacexX'),
 (1115992800.0, 'm12788'),
 (1190728800.0, 'EvaninCambridge')]



In [571]:

    
#user_of_playlists2



In [572]:

    
Y_pla = pla.predict(X_dev)[:, col_start:]
dev_col_start = len(train_playlists)
for u in user_set:
    uind = np.where(np.array(user_of_playlists2, dtype=np.object) == u)[0]
    ntrain = len(uind)
    if len(uind) < 2: continue  # filtering out users with less than 2 playlists
    uind -= dev_col_start
    uind = uind[uind >= 0]
    ntest = len(uind)
    #print(uind)
    if len(uind) < 1: continue
    print('--------------------')
    print('USER:', u)
    print('#train: %d, #test: %d' % (ntrain, ntest))
    eval_pl(Y_dev[:, uind], Y_pla[:, uind])
    print()









    



--------------------
USER: (1008421200.0, 'Chris Bowron')
#train: 4, #test: 2
Average over 2 columns
Mean P@K:       0.0000
Mean AUC:       0.6950
MAP:            0.0311
Mean NRR:       0.0280

--------------------
USER: (1009198800.0, 'jarød')
#train: 62, #test: 31
Average over 31 columns
Mean P@K:       0.0266
Mean AUC:       0.6137
MAP:            0.0449
Mean NRR:       0.0659

--------------------
USER: (1012482000.0, 'lynn*')
#train: 15, #test: 7
Average over 7 columns
Mean P@K:       0.0238
Mean AUC:       0.5806
MAP:            0.0276
Mean NRR:       0.0394

--------------------
USER: (1049205600.0, 'hobbitcore')
#train: 3, #test: 1
Average over 1 columns
Mean P@K:       0.5000
Mean AUC:       0.8357
MAP:            0.5086
Mean NRR:       0.6724

--------------------
USER: (1058364000.0, 'Meg Usta')
#train: 2, #test: 1
Average over 1 columns
Mean P@K:       0.0000
Mean AUC:       0.0144
MAP:            0.0029
Mean NRR:       0.0029

--------------------
USER: (1069246800.0, 'Axelsson')
#train: 8, #test: 4
Average over 4 columns
Mean P@K:       0.0000
Mean AUC:       0.6646
MAP:            0.0376
Mean NRR:       0.0442

--------------------
USER: (1079787600.0, 'tragic3rdwoman')
#train: 5, #test: 2
Average over 2 columns
Mean P@K:       0.2500
Mean AUC:       0.7179
MAP:            0.1160
Mean NRR:       0.1497

--------------------
USER: (1092578400.0, 'XxPunk4PeacexX')
#train: 2, #test: 1
Average over 1 columns
Mean P@K:       0.0000
Mean AUC:       0.6161
MAP:            0.0427
Mean NRR:       0.0533

M4. Multilabel p-classification with unknows in test set



In [ ]:

    
N, K = Y.shape



In [ ]:

    
Y_nan = Y.copy().astype(np.float)
np.random.seed(8967321)
rand_num = int(0.2 * N)
ones = 0
for k in range(K):
    randix = np.random.permutation(np.arange(N))[:rand_num]
    Y_nan[randix, k] = np.nan
    ones += Y[randix, k].sum()

The number of NaN entries.



In [ ]:

    
np.sum(np.isnan(Y_nan))

Train: keep running util no overflow warning occurred.



In [ ]:

    
pc2 = PClassificationMLC(weighting=True, verticalWeighting=True)
pc2.fit(X, Y_nan)

Prediction: use the minimum of positive entry score of the same example as threshold.
Evaluation: use F1 on all unknown entries (as a 1D array).



In [ ]:

    
Y_pred2 = pc2.predict(X)



In [ ]:

    
pos_index = np.nan_to_num(Y_nan).astype(np.bool)
nan_index = np.isnan(Y_nan)



In [ ]:

    
ground_truths = Y[nan_index]



In [ ]:

    
thresholds = []
preds = []
for k in range(K):
    val = Y_pred2[:, k][pos_index[:, k]]
    th = np.min(val)
    thresholds.append(th)
    preds += (Y_pred2[nan_index[:,k], k] >= th).tolist()



In [ ]:

    
f1_score_nowarn(ground_truths, preds, average='binary')



In [ ]:

    
precision_recall_fscore_support(ground_truths, preds, average='binary', warn_for=None)

M5. Multilabel p-classification w/ (some playlist fully observed) & (unknowns in test set)



In [ ]:

    
Y_nan_part2 = Y.copy().astype(np.float)
np.random.seed(8967321)
rand_num = int(0.4 * N)
ones = 0
for k in range(int(K/2), K):
    randix = np.random.permutation(np.arange(N))[:rand_num]
    Y_nan_part2[randix, k] = np.nan
    ones += Y[randix, k].sum()

The number of NaN entries.



In [ ]:

    
np.sum(np.isnan(Y_nan_part2))



In [ ]:

    
pc4 = PClassificationMLC(weighting=True, verticalWeighting=True)
pc4.fit(X, Y_nan_part2)

Prediction: use the minimum of positive entry score of the same example as threshold.
Evaluation: use F1 on all unknown entries (as a 1D array).



In [ ]:

    
Y_pred4 = pc4.predict(X)



In [ ]:

    
pos_index = np.nan_to_num(Y_nan_part2).astype(np.bool)
nan_index = np.isnan(Y_nan_part2)



In [ ]:

    
ground_truths = Y[nan_index]



In [ ]:

    
thresholds = []
preds = []
for k in range(int(K/2), K):
    val = Y_pred4[:, k][pos_index[:, k]]
    th = np.min(val)
    #th = np.mean(val)
    thresholds.append(th)
    preds += (Y_pred4[nan_index[:,k], k] >= th).tolist()



In [ ]:

    
f1_score_nowarn(ground_truths, preds, average='binary')



In [ ]:

    
precision_recall_fscore_support(ground_truths, preds, average='binary', warn_for=None)



In [ ]:

    
2 * np.sum(np.logical_and(ground_truths, preds)) / (np.sum(ground_truths) + np.sum(preds))