A representative subset of AotM-2011 Playlists with MSD Audio Features



In [1]:

    
%matplotlib inline
%load_ext autoreload
%autoreload 2

import os, sys
import gzip
import pickle as pkl
import numpy as np
import pandas as pd
from scipy.optimize import check_grad
from scipy.sparse import lil_matrix, issparse
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns



In [2]:

    
sys.path.append('src')
from models import BinaryRelevance
from models import PCMLC, objective
from tools import calc_precisionK, calc_rank, f1_score_nowarn, evalPred, calc_RPrecision_HitRate



In [3]:

    
data_dir = 'data/aotm2011'
fplaylist = os.path.join(data_dir, 'aotm2011-playlist.pkl.gz')
ffeature = 'data/msd/song2feature.pkl.gz'
fgenre = 'data/msd/song2genre.pkl.gz'
TOPs = [5, 10, 20, 30, 50, 100]#, 200, 300, 500#, 1000]

Load playlists

Load playlists.



In [4]:

    
all_playlists = pkl.load(gzip.open(fplaylist, 'rb'))



In [5]:

    
all_users = sorted(set({user for _, user in all_playlists}))



In [6]:

    
print('#user    :', len(all_users))
print('#playlist:', len(all_playlists))









    



#user    : 14182
#playlist: 84710



In [7]:

    
pl_lengths = [len(pl) for pl, _ in all_playlists]
#plt.hist(pl_lengths, bins=100)
print('Average playlist length: %.1f' % np.mean(pl_lengths))









    



Average playlist length: 10.1

Load song features

Load song_id --> feature array mapping: map a song to the audio features of one of its corresponding tracks in MSD.



In [8]:

    
song2feature = pkl.load(gzip.open(ffeature, 'rb'))



In [9]:

    
#mean_age = (np.sum(song_ages) - np.sum(song_ages[missing_ix])) / (len(song_ages) - len(missing_ix))
#mean_age

Subset of data

The user whose playlists cover a proper number of playlists, e.g. 50.



In [10]:

    
user_playlists = dict()
for pl, u in all_playlists:
    try: 
        user_playlists[u].append(pl)
    except KeyError:
        user_playlists[u] = [pl]



In [11]:

    
u_npl = sorted([(u, len(user_playlists[u])) for u in all_users], key=lambda x: x[1])



In [12]:

    
#u_npl



In [13]:

    
step = 1000  # sample 0.1%
subset = [u_npl[ix] for ix in np.arange(0, len(u_npl), step)]



In [14]:

    
subset









    Out[14]:





[((914763600.0, 'Grey'), 1),
 ((1050674400.0, 'World Class Chris'), 1),
 ((1076418000.0, 'Garick'), 1),
 ((1076418000.0, 'Tom Jones'), 1),
 ((1078923600.0, 'Kurt87'), 1),
 ((1115992800.0, 'm12788'), 1),
 ((1190728800.0, 'EvaninCambridge'), 1),
 ((1058364000.0, 'Meg Usta'), 2),
 ((1092578400.0, 'XxPunk4PeacexX'), 2),
 ((1049205600.0, 'hobbitcore'), 3),
 ((1008421200.0, 'Chris Bowron'), 4),
 ((1079787600.0, 'tragic3rdwoman'), 5),
 ((1069246800.0, 'Axelsson'), 8),
 ((1012482000.0, 'lynn*'), 15),
 ((1009198800.0, 'jarød'), 62)]



In [15]:

    
len(subset)









    Out[15]:





15



In [16]:

    
uid_subset = [t[0] for t in subset]



In [17]:

    
#udf[uid_subset]  # tuple are used as multiindex in pandas
#udf[[uid_subset]]



In [18]:

    
playlists_subset = [(pl, u) for u in uid_subset for pl in user_playlists[u]]



In [19]:

    
len(playlists_subset)









    Out[19]:





108



In [20]:

    
song_set = sorted([(sid, song2feature[sid][-1]) for sid in {sid for pl, _ in playlists_subset for sid in pl}],
                  key=lambda x: (x[1], x[0]))



In [21]:

    
print(len(song_set))
#song_set

Split songs for setting I

Split songs (80/20 split) the latest released (year) songs are in dev set ~~such that the distributions of song popularity (the number of occurrence in playlists) in training and dev set are similiar~~.



In [22]:

    
dev_nsongs = int(len(song_set) * 0.2)
dev_song_set = song_set[:dev_nsongs]
train_song_set = song_set[dev_nsongs:]
print('#songs in training set:', len(train_song_set))
print('#songs in test set    :', len(dev_song_set))









    



#songs in training set: 629
#songs in test set    : 157



In [23]:

    
#dev_song_set
#train_song_set



In [24]:

    
song2index = {sid: ix for ix, (sid, _) in enumerate(song_set)}
song_pl_mat = lil_matrix((len(song_set), len(playlists_subset)), dtype=np.int8)
for j in range(len(playlists_subset)):
    pl = playlists_subset[j][0]
    ind = [song2index[sid] for sid in pl]
    song_pl_mat[ind, j] = 1



In [25]:

    
song_pop = np.sum(song_pl_mat, axis=1)



In [26]:

    
song2pop = {sid: song_pop[song2index[sid], 0] for (sid, _) in song_set}

Histogram of song popularity in training set.



In [27]:

    
train_song_pop = [song2pop[sid] for (sid, _) in train_song_set]
ax = plt.subplot(111)
ax.hist(train_song_pop, bins=30)
ax.set_yscale('log')
ax.set_xlim(0, song_pop.max()+1)
print(len(train_song_set))

Histogram of song popularity in dev set.



In [28]:

    
dev_song_pop = [song2pop[sid] for (sid, _) in dev_song_set]
ax = plt.subplot(111)
ax.hist(dev_song_pop, bins=30)
ax.set_yscale('log')
ax.set_xlim(0, song_pop.max()+1)
print(len(dev_song_set))

Split playlists

Split playlists (80/20 split) uniformly at random ~~such that the distributions of playlist length (the number of songs in playlists) for each user in training and dev set are similiar~~.



In [29]:

    
train_playlists = []
dev_playlists = []



In [30]:

    
dev_ratio = 0.2
np.random.seed(987654321)
for u in uid_subset:
    playlists_u = [(pl, u) for pl in user_playlists[u]]
    if len(user_playlists[u]) < 5:
        train_playlists += playlists_u
    else:
        npl_dev  = int(dev_ratio * len(user_playlists[u]))
        pl_indices = np.random.permutation(len(user_playlists[u]))
        dev_playlists  += playlists_u[:npl_dev]
        train_playlists += playlists_u[npl_dev:]



In [31]:

    
print(len(train_playlists), len(dev_playlists))



In [32]:

    
xmax = np.max([len(pl) for pl, _ in playlists_subset]) + 1

Histogram of playlist length in training set.



In [33]:

    
ax = plt.subplot(111)
ax.hist([len(pl) for pl, _ in train_playlists], bins=50)
ax.set_yscale('log')
ax.set_xlim(0, xmax)
print(len(train_playlists))

Histogram of playlist length in training set.



In [34]:

    
ax = plt.subplot(111)
ax.hist([len(pl) for pl, _ in dev_playlists], bins=50)
ax.set_yscale('log')
ax.set_xlim(0, xmax)
print(len(dev_playlists))

Hold part of songs in the dev set of playlists

Hold the last half of songs for each playlist in dev set.



In [35]:

    
dev_playlists_obs = [pl[:-int(len(pl)/2)] for (pl, _) in dev_playlists]
dev_playlists_held = [pl[-int(len(pl)/2):] for (pl, _) in dev_playlists]



In [36]:

    
for i in range(len(dev_playlists)):
    assert np.all(dev_playlists[i][0] == dev_playlists_obs[i] + dev_playlists_held[i])



In [37]:

    
print('obs: %d, held: %d' % (np.sum([len(ppl) for ppl in dev_playlists_obs]), 
                             np.sum([len(ppl) for ppl in dev_playlists_held])))









    



obs: 99, held: 90

Load genres



In [38]:

    
song2genre = pkl.load(gzip.open(fgenre, 'rb'))

Check if all songs have genre info.



In [39]:

    
np.sum([sid in song2genre for sid, _ in song_set])









    Out[39]:





577

Create song-playlist matrix

Songs as rows, playlists as columns.



In [40]:

    
def gen_dataset(playlists, song2feature, song2genre, train_song_set, 
                dev_song_set=[], test_song_set=[], song2pop_train=None):
    """
    Create labelled dataset: rows are songs, columns are users.
    
    Input:
        - playlists: a set of playlists
        - train_song_set: a list of songIDs in training set
        - dev_song_set: a list of songIDs in dev set
        - test_song_set: a list of songIDs in test set
        - song2feature: dictionary that maps songIDs to features from MSD
        - song2genre: dictionary that maps songIDs to genre
        - song2pop_train: a dictionary that maps songIDs to its popularity
    Output:
        - (Feature, Label) pair (X, Y)
          X: #songs by #features
          Y: #songs by #users
    """ 
    song_set = train_song_set + dev_song_set + test_song_set
    N = len(song_set)
    K = len(playlists)
    
    genre_set = sorted({v for v in song2genre.values()})
    genre2index = {genre: ix for ix, genre in enumerate(genre_set)}
    
    def onehot_genre(songID):
        """
        One-hot encoding of genres.
        Data imputation: 
            - one extra entry for songs without genre info
            - mean imputation
            - sampling from the distribution of genre popularity
        """
        num = len(genre_set) # + 1
        vec = np.zeros(num, dtype=np.float)
        if songID in song2genre:
            genre_ix = genre2index[song2genre[songID]]
            vec[genre_ix] = 1
        else:
            vec[:] = np.nan
            #vec[-1] = 1
        return vec
    
    #X = np.array([features_MSD[sid] for sid in song_set])  # without using genre
    #Y = np.zeros((N, K), dtype=np.bool)
    X = np.array([np.concatenate([song2feature[sid], onehot_genre(sid)], axis=-1) for sid in song_set])
    Y = lil_matrix((N, K), dtype=np.bool)
    
    song2index = {sid: ix for ix, sid in enumerate(song_set)}
    for k in range(K):
        pl = playlists[k]
        indices = [song2index[sid] for sid in pl if sid in song2index]
        Y[indices, k] = True
        
    # genre imputation
    genre_ix_start = -len(genre_set)
    genre_nan = np.isnan(X[:, genre_ix_start:])
    genre_mean = np.nansum(X[:, genre_ix_start:], axis=0) / (X.shape[0] - np.sum(genre_nan, axis=0))
    #print(np.nansum(X[:, genre_ix_start:], axis=0))
    #print(genre_set)
    #print(genre_mean)
    for j in range(len(genre_set)):
        X[genre_nan[:,j], j+genre_ix_start] = genre_mean[j]
        
    # normalise the sum of all genres per song to 1
    # X[:, -len(genre_set):] /= X[:, -len(genre_set):].sum(axis=1).reshape(-1, 1)  
    # NOTE: this is not necessary, as the imputed values are guaranteed to be normalised (sum to 1) 
    # due to the above method to compute mean genres.
    
    # the log of song popularity
    if song2pop_train is not None:
        # for sid in song_set: 
        #     assert sid in song2pop_train  # trust the input
        logsongpop = np.log([song2pop_train[sid]+1 for sid in song_set])  # deal with 0 popularity
        X = np.hstack([X, logsongpop.reshape(-1, 1)])

    #return X, Y
    Y = Y.tocsr()
    
    train_ix = [song2index[sid] for sid in train_song_set]
    X_train = X[train_ix, :]
    Y_train = Y[train_ix, :]
    
    dev_ix = [song2index[sid] for sid in dev_song_set]
    X_dev = X[dev_ix, :]
    Y_dev = Y[dev_ix, :]
    
    test_ix = [song2index[sid] for sid in test_song_set]
    X_test = X[test_ix, :]
    Y_test = Y[test_ix, :]
    
    if len(dev_song_set) > 0:
        if len(test_song_set) > 0:
            return X_train, Y_train, X_dev, Y_dev, X_test, Y_test
        else:
            return X_train, Y_train, X_dev, Y_dev
    else:
        if len(test_song_set) > 0:
            return X_train, Y_train, X_test, Y_test
        else:
            return X_train, Y_train



In [41]:

    
def mean_normalised_reciprocal_rank(Y_true, Y_pred):
    """
    Compute the mean of normalised reciprocal rank (reciprocal rank are normalised by the best possible ranks)
    """
    normalised_reci_rank = []
    npos = np.sum(Y_true, axis=0)
    for k in range(Y_true.shape[1]):
        ranks = calc_rank(Y_pred[:, k])[Y_true[:, k]]
        if len(ranks) > 0:
            ideal = np.sum([1./nk for nk in range(1, npos[k]+1)])
            real = np.sum([1./r for r in ranks])
            normalised_reci_rank.append(real / ideal)  # normalise the reciprocal ranks by the best possible ranks
    return np.mean(normalised_reci_rank)



In [42]:

    
def calc_RP_HR(Y_true, Y_pred, useLoop=False, tops=[]):
    rps = []
    hitrates = {top: [] for top in tops} if len(tops) > 0 else None
    if useLoop is True:
        assert type(Y_true) == type(Y_pred) == list
        assert len(Y_true) == len(Y_pred)
        for j in range(len(Y_true)):
            y_true = np.asarray(Y_true[j]).reshape(-1)
            y_pred = np.asarray(Y_pred[j]).reshape(-1)
            npos = y_true.sum()
            if npos > 0:
                rp, hr_dict = calc_RPrecision_HitRate(y_true, y_pred, tops=tops)
                rps.append(rp)
                if len(tops) > 0:
                    for top in tops:
                        hitrates[top].append(hr_dict[top])
    else:
        assert Y_true.shape == Y_pred.shape
        for j in range(Y_true.shape[1]):
            y_true = Y_true[:, j]
            if issparse(y_true):
                y_true = y_true.toarray().reshape(-1)
            else:
                y_true = y_true.reshape(-1)
            y_pred = Y_pred[:, j].reshape(-1)
            npos = y_true.sum()
            if npos > 0:
                rp, hr_dict = calc_RPrecision_HitRate(y_true, y_pred, tops=tops)
                rps.append(rp)
                if len(tops) > 0:
                    for top in tops:
                        hitrates[top].append(hr_dict[top])
    return rps, hitrates

Build the adjacent matrix of playlists (nodes), playlists of the same user form a clique.



In [43]:

    
user_of_playlists = [u for (_, u) in train_playlists + dev_playlists]
clique_list = []
for u in sorted(set(user_of_playlists)):
    clique = np.where(u == np.array(user_of_playlists, dtype=np.object))[0]
    if len(clique) > 1:
        clique_list.append(clique)

Setting I: hold a subset of songs, use all playlists



In [44]:

    
X_train, Y_train, X_dev, Y_dev = gen_dataset(playlists = [pl for pl, _ in playlists_subset],
                                             song2feature = song2feature, song2genre = song2genre,
                                             train_song_set = [sid for sid, _ in train_song_set],
                                             dev_song_set   = [sid for sid, _ in   dev_song_set])

Feature normalisation.



In [45]:

    
X_train_mean = np.mean(X_train, axis=0).reshape((1, -1))
X_train_std = np.std(X_train, axis=0).reshape((1, -1)) + 10 ** (-6)
X_train -= X_train_mean
X_train /= X_train_std
X_dev   -= X_train_mean
X_dev   /= X_train_std



In [46]:

    
print('Train: %15s %15s' % (X_train.shape, Y_train.shape))
print('Dev  : %15s %15s' % (X_dev.shape,   Y_dev.shape))









    



Train:      (629, 217)      (629, 108)
Dev  :      (157, 217)      (157, 108)



In [47]:

    
print(np.mean(np.mean(X_train, axis=0)))
print(np.mean( np.std(X_train, axis=0)) - 1)
print(np.mean(np.mean(X_dev, axis=0)))
print(np.mean( np.std(X_dev, axis=0)) - 1)









    



-2.4134155916556296e-16
-0.04610157241305746
0.021083933678394208
-0.087645961790772

Check gradient.



In [48]:

    
%%script false
w0 = 0.001 * np.random.randn(Y_dev.shape[1] * X_dev.shape[1] + 1)
dw = np.zeros_like(w0)
cliques=clique_list
cliques=None
bs=100
loss='example'
check_grad(\
lambda w: objective(w, dw, X_dev, Y_dev, C1=2, C2=3, C3=5, p=3, cliques=cliques, loss_type=loss, batch_size=bs),
lambda w: dw, w0)

M1. BR - Independent logistic regression



In [49]:

    
br1 = BinaryRelevance(C=1, n_jobs=4)
br1.fit(X_train, Y_train)









    



/home/dawei/apps/miniconda3/lib/python3.6/site-packages/sklearn/multiclass.py:76: UserWarning: Label not 51 is present in all training examples.
  str(classes[c]))

Evaluation: normalise per playlist.



In [50]:

    
print('Dev set:')
rps_br1, hr_br1 = calc_RP_HR(Y_dev, br1.predict(X_dev), tops=TOPs)
print('R-Precision:', np.mean(rps_br1))
for top in TOPs:
    print(top, np.mean(hr_br1[top]))









    



Dev set:
R-Precision: 0.03804347826086957
5 0.0791062801932367
10 0.10954106280193236
20 0.19930986887508623
30 0.27486197377501725
50 0.4365079365079365
100 0.7285024154589372



In [51]:

    
print('Training set:')
rps_brtrn, hr_brtrn = calc_RP_HR(Y_train, br1.predict(X_train), tops=TOPs)
print('R-Precision:', np.mean(rps_brtrn))
for top in TOPs:
    print(top, np.mean(hr_brtrn[top]))









    



Training set:
R-Precision: 0.9966938925601964
5 0.6672889162660713
10 0.9193204523214914
20 0.9973908766226823
30 0.9986954383113411
50 1.0
100 1.0

M2. PC - Multilabel p-classification

P-Classification ~ P-norm push ranking.



In [52]:

    
mlr = PCMLC(C1=1, C2=1, C3=1, loss_type='example')
mlr.fit(X_train, Y_train, user_playlist_indices=clique_list)



In [53]:

    
print('Dev set:')
rps_mlr, hr_mlr = calc_RP_HR(Y_dev, mlr.predict(X_dev), tops=TOPs)
print('R-Precision:', np.mean(rps_mlr))
for top in TOPs:
    print(top, np.mean(hr_mlr[top]))









    



Dev set:
R-Precision: 0.048033126293995865
5 0.07624223602484471
10 0.15634920634920635
20 0.2783643892339544
30 0.37634575569358175
50 0.5648378191856452
100 0.8916494133885438



In [55]:

    
print('Training set:')
rps_mlrtrn, hr_mlrtrn = calc_RP_HR(Y_train, mlr.predict(X_train), tops=TOPs)
print('R-Precision:', np.mean(rps_mlrtrn))
for top in TOPs:
    print(top, np.mean(hr_mlrtrn[top]))









    



Training set:
R-Precision: 0.344103820245123
5 0.28874988934644014
10 0.3832373920607014
20 0.5022185896916681
30 0.5798116165671232
50 0.6784763105924777
100 0.802045607113128

Performance per user



In [56]:

    
%%script false
aucs = []
npl = []
for u in set(user_of_playlists2):
    uind = np.where(np.array(user_of_playlists2, dtype=np.object) == u)[0]
    uind_train = uind[uind < col_split]
    uind_test = uind[uind >= col_split]
    #print(uind)
    #if len(uind_test) < 1: continue
    print('--------------------')
    print('USER:', u)
    print('#train: %d, #test: %d' % (len(uind_train), len(uind_test)))
    #preds_test = [Y_pla[isnanmat[:, col], col] for col in uind_test]
    #gts_test = [Y[isnanmat[:, col], col] for col in uind_test]
    print('Dev set:')
    #auc, ncol = eval_pl(gts_test, preds_test, useLoop=True)
    #aucs.append(auc)
    #npl.append(ncol)
    print('Training set:')
    auc_trn, ncol_trn = eval_pl(Y[:, uind_train], Y_pla[:, uind_train])
    aucs.append(auc_trn)
    npl.append(ncol_trn)
    print()



In [57]:

    
%%script false
plt.plot(npl, aucs, 'b.')
plt.ylim(0.4, 1.0)
split = 'train'
plt.xlabel('#playlist in training set')
plt.ylabel('AUC (%s)' % split)
plt.title('New Song Recommednation Task: 1/#playlists reg. (%s)' % split)

Setting II: hold a subset of songs in a subset of playlists, use all songs



In [58]:

    
song2pop_train = song2pop.copy()
for ppl in dev_playlists_held:
    for sid in ppl:
        song2pop_train[sid] -= 1



In [59]:

    
X, Y = gen_dataset(playlists = [pl for pl, _ in train_playlists + dev_playlists],
                   song2feature = song2feature, song2genre = song2genre, 
                   train_song_set = [sid for sid, _ in song_set], song2pop_train=song2pop_train)



In [60]:

    
dev_cols = np.arange(len(train_playlists), Y.shape[1])
col_split = len(train_playlists)

Set all entries corresponding to playlists in dev set to NaN, except those songs in dev playlists that we observed.



In [61]:

    
Y_train = Y.copy().astype(np.float).toarray()  # note: np.nan is float
Y_train[:, dev_cols] = np.nan
song_indices = {sid: ix for ix, (sid, _) in enumerate(song_set)}
assert len(dev_cols) == len(dev_playlists) == len(dev_playlists_obs)
num_known = 0
for j in range(len(dev_cols)):
    rows = [song_indices[sid] for sid in dev_playlists_obs[j]]
    Y_train[rows, dev_cols[j]] = 1
    num_known += len(rows)



In [62]:

    
isnanmat = np.isnan(Y_train)



In [63]:

    
print('#unknown: {:,} | {:,}'.format(np.sum(isnanmat), len(dev_playlists) * len(song_set) - num_known))









    



#unknown: 13,263 | 13,263



In [64]:

    
print('#unknown in setting I: {:,}'.format(len(dev_song_set) * Y.shape[1]))









    



#unknown in setting I: 16,956



In [65]:

    
print(np.sum(isnanmat[:, :col_split]))  # number of NaN in training playlists, should be 0



In [66]:

    
X_train = X

Feature normalisation.



In [67]:

    
X_train_mean = np.mean(X_train, axis=0).reshape((1, -1))
X_train_std = np.std(X_train, axis=0).reshape((1, -1)) + 10 ** (-6)
X_train -= X_train_mean
X_train /= X_train_std



In [68]:

    
print('Train: %15s %15s' % (X_train.shape, Y_train.shape))









    



Train:      (786, 218)      (786, 108)



In [69]:

    
print(np.mean(np.mean(X_train, axis=0)))
print(np.mean( np.std(X_train, axis=0)) - 1)









    



1.1578348808516773e-17
-0.04589002449941504



In [70]:

    
len(dev_cols)









    Out[70]:





17

M3. Independent logistic regression



In [71]:

    
br2 = BinaryRelevance(C=1, n_jobs=4)
br2.fit(X_train, np.nan_to_num(Y_train))

Evaluation: normalise per playlist.



In [72]:

    
gts = [Y[isnanmat[:, col], col].toarray() for col in dev_cols]



In [73]:

    
print('Dev set:')
Y_br2 = br2.predict(X_train)
preds = [Y_br2[isnanmat[:, col], col] for col in dev_cols]
rps_br2, hr_br2 = calc_RP_HR(gts, preds, useLoop=True, tops=TOPs)
print('R-Precision:', np.mean(rps_br2))
for top in TOPs:
    print(top, np.mean(hr_br2[top]))









    



Dev set:
R-Precision: 0.022058823529411766
5 0.022058823529411766
10 0.030462184873949576
20 0.08298319327731092
30 0.10210084033613444
50 0.12962184873949578
100 0.1911064425770308



In [74]:

    
#rps



In [75]:

    
#Y_train[:, -1]

Popularity based ranking



In [76]:

    
rps_pop = []
hr_pop = {top: [] for top in TOPs}

index2song = {ix: sid for ix, (sid, _) in enumerate(song_set)}
for col in dev_cols:
    indices = np.arange(Y.shape[0])[isnanmat[:, col]]
    y_true = Y[indices, col]
    if issparse(y_true):
        y_true = y_true.toarray().reshape(-1)
    else:
        y_true = y_true.reshape(-1)
    y_pred = np.asarray([song2pop_train[index2song[ix]] for ix in indices])
    npos = y_true.sum()
    if npos > 0:
        rp, hr_dict = calc_RPrecision_HitRate(y_true, y_pred, tops=TOPs)
        rps_pop.append(rp)
        for top in TOPs:
            hr_pop[top].append(hr_dict[top])
print('R-Precision:', np.mean(rps_pop))
for top in TOPs:
    print(top, np.mean(hr_pop[top]))









    



R-Precision: 0.022058823529411766
5 0.022058823529411766
10 0.022058823529411766
20 0.03676470588235294
30 0.07563025210084033
50 0.10896358543417367
100 0.1914565826330532

M4. Multilabel p-classification with some playlist fully observed



In [77]:

    
pla = PCMLC(C1=10, C2=1, C3=10, p=1, loss_type='both')
pla.fit(X_train, np.nan_to_num(Y_train), batch_size=256, user_playlist_indices=clique_list)



In [78]:

    
print('Dev set:')
Y_pla = pla.predict(X_train)
preds_pla = [Y_pla[isnanmat[:, col], col] for col in dev_cols]
rps_pla, hr_pla = calc_RP_HR(gts, preds_pla, useLoop=True, tops=TOPs)
print('R-Precision:', np.mean(rps_pla))
for top in TOPs:
    print(top, np.mean(hr_pla[top]))









    



Dev set:
R-Precision: 0.023109243697478993
5 0.023109243697478993
10 0.032913165266106444
20 0.09173669467787114
30 0.10350140056022408
50 0.18256302521008402
100 0.3408263305322129



In [79]:

    
#rps_pla



In [80]:

    
#preds_pla[0]

Check performance per user



In [81]:

    
%%script false
aucs = []
npl = []
for u in set(user_of_playlists2):
    uind = np.where(np.array(user_of_playlists2, dtype=np.object) == u)[0]
    uind_train = uind[uind < col_split]
    uind_test = uind[uind >= col_split]
    #print(uind)
    #if len(uind_test) < 1: continue
    print('--------------------')
    print('USER:', u)
    print('#train: %d, #test: %d' % (len(uind_train), len(uind_test)))
    #preds_test = [Y_pla[isnanmat[:, col], col] for col in uind_test]
    #gts_test = [Y[isnanmat[:, col], col] for col in uind_test]
    #print('Dev set:')
    #auc, ncol = eval_pl(gts_test, preds_test, useLoop=True)
    #aucs.append(auc)
    #npl.append(ncol)
    print('Training set:')
    auc_trn, ncol_trn = eval_pl(Y[:, uind_train], Y_pla[:, uind_train])
    aucs.append(auc_trn)
    npl.append(ncol_trn)
    print()



In [82]:

    
%%script false
plt.plot(npl, aucs, 'b.')
plt.ylim(0.4, 1.0)
split = 'train'
plt.xlabel('#playlist in training set')
plt.ylabel('AUC (%s)' % split)
plt.title('Playlist augmentation task: No reg. (%s)' % split)

Check the if the regulariser is effective



In [83]:

    
%%script false
rows, cols = np.nonzero(same_user_mat)
for row, col in zip(rows, cols):
    diff = pla.W[row] - pla.W[col]
    print('%g' % np.sqrt(np.dot(pla.W[row], pla.W[row])))
    print('%g' % np.sqrt(np.dot(pla.W[col], pla.W[col])))
    print('%g' % np.sqrt(np.dot(diff, diff)))
    print('------------------------------')

Compute matrix $M$ such that $M_{jk} = \sqrt{(w_j - w_k)^\top (w_j - w_k)}, \forall j, k$.



In [84]:

    
%%script false
A = np.dot(pla.W, pla.W.T)
B = np.tile(np.diag(A), (A.shape[0], 1))
M = np.sqrt(-2 * A + (B + B.T))

Normalise $M$ by the vector with maximum norm in $W$.



In [85]:

    
#aa = np.arange(6).reshape(3, 2)
#np.einsum('ij,ij->i', aa, aa)



In [86]:

    
%%script false
denorm = np.sqrt(np.einsum('ij,ij->i', pla.W, pla.W))  # compute the norm for each row in W
M1 = M / np.max(denorm)



In [87]:

    
%%script false
plt.matshow(M1)



In [88]:

    
%%script false
rows, cols = np.nonzero(same_user_mat)
M2 = M1[rows, cols]
print('Min: %.5f, Max: %.5f, Mean: %.5f, Std: %.5f' % (np.min(M2), np.max(M2), np.mean(M2), np.std(M2)))



In [89]:

    
%%script false
mat = same_user_mat.copy()
np.fill_diagonal(mat, 1)   # remove the diagnoal from consideration
rows, cols = np.where(mat == 0)
M3 = M1[rows, cols]
print('Min: %.5f, Max: %.5f, Mean: %.5f, Std: %.5f' % (np.min(M3), np.max(M3), np.mean(M3), np.std(M3)))