Baselines - new song recommendation


In [1]:
%matplotlib inline

import os, sys, time, gzip
import pickle as pkl
import numpy as np
from scipy.sparse import lil_matrix, csr_matrix, issparse
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
import tensorflow as tf

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# from tools import calc_RPrecision_HitRate
from tools import calc_metrics, diversity, pairwise_distance_hamming, softmax
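
For reference, here is a minimal sketch of what `calc_metrics` and `softmax` from `tools` might compute, assuming Hit-Rate@K is recall at rank K; the names `calc_metrics_sketch` / `softmax_sketch` are hypothetical, and the actual `tools` implementations may differ (e.g. in how ties are handled):

import numpy as np
from sklearn.metrics import roc_auc_score

def calc_metrics_sketch(y_true, y_pred, tops):
    """R-Precision, Hit-Rate@K for each K in `tops`, and ROC AUC (a sketch)."""
    y_true = np.asarray(y_true).astype(bool)
    npos = y_true.sum()                      # number of positive songs
    hits = y_true[np.argsort(-y_pred)]       # hit indicators in rank order
    rp = hits[:npos].mean()                  # precision at rank #positives
    hr = {top: hits[:top].sum() / npos for top in tops}
    return rp, hr, roc_auc_score(y_true, y_pred)

def softmax_sketch(x):
    """Numerically stable softmax, as used for the Spread metric below."""
    ex = np.exp(x - x.max())
    return ex / ex.sum()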

In [3]:
np.seterr(all='raise')


Out[3]:
{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [4]:
TOPs = [5, 10, 20, 30, 50, 100, 200, 300, 500, 700, 1000]

In [5]:
datasets = ['aotm2011', '30music']

In [6]:
dix = 1
dataset_name = datasets[dix]
dataset_name


Out[6]:
'30music'

In [7]:
data_dir = 'data/%s/coldstart/setting1' % dataset_name
X_trndev = pkl.load(gzip.open(os.path.join(data_dir, 'X_trndev.pkl.gz'), 'rb'))
Y_trndev = pkl.load(gzip.open(os.path.join(data_dir, 'Y_trndev.pkl.gz'), 'rb'))
X_test = pkl.load(gzip.open(os.path.join(data_dir, 'X_test.pkl.gz'), 'rb'))
Y_test = pkl.load(gzip.open(os.path.join(data_dir, 'Y_test.pkl.gz'), 'rb'))

In [8]:
songs1 = pkl.load(gzip.open(os.path.join(data_dir, 'songs_train_dev_test_s1.pkl.gz'), 'rb'))
train_songs = songs1['train_song_set']
dev_songs = songs1['dev_song_set']
test_songs = songs1['test_song_set']

In [9]:
song2index_trndev = {sid: ix for ix, (sid, _) in enumerate(train_songs + dev_songs)}
song2index_test = {sid: ix for ix, (sid, _) in enumerate(test_songs)}
index2song_test = {ix: sid for ix, (sid, _) in enumerate(test_songs)}

In [10]:
_song2artist = pkl.load(gzip.open('data/msd/song2artist.pkl.gz', 'rb'))
song2artist = {sid: _song2artist[sid] for sid, _ in train_songs + dev_songs + test_songs if sid in _song2artist}

In [11]:
all_playlists = pkl.load(gzip.open(os.path.join(data_dir, 'playlists_s1.pkl.gz'), 'rb'))

In [12]:
artist2pop = dict()
test_songset = set(test_songs)

# artist popularity: count occurrences of each artist's songs in playlists,
# skipping test (cold-start) songs
for pl, _ in all_playlists:
    for sid in [sid for sid in pl if sid not in test_songset]:
        if sid in song2artist:
            aid = song2artist[sid]
            try:
                artist2pop[aid] += 1
            except KeyError:
                artist2pop[aid] = 1

In [13]:
song2genre = pkl.load(gzip.open('data/msd/song2genre.pkl.gz', 'rb'))

In [14]:
# all_genre = set(song2genre.values())
# all_genre

In [15]:
cliques_all = pkl.load(gzip.open(os.path.join(data_dir, 'cliques_trndev.pkl.gz'), 'rb'))

In [16]:
U = len(cliques_all)
pl2u = np.zeros(Y_test.shape[1], dtype=np.int32)  # playlist index --> user index
for u in range(U):
    clq = cliques_all[u]  # indices of the playlists belonging to user u
    pl2u[clq] = u

In [17]:
# Y_user = np.zeros((Y_test.shape[0], U), dtype=np.int)
# for u in range(U):
#     clq = cliques_all[u]
#     Y_user[:, u] = Y_test[:, clq].sum(axis=1).A.reshape(-1).astype(np.bool).astype(np.int)
# Y_user = csr_matrix(Y_user)
# print(Y_user.shape)

In [18]:
song2pop = pkl.load(gzip.open(os.path.join(data_dir, 'song2pop.pkl.gz'), 'rb'))

In [19]:
X_test.shape


Out[19]:
(5000, 63)

In [20]:
Y_test.shape


Out[20]:
(5000, 17342)

In [21]:
# Y_test_csr = Y_test.tocsr()

Note that $p \ \text{XOR} \ q = (p \ \text{AND NOT} \ q) \ \text{OR} \ (\text{NOT} \ p \ \text{AND} \ q)$. Let $\mathbf{p}, \mathbf{q} \in \{0, 1\}^{n}$; then

$$
\begin{aligned}
\text{Hamming distance}(\mathbf{p}, \mathbf{q})
&= \frac{1}{n} \sum_{i=1}^n p_i \ \text{XOR} \ q_i \\
&= \frac{1}{n} \sum_{i=1}^n \left( p_i (1 - q_i) + (1 - p_i) q_i \right) \\
&= \frac{1}{n} \left( \sum_{i=1}^n p_i (1 - q_i) + \sum_{i=1}^n (1 - p_i) q_i \right) \\
&= \frac{1}{n} \left( \mathbf{p}^\top (\mathbf{1} - \mathbf{q}) + (\mathbf{1} - \mathbf{p})^\top \mathbf{q} \right) \\
&= \frac{1}{n} \left( \text{sum}(\mathbf{p}) + \text{sum}(\mathbf{q}) - 2\, \mathbf{p}^\top \mathbf{q} \right)
\end{aligned}
$$


In [22]:
N, D = 1000, 200
aa = np.zeros(N * D, dtype=np.int)
idx = np.random.permutation(N * D)[:int(N * D * .3)]
aa[idx] = 1
aa = aa.reshape(N, D)
d1 = pairwise_distances(aa, metric='hamming', n_jobs=2)
d2 = (np.dot(aa, 1-aa.T) + np.dot(1-aa, aa.T)) / D
sum_vec = aa.sum(axis=1, keepdims=True)
d3 = (sum_vec + sum_vec.T - 2 * np.dot(aa, aa.T)) / D
diff = (d1 - d2).ravel()
print(np.dot(diff, diff))
diff2 = (d1 - d3).ravel()
print(np.dot(diff2, diff2))


4.981471727196028e-28
4.981471727196028e-28

In [23]:
# aa = Y_test_csr[:500, :].A
# aa = Y_user[:10, :].A
# aa_csr = csr_matrix(aa)
# t0 = time.time()
# d1 = pairwise_distances(aa, metric='hamming', n_jobs=2)
# t1 = time.time()
# d2 = pairwise_distance_hamming(aa_csr)
# t2 = time.time()
# diff = (d1 - d2.A).ravel()
# print(np.sqrt(np.dot(diff, diff)))
# print('%.3f sec, %.3f sec' % (t1 - t0, t2 - t1))
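
The commented-out cell above compares `tools.pairwise_distance_hamming` against scikit-learn. Below is a minimal sketch of how such a function could exploit the identity derived earlier, so that only one sparse Gram-matrix product is needed; this is an illustrative assumption, not the actual `tools` implementation (which, judging by the `d2.A` above, returns a matrix rather than an ndarray):

import numpy as np
from scipy.sparse import csr_matrix

def pairwise_distance_hamming_sketch(X, normalise=True):
    """Pairwise Hamming distances between the rows of a binary CSR matrix,
    via Hamming(p, q) * n = sum(p) + sum(q) - 2 p.q (a sketch)."""
    X = csr_matrix(X, dtype=np.float64)
    row_sums = X.sum(axis=1).A               # (N, 1): number of ones per row
    gram = X.dot(X.T).A                      # (N, N): inner products p.q
    dist = row_sums + row_sums.T - 2. * gram
    return dist / X.shape[1] if normalise else dist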

In [24]:
# def diversity(vec):
#     assert vec.ndim == 1
#     norm = len(vec) * (len(vec) - 1)
#     sim_mat = vec[..., np.newaxis] == vec[np.newaxis, ...]  # pairwise comparison
#     # dist_mat = 1 - sim_mat
#     # return (dist_mat.sum() - dist_mat.trace()) / norm  # note that dist_mat.trace() = 0
#     return (1 - sim_mat).sum() / norm

Popularity (of artist) based recommendation

Recommend songs according to the (log) popularity of their artists; the ranking is the same for every playlist.


In [25]:
Y_test[:, 3].A.reshape(-1).dtype == np.bool


Out[25]:
True

In [26]:
Y_test[:, 3].A.reshape(-1).sum()


Out[26]:
0

The next two cells confirm that negative entries must be selected with a boolean mask: fancy indexing with the integer array `1 - y_true` would pick elements by position instead of filtering them.

In [27]:
Y_test[:, 3].A.reshape(-1)[(1 - Y_test[:, 3].A.reshape(-1))].shape


Out[27]:
(5000,)

In [28]:
Y_test[:, 3].A.reshape(-1)[(1 - Y_test[:, 3].A.reshape(-1)).astype(np.bool)].shape


Out[28]:
(5000,)

In [29]:
rps_pop = []
hitrates_pop = {top: [] for top in TOPs}
aucs_pop = []
spreads_pop = []
novelties_pop = {top: dict() for top in TOPs}
# diversities_pop = []
artist_diversities_pop = {top: [] for top in TOPs}
genre_diversities_pop = {top: [] for top in TOPs}
ptops_pop = []
np.random.seed(0)

y_pred = np.zeros(len(test_songs))
for ix in range(len(test_songs)):
    sid = index2song_test[ix]
    if sid in song2artist:
        aid = song2artist[sid]
        if aid in artist2pop:
            y_pred[ix] = np.log(artist2pop[aid])

npos = Y_test.sum(axis=0).A.reshape(-1)
assert Y_test.shape[0] == len(test_songs)
for j in range(Y_test.shape[1]):
    if (j+1) % 100 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()

    if npos[j] < 1:
        continue
        
    y_true = Y_test[:, j].A.reshape(-1)

    # rp, hr_dict = calc_RPrecision_HitRate(y_true, y_pred, tops=TOPs)
    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_pop.append(rp)
    for top in TOPs:
        hitrates_pop[top].append(hr_dict[top])
    aucs_pop.append(auc)
    
    # spread
    y_pred_prob = softmax(y_pred)
    spreads_pop.append(-np.dot(y_pred_prob, np.log(y_pred_prob)))

    # novelty
    sortix = np.argsort(-y_pred)
    u = pl2u[j]
    for top in TOPs:
        nov = np.mean([-np.log2(song2pop[index2song_test[ix]]) for ix in sortix[:top]])
        try:
            novelties_pop[top][u].append(nov)
        except KeyError:
            novelties_pop[top][u] = [nov]
    
    # PTop: (#pos ranked above the top-ranked negative) / #pos
    assert y_true.dtype == np.bool
    negIx = (1 - y_true).astype(np.bool)
    negMax = y_pred[negIx].max()
    pt = (y_pred[y_true] > negMax).sum() / npos[j]
    ptops_pop.append(pt)

    # compute diversity@100
    # sim = cosine_similarity(X_test[sortix[:100], :])
    # sim = cosine_similarity(Y_user[sortix[:100], :])
    # csd = 1. / cosine_similarity(X_test[sortix[:100], :])
    # dist = pairwise_distances(Y_test_csr[sortix[:100], :].A, metric='hamming', n_jobs=4)
    # dist = pairwise_distance_hamming(Y_test_csr[sortix[:100], :], normalise=True)
    # dist = pairwise_distance_hamming(Y_user[sortix[:50], :], normalise=True)
    # div = 100 * 99 / (sim.sum() - sim.trace())
    # diversities_pop.append(div)
    
    # artist/genre diversity
#     for top in TOPs:
#         artist_vec = np.array([song2artist[index2song_test[ix]] for ix in sortix[:top]])
#         genre_vec = np.array([song2genre[index2song_test[ix]] if index2song_test[ix] in song2genre \
#                               else str(np.random.rand()) for ix in sortix[:top]])
#         artist_diversities_pop[top].append( diversity(artist_vec) )
#         genre_diversities_pop[top].append( diversity(genre_vec) )
    
print('\n%d / %d' % (len(rps_pop), Y_test.shape[1]))


17300 / 17342
8215 / 17342

In [30]:
# fig = plt.figure(figsize=[20, 5])
# ax1 = plt.subplot(131)
# ax1.hist(rps_pop, bins=100)
# ax1.set_yscale('log')
# ax1.set_title('R-Precision')
# #ax.set_xlim(0, xmax)
# ax2 = plt.subplot(132)
# ax2.hist(aucs_pop, bins=100)
# ax2.set_yscale('log')
# ax2.set_title('AUC')
# pass

In [31]:
pop_perf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_pop), 
                                    'Hit-Rate': {top: np.mean(hitrates_pop[top]) for top in TOPs},
                                    'AUC': np.mean(aucs_pop),
                                    'Spread': np.mean(spreads_pop),
                                    'Novelty': {t: np.mean([np.mean(novelties_pop[t][u]) for u in novelties_pop[t]]) 
                                                for t in TOPs},
                                    'PTop': np.mean(ptops_pop),
                                    #'Artist-Diversity': {top: np.mean(artist_diversities_pop[top]) for top in TOPs},
                                    #'Genre-Diversity': {top: np.mean(genre_diversities_pop[top]) for top in TOPs}},
                                    # 'Novelty': np.mean([np.mean(novelty_pop[u]) for u in novelty_pop]),
                                    # 'Diveristy': np.mean(diversities_pop)},
                                   },
                           'Test_All': {'R-Precision': rps_pop,
                                        'Hit-Rate': {top: hitrates_pop[top] for top in TOPs},
                                        'AUC': aucs_pop,
                                        'Spread': spreads_pop,
                                        'Novelty': novelties_pop,
                                        'PTop': ptops_pop,
                                        #'Artist-Diversity': artist_diversities_pop,
                                        #'Genre-Diversity': genre_diversities_pop},
                                        # 'Novelty': novelty_pop,
                                        # 'Diversity': diversities_pop},
                          }}}
pop_perf[dataset_name]['Test']


Out[31]:
{'R-Precision': 0.005203867316768658,
 'Hit-Rate': {5: 0.013328379539578565,
  10: 0.01745343146743025,
  20: 0.04239344314298478,
  30: 0.046325904135701396,
  50: 0.06587742420171959,
  100: 0.12189305939747051,
  200: 0.15982659365799431,
  300: 0.23000178915292693,
  500: 0.3464156702960598,
  700: 0.41762750417701194,
  1000: 0.508522684424406},
 'AUC': 0.7093575433337442,
 'Spread': 7.396529421141267,
 'Novelty': {5: -4.631410163543202,
  10: -3.8555794509654193,
  20: -4.234888556521792,
  30: -3.6032449824617605,
  50: -3.6311567329369514,
  100: -3.795928427450747,
  200: -3.1438790996939834,
  300: -3.147187786687166,
  500: -3.026529370637619,
  700: -2.8183193719559148,
  1000: -2.5801843454877087},
 'PTop': 0.0007100831811726518}

In [32]:
fperf_pop = os.path.join(data_dir, 'perf-pop.pkl')
print(fperf_pop)
pkl.dump(pop_perf, open(fperf_pop, 'wb'))
pkl.load(open(fperf_pop, 'rb'))[dataset_name]['Test']


data/30music/coldstart/setting1/perf-pop.pkl
Out[32]:
{'R-Precision': 0.005203867316768658,
 'Hit-Rate': {5: 0.013328379539578565,
  10: 0.01745343146743025,
  20: 0.04239344314298478,
  30: 0.046325904135701396,
  50: 0.06587742420171959,
  100: 0.12189305939747051,
  200: 0.15982659365799431,
  300: 0.23000178915292693,
  500: 0.3464156702960598,
  700: 0.41762750417701194,
  1000: 0.508522684424406},
 'AUC': 0.7093575433337442,
 'Spread': 7.396529421141267,
 'Novelty': {5: -4.631410163543202,
  10: -3.8555794509654193,
  20: -4.234888556521792,
  30: -3.6032449824617605,
  50: -3.6311567329369514,
  100: -3.795928427450747,
  200: -3.1438790996939834,
  300: -3.147187786687166,
  500: -3.026529370637619,
  700: -2.8183193719559148,
  1000: -2.5801843454877087},
 'PTop': 0.0007100831811726518}

Same Artists - Greatest Hits (SAGH)

Recommend songs by the artists that appear in the playlist's listening history, ranked by artist popularity.


In [25]:
rps_sagh = []
hitrates_sagh = {top: [] for top in TOPs}
aucs_sagh = []
spreads_sagh = []
novelties_sagh = {top: dict() for top in TOPs}
ptops_sagh = []
# diversities_sagh = []
# artist_diversities_sagh = {top: [] for top in TOPs}
# genre_diversities_sagh = {top: [] for top in TOPs}
np.random.seed(0)

npos = Y_test.sum(axis=0).A.reshape(-1)
assert Y_test.shape[0] == len(test_songs)
for j in range(Y_test.shape[1]):
    if (j+1) % 100 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()
    if npos[j] < 1:
        continue

    y_true = Y_test[:, j].A.reshape(-1)
    y_pred = np.zeros(y_true.shape)
    
    pl = all_playlists[j][0]
    artists = set([song2artist[sid] for sid in pl if (sid not in test_songset) and (sid in song2artist)])
    assert len(artists) > 0
    
    for ix in range(Y_test.shape[0]):
        sid = index2song_test[ix]
        if sid in song2artist:
            aid = song2artist[sid]
            if aid in artists and aid in artist2pop:
                y_pred[ix] = np.log(artist2pop[aid])
    
    # rp, hr_dict = calc_RPrecision_HitRate(y_true, y_pred, tops=TOPs)
    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_sagh.append(rp)
    for top in TOPs:
        hitrates_sagh[top].append(hr_dict[top])
    aucs_sagh.append(auc)
    
    # spread
    y_pred_prob = softmax(y_pred)
    spreads_sagh.append(-np.dot(y_pred_prob, np.log(y_pred_prob)))

    # novelty
    sortix = np.argsort(-y_pred)
    u = pl2u[j]
    for top in TOPs:
        nov = np.mean([-np.log2(song2pop[index2song_test[ix]]) for ix in sortix[:top]])
        try:
            novelties_sagh[top][u].append(nov)
        except KeyError:
            novelties_sagh[top][u] = [nov]
            
    # PTop: (#pos ranked above the top-ranked negative) / #pos
    assert y_true.dtype == np.bool
    negIx = (1 - y_true).astype(np.bool)
    negMax = y_pred[negIx].max()
    pt = (y_pred[y_true] > negMax).sum() / npos[j]
    ptops_sagh.append(pt)

    # compute diversity@100
    # csd = 1. / cosine_similarity(X_test[sortix[:100], :])
    # dist = pairwise_distance_hamming(Y_test_csr[sortix[:100], :])
    # diversities_sagh.append((dist.sum() - np.trace(dist)) / (100 * 99))
    
    # artist/genre diversity
#     for top in TOPs:
#         artist_vec = np.array([song2artist[index2song_test[ix]] if index2song_test[ix] in song2artist
#                                else str(np.random.rand()) for ix in sortix[:top]])
#         genre_vec = np.array([song2genre[index2song_test[ix]] if index2song_test[ix] in song2genre \
#                               else str(np.random.rand()) for ix in sortix[:top]])
#         artist_diversities_sagh[top].append( diversity(artist_vec) )
#         genre_diversities_sagh[top].append( diversity(genre_vec) )
    
print('\n%d / %d' % (len(rps_sagh), Y_test.shape[1]))


17300 / 17342
8215 / 17342

In [26]:
# fig = plt.figure(figsize=[20, 5])
# ax1 = plt.subplot(131)
# ax1.hist(rps_sagh, bins=100)
# ax1.set_yscale('log')
# ax1.set_title('R-Precision')
# #ax.set_xlim(0, xmax)
# ax2 = plt.subplot(132)
# ax2.hist(aucs_sagh, bins=100)
# ax2.set_yscale('log')
# ax2.set_title('AUC')
# pass

In [27]:
sagh_perf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_sagh), 
                                     'Hit-Rate': {top: np.mean(hitrates_sagh[top]) for top in TOPs},
                                     'AUC': np.mean(aucs_sagh),
                                     'Spread': np.mean(spreads_sagh),
                                     'Novelty': {t: np.mean([np.mean(novelties_sagh[t][u]) 
                                                             for u in novelties_sagh[t]]) for t in TOPs},
                                     'PTop': np.mean(ptops_sagh),
                                     # 'Artist-Diversity': {t: np.mean(artist_diversities_sagh[t]) for t in TOPs},
                                     # 'Genre-Diversity': {t: np.mean(genre_diversities_sagh[t]) for t in TOPs}},
                                    },
                            'Test_All': {'R-Precision': rps_sagh,
                                        'Hit-Rate': {top: hitrates_sagh[top] for top in TOPs},
                                        'AUC': aucs_sagh,
                                        'Spread': spreads_sagh,
                                        'Novelty': novelties_sagh,
                                        'PTop': ptops_sagh, 
                                        # 'Artist-Diversity': artist_diversities_sagh,
                                        # 'Genre-Diversity': genre_diversities_sagh},
                           }}}
sagh_perf[dataset_name]['Test']


Out[27]:
{'R-Precision': 0.0058906181071328645,
 'Hit-Rate': {5: 0.010129273461055353,
  10: 0.01668065892592404,
  20: 0.025824710286042635,
  30: 0.030388153299415827,
  50: 0.03643224096499933,
  100: 0.047171460440967976,
  200: 0.07414110401376134,
  300: 0.10445369660991596,
  500: 0.1485900338030378,
  700: 0.20310122840124942,
  1000: 0.24831610839509155},
 'AUC': 0.5154321118343206,
 'Spread': 6.475404266745977,
 'Novelty': {5: -2.4882684631410146,
  10: -2.315565551503914,
  20: -1.9936531438312832,
  30: -1.7277073859076204,
  50: -1.4974086163795883,
  100: -1.3835290379979113,
  200: -1.3804877846139376,
  300: -1.4328591844597078,
  500: -1.3908542931429568,
  700: -1.3900595827713906,
  1000: -1.3518510108122386},
 'PTop': 0.0013694461351186854}

In [28]:
fperf_sagh = os.path.join(data_dir, 'perf-sagh.pkl')
print(fperf_sagh)
pkl.dump(sagh_perf, open(fperf_sagh, 'wb'))
pkl.load(open(fperf_sagh, 'rb'))[dataset_name]['Test']


data/30music/coldstart/setting1/perf-sagh.pkl
Out[28]:
{'R-Precision': 0.0058906181071328645,
 'Hit-Rate': {5: 0.010129273461055353,
  10: 0.01668065892592404,
  20: 0.025824710286042635,
  30: 0.030388153299415827,
  50: 0.03643224096499933,
  100: 0.047171460440967976,
  200: 0.07414110401376134,
  300: 0.10445369660991596,
  500: 0.1485900338030378,
  700: 0.20310122840124942,
  1000: 0.24831610839509155},
 'AUC': 0.5154321118343206,
 'Spread': 6.475404266745977,
 'Novelty': {5: -2.4882684631410146,
  10: -2.315565551503914,
  20: -1.9936531438312832,
  30: -1.7277073859076204,
  50: -1.4974086163795883,
  100: -1.3835290379979113,
  200: -1.3804877846139376,
  300: -1.4328591844597078,
  500: -1.3908542931429568,
  700: -1.3900595827713906,
  1000: -1.3518510108122386},
 'PTop': 0.0013694461351186854}

Collocated Artists - Greatest Hits (CAGH)

Compute the similarity of two artists $a_1$ and $a_2$ given a set of playlists $P$:
$$ \text{sim}(a_1, a_2) = \frac{\sum_{p \in P} \delta(a_1, p) \times \delta(a_2, p)} {\sqrt{\sum_{p \in P} \delta(a_1, p) \times \sum_{p \in P} \delta(a_2, p)}} $$ where $$ \delta(a, p) = \begin{cases} 1, \ \text{at least one song in playlist $p$ is from artist $a$}, \\ 0, \ \text{otherwise}. \end{cases} $$

Score each candidate song by the (log) popularity of its artist, weighted by the total similarity between that artist and the artists in the playlist's listening history.
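
As a toy check of the formula (hypothetical data: 3 artists, 4 playlists), the numerator is the artist co-occurrence count $\Delta \Delta^\top$ and the denominator comes from the row sums of the indicator matrix $\Delta$, which is exactly how the cells below construct the weight matrix:

import numpy as np

# hypothetical indicator: Delta_toy[a, p] = 1 iff playlist p has a song by artist a
Delta_toy = np.array([[1, 1, 0, 1],
                      [1, 0, 0, 1],
                      [0, 1, 1, 0]], dtype=np.float64)
collo = Delta_toy.dot(Delta_toy.T)           # numerator: co-occurrence counts
dsum = Delta_toy.sum(axis=1)                 # playlists containing each artist
sim = collo / np.sqrt(np.outer(dsum, dsum))  # sim(a1, a2) as defined above
print(sim.round(3))                          # symmetric, with unit diagonal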


In [25]:
all_artist_trndev = sorted(set([song2artist[sid] for pl, _ in all_playlists for sid in pl \
                                if (sid not in test_songset) and (sid in song2artist)]))

In [26]:
artist2index = {aid: ix for ix, aid in enumerate(all_artist_trndev)}

In [27]:
Na = len(all_artist_trndev)
Np = len(all_playlists)
Delta = lil_matrix((Na, Np), dtype=np.float)
for j in range(Np):
    pl_artist = sorted(set([song2artist[sid] for sid in all_playlists[j][0] \
                            if (sid not in test_songset) and (sid in song2artist)]))
    ix = [artist2index[aid] for aid in pl_artist]
    Delta[ix, j] = 1

In [28]:
Delta = Delta.tocsr()
Dsum = Delta.sum(axis=1).A.reshape(-1)
ColloMat = Delta.dot(Delta.T).A

assert np.all(np.isclose(ColloMat.diagonal(), Dsum))

In [29]:
print(len(Dsum), len(all_artist_trndev))


9981 9981

In [30]:
#type(ColloMat)

In [31]:
T1 = 1. / np.sqrt(Dsum)
NormMat = np.dot(T1.reshape(Na, 1), T1.reshape(1, Na))

WeightMat = np.multiply(ColloMat, NormMat)

In [32]:
rps_cagh = []
hitrates_cagh = {top: [] for top in TOPs}
aucs_cagh = []
spreads_cagh = []
novelties_cagh = {top: dict() for top in TOPs}
ptops_cagh = []
# diversities_cagh = []
# artist_diversities_cagh = {top: [] for top in TOPs}
# genre_diversities_cagh = {top: [] for top in TOPs}
np.random.seed(0)

npos = Y_test.sum(axis=0).A.reshape(-1)
assert Y_test.shape[0] == len(test_songs)
for j in range(Y_test.shape[1]):
    if (j+1) % 10 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()
    
    if npos[j] < 1:
        continue
    
    y_true = Y_test[:, j].A.reshape(-1)
    y_pred = np.zeros(y_true.shape)
    
    pl = all_playlists[j][0]
    artists = set([song2artist[sid] for sid in pl if (sid not in test_songset) and (sid in song2artist)])
    assert len(artists) > 0
    artists_ix = [artist2index[aid] for aid in artists]
    
    for ix in range(Y_test.shape[0]):
        sid = index2song_test[ix]
        if sid in song2artist:
            aid = song2artist[sid]
            if aid in artist2pop:
                aix = artist2index[aid]
                y_pred[ix] = np.log(artist2pop[aid]) * WeightMat[aix, artists_ix].sum()

    # rp, hr_dict = calc_RPrecision_HitRate(y_true, y_pred, tops=TOPs)
    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_cagh.append(rp)
    for top in TOPs:
        hitrates_cagh[top].append(hr_dict[top])
    aucs_cagh.append(auc)
    
    # spread
    y_pred_prob = softmax(y_pred)
    spreads_cagh.append(-np.dot(y_pred_prob, np.log(y_pred_prob)))

    # novelty
    sortix = np.argsort(-y_pred)
    u = pl2u[j]
    for top in TOPs:
        nov = np.mean([-np.log2(song2pop[index2song_test[ix]]) for ix in sortix[:top]])
        try:
            novelties_cagh[top][u].append(nov)
        except KeyError:
            novelties_cagh[top][u] = [nov]
            
    # PTop: (#pos ranked above the top-ranked negative) / #pos
    assert y_true.dtype == np.bool
    negIx = (1 - y_true).astype(np.bool)
    negMax = y_pred[negIx].max()
    pt = (y_pred[y_true] > negMax).sum() / npos[j]
    ptops_cagh.append(pt)
    
    # compute diversity@100
    # csd = 1. / cosine_similarity(X_test[sortix[:100], :])
    # dist = pairwise_distance_hamming(Y_test_csr[sortix[:100], :])
    # diversities_cagh.append((dist.sum() - np.trace(dist)) / (100 * 99))
    
    # artist/genre diversity
#     for top in TOPs:
#         artist_vec = np.array([song2artist[index2song_test[ix]] if index2song_test[ix] in song2artist
#                                else str(np.random.rand()) for ix in sortix[:top]])
#         genre_vec = np.array([song2genre[index2song_test[ix]] if index2song_test[ix] in song2genre \
#                               else str(np.random.rand()) for ix in sortix[:top]])
#         artist_diversities_cagh[top].append( diversity(artist_vec) )
#         genre_diversities_cagh[top].append( diversity(genre_vec) )

print('\n%d / %d' % (len(rps_cagh), Y_test.shape[1]))


17340 / 17342
8215 / 17342

In [33]:
# fig = plt.figure(figsize=[20, 5])
# ax1 = plt.subplot(131)
# ax1.hist(rps_cagh, bins=100)
# ax1.set_yscale('log')
# ax1.set_title('R-Precision')
# #ax.set_xlim(0, xmax)
# ax2 = plt.subplot(132)
# ax2.hist(aucs_cagh, bins=100)
# ax2.set_yscale('log')
# ax2.set_title('AUC')
# pass

In [34]:
cagh_perf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_cagh), 
                                     'Hit-Rate': {top: np.mean(hitrates_cagh[top]) for top in hitrates_cagh},
                                     'AUC': np.mean(aucs_cagh),
                                     'Spread': np.mean(spreads_cagh),
                                     'Novelty': {t: np.mean([np.mean(novelties_cagh[t][u]) 
                                                             for u in novelties_cagh[t]]) for t in TOPs},
                                     'PTop': np.mean(ptops_cagh),
                                     # 'Artist-Diversity': {t: np.mean(artist_diversities_cagh[t]) for t in TOPs},
                                     # 'Genre-Diversity': {t: np.mean(genre_diversities_cagh[t]) for t in TOPs}},
                                    },
                            'Test_All': {'R-Precision': rps_cagh,
                                        'Hit-Rate': {top: hitrates_cagh[top] for top in TOPs},
                                        'AUC': aucs_cagh,
                                        'Spread': spreads_cagh,
                                        'Novelty': novelties_cagh,
                                        'PTop': ptops_cagh,
                                        # 'Artist-Diversity': artist_diversities_cagh,
                                        # 'Genre-Diversity': genre_diversities_cagh},
                           }}}
cagh_perf[dataset_name]['Test']


Out[34]:
{'R-Precision': 0.005038845342843542,
 'Hit-Rate': {5: 0.00999448010806358,
  10: 0.017353031135799384,
  20: 0.03031135139228203,
  30: 0.04221813023157314,
  50: 0.06307996553359844,
  100: 0.10042079340913203,
  200: 0.15669076805366502,
  300: 0.20865694906983764,
  500: 0.29832722141527607,
  700: 0.3710588823840103,
  1000: 0.4659364063758232},
 'AUC': 0.6796715502309781,
 'Spread': 4.327457616401731,
 'Novelty': {5: -2.7667853477798365,
  10: -2.74375096212388,
  20: -2.7515989542796238,
  30: -2.7684845693182765,
  50: -2.820895598757543,
  100: -2.7984903896907594,
  200: -2.708008390820775,
  300: -2.647672906157982,
  500: -2.543548576791839,
  700: -2.446922137748246,
  1000: -2.310867507663371},
 'PTop': 0.0013694461351186854}

In [35]:
fperf_cagh = os.path.join(data_dir, 'perf-cagh.pkl')
print(fperf_cagh)
pkl.dump(cagh_perf, open(fperf_cagh, 'wb'))
pkl.load(open(fperf_cagh, 'rb'))[dataset_name]['Test']


data/30music/coldstart/setting1/perf-cagh.pkl
Out[35]:
{'R-Precision': 0.005038845342843542,
 'Hit-Rate': {5: 0.00999448010806358,
  10: 0.017353031135799384,
  20: 0.03031135139228203,
  30: 0.04221813023157314,
  50: 0.06307996553359844,
  100: 0.10042079340913203,
  200: 0.15669076805366502,
  300: 0.20865694906983764,
  500: 0.29832722141527607,
  700: 0.3710588823840103,
  1000: 0.4659364063758232},
 'AUC': 0.6796715502309781,
 'Spread': 4.327457616401731,
 'Novelty': {5: -2.7667853477798365,
  10: -2.74375096212388,
  20: -2.7515989542796238,
  30: -2.7684845693182765,
  50: -2.820895598757543,
  100: -2.7984903896907594,
  200: -2.708008390820775,
  300: -2.647672906157982,
  500: -2.543548576791839,
  700: -2.446922137748246,
  1000: -2.310867507663371},
 'PTop': 0.0013694461351186854}

Matrix Factorisation

Let $S \in \mathbb{R}^{M \times D}$ and $P \in \mathbb{R}^{N \times D}$ be the latent factors of songs and playlists, respectively, and let $Y \in \mathbb{R}^{M \times N}$ be the song-playlist membership matrix.

The optimisation objective:
$$
J = \sum_{m=1}^M \sum_{n=1}^N \left( y_{m,n} - \mathbf{s}_m^\top \mathbf{p}_n \right)^2
+ C \left( \sum_{m=1}^M \mathbf{s}_m^\top \mathbf{s}_m + \sum_{n=1}^N \mathbf{p}_n^\top \mathbf{p}_n \right)
$$
Use the alternating least squares (ALS) optimisation method:

  1. Fix $S$ and solve for $P$: setting $ \begin{aligned} \mathbf{0} = \frac{\partial J}{\partial \mathbf{p}_n} = \sum_{m=1}^M 2 \left( y_{m,n} - \mathbf{s}_m^\top \mathbf{p}_n \right) (-\mathbf{s}_m) + 2 C \mathbf{p}_n \end{aligned} $
    in other words $ \begin{aligned} \sum_{m=1}^M y_{m,n} \mathbf{s}_m = \sum_{m=1}^M (\mathbf{s}_m^\top \mathbf{p}_n^*) \mathbf{s}_m + C \mathbf{p}_n^* = \sum_{m=1}^M \mathbf{s}_m \mathbf{s}_m^\top \mathbf{p}_n^* + C \mathbf{p}_n^* = \left( \sum_{m=1}^M \mathbf{s}_m \mathbf{s}_m^\top + C \mathbf{I} \right) \mathbf{p}_n^* \end{aligned} $
    where $\mathbf{I} \in \mathbb{R}^{D \times D}$ is the identity matrix.
    So $ \begin{aligned} \mathbf{p}_n^* = \left( \sum_{m=1}^M \mathbf{s}_m \mathbf{s}_m^\top + C \mathbf{I} \right)^{-1} \sum_{m=1}^M y_{m,n} \mathbf{s}_m \end{aligned} $
    or equivalently $ \begin{aligned} \mathbf{p}_n^* = \left( S^\top S + C \mathbf{I} \right)^{-1} \left( \mathbf{y}_{:n}^\top S \right)^\top = \left( S^\top S + C \mathbf{I} \right)^{-1} S^\top \mathbf{y}_{:n} \end{aligned} $
    The matrix form is
    $ \begin{aligned} P' = \left( \left( S^\top S + C \mathbf{I} \right)^{-1} S^\top Y \right)^\top = Y^\top S \left( \left( S^\top S + C \mathbf{I} \right)^{-1} \right)^\top \end{aligned} $
  2. Fix $P$ and solve for $S$: setting $ \begin{aligned} \mathbf{0} = \frac{\partial J}{\partial \mathbf{s}_m} = \sum_{n=1}^N 2 \left( y_{m,n} - \mathbf{s}_m^\top \mathbf{p}_n \right) (-\mathbf{p}_n) + 2 C \mathbf{s}_m \end{aligned} $
    by symmetry, we have
    $ \begin{aligned} \mathbf{s}_m^* = \left( \sum_{n=1}^N \mathbf{p}_n \mathbf{p}_n^\top + C \mathbf{I} \right)^{-1} \sum_{n=1}^N y_{m,n} \mathbf{p}_n \end{aligned} $
    The matrix form is
    $ \begin{aligned} S' = \left( \left( P^\top P + C \mathbf{I} \right)^{-1} (Y P)^\top \right)^\top = Y P \left( \left( P^\top P + C \mathbf{I} \right)^{-1} \right)^\top \end{aligned} $
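
As a quick numerical check of the closed-form update for $P$ (random toy data with hypothetical sizes), the matrix form above should agree with solving the regularised normal equations column by column:

import numpy as np

rng = np.random.RandomState(0)
M, N, D, C = 50, 8, 5, 1e-3
S, Y = rng.rand(M, D), rng.rand(M, N)
A = S.T.dot(S) + C * np.eye(D)                        # S^T S + C I
P_matrix_form = Y.T.dot(S).dot(np.linalg.inv(A).T)    # P' = Y^T S (A^{-1})^T
P_direct = np.linalg.solve(A, S.T.dot(Y)).T           # ridge solve per playlist
print(np.allclose(P_matrix_form, P_direct))           # True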

In [25]:
np.random.seed(0)
D = 300
C = 1e-5
n_sweeps = 200
M, N = Y_trndev.shape
S = np.random.rand(M, D)
P = np.random.rand(N, D)

# alternating least squares
for sweep in range(n_sweeps):
    # fix S, optimise P
    SS = np.dot(S.T, S)  # D by D
    np.fill_diagonal(SS, C + SS.diagonal())
    P_new = np.dot(Y_trndev.transpose().dot(S), np.linalg.inv(SS).T)  # N by D
    pdiff = (P_new - P).ravel()
    P = P_new
    
    # fix P, optimise S
    PP = np.dot(P.T, P)  # D by D
    np.fill_diagonal(PP, C + PP.diagonal())
    S_new = np.dot(Y_trndev.dot(P), np.linalg.inv(PP).T)  # M by D
    sdiff = (S_new - S).ravel()
    S = S_new
    print('P diff: {:8.6f}, S diff: {:8.6f}'.format(np.sqrt(pdiff.dot(pdiff)), np.sqrt(sdiff.dot(sdiff))))


P diff: 1316.574078, S diff: 3912.395199
P diff: 0.651017, S diff: 1621.246868
P diff: 0.309908, S diff: 816.477397
P diff: 0.175767, S diff: 503.835563
P diff: 0.115235, S diff: 349.528758
P diff: 0.082677, S diff: 261.020856
P diff: 0.062997, S diff: 204.774430
P diff: 0.050064, S diff: 166.307690
P diff: 0.041027, S diff: 138.587439
P diff: 0.034423, S diff: 117.834307
P diff: 0.029433, S diff: 101.835169
P diff: 0.025562, S diff: 89.205114
P diff: 0.022496, S diff: 79.034587
P diff: 0.020022, S diff: 70.704425
P diff: 0.017995, S diff: 63.780892
P diff: 0.016310, S diff: 57.952586
P diff: 0.014892, S diff: 52.991014
P diff: 0.013686, S diff: 48.725282
P diff: 0.012648, S diff: 45.025419
P diff: 0.011749, S diff: 41.791143
P diff: 0.010962, S diff: 38.944087
P diff: 0.010269, S diff: 36.422305
P diff: 0.009655, S diff: 34.176295
P diff: 0.009108, S diff: 32.166077
P diff: 0.008619, S diff: 30.359023
P diff: 0.008179, S diff: 28.728223
P diff: 0.007782, S diff: 27.251254
P diff: 0.007424, S diff: 25.909258
P diff: 0.007098, S diff: 24.686232
P diff: 0.006802, S diff: 23.568489
P diff: 0.006532, S diff: 22.544247
P diff: 0.006285, S diff: 21.603307
P diff: 0.006059, S diff: 20.736795
P diff: 0.005851, S diff: 19.936962
P diff: 0.005660, S diff: 19.197018
P diff: 0.005484, S diff: 18.510996
P diff: 0.005321, S diff: 17.873637
P diff: 0.005171, S diff: 17.280294
P diff: 0.005031, S diff: 16.726855
P diff: 0.004901, S diff: 16.209668
P diff: 0.004780, S diff: 15.725485
P diff: 0.004668, S diff: 15.271413
P diff: 0.004562, S diff: 14.844864
P diff: 0.004463, S diff: 14.443526
P diff: 0.004371, S diff: 14.065323
P diff: 0.004284, S diff: 13.708389
P diff: 0.004202, S diff: 13.371047
P diff: 0.004125, S diff: 13.051783
P diff: 0.004052, S diff: 12.749231
P diff: 0.003983, S diff: 12.462152
P diff: 0.003918, S diff: 12.189426
P diff: 0.003856, S diff: 11.930035
P diff: 0.003798, S diff: 11.683053
P diff: 0.003742, S diff: 11.447635
P diff: 0.003689, S diff: 11.223012
P diff: 0.003639, S diff: 11.008479
P diff: 0.003591, S diff: 10.803390
P diff: 0.003545, S diff: 10.607152
P diff: 0.003501, S diff: 10.419220
P diff: 0.003460, S diff: 10.239091
P diff: 0.003420, S diff: 10.066301
P diff: 0.003381, S diff: 9.900421
P diff: 0.003345, S diff: 9.741051
P diff: 0.003310, S diff: 9.587824
P diff: 0.003276, S diff: 9.440394
P diff: 0.003244, S diff: 9.298443
P diff: 0.003213, S diff: 9.161671
P diff: 0.003183, S diff: 9.029800
P diff: 0.003155, S diff: 8.902570
P diff: 0.003127, S diff: 8.779736
P diff: 0.003101, S diff: 8.661070
P diff: 0.003075, S diff: 8.546358
P diff: 0.003051, S diff: 8.435398
P diff: 0.003027, S diff: 8.328003
P diff: 0.003004, S diff: 8.223995
P diff: 0.002982, S diff: 8.123208
P diff: 0.002961, S diff: 8.025484
P diff: 0.002940, S diff: 7.930677
P diff: 0.002920, S diff: 7.838648
P diff: 0.002901, S diff: 7.749268
P diff: 0.002882, S diff: 7.662412
P diff: 0.002864, S diff: 7.577965
P diff: 0.002846, S diff: 7.495820
P diff: 0.002829, S diff: 7.415872
P diff: 0.002812, S diff: 7.338027
P diff: 0.002796, S diff: 7.262192
P diff: 0.002781, S diff: 7.188282
P diff: 0.002765, S diff: 7.116216
P diff: 0.002750, S diff: 7.045918
P diff: 0.002736, S diff: 6.977315
P diff: 0.002722, S diff: 6.910340
P diff: 0.002708, S diff: 6.844929
P diff: 0.002695, S diff: 6.781020
P diff: 0.002682, S diff: 6.718557
P diff: 0.002669, S diff: 6.657486
P diff: 0.002656, S diff: 6.597754
P diff: 0.002644, S diff: 6.539313
P diff: 0.002632, S diff: 6.482117
P diff: 0.002621, S diff: 6.426123
P diff: 0.002609, S diff: 6.371288
P diff: 0.002598, S diff: 6.317574
P diff: 0.002587, S diff: 6.264943
P diff: 0.002576, S diff: 6.213359
P diff: 0.002566, S diff: 6.162788
P diff: 0.002556, S diff: 6.113197
P diff: 0.002546, S diff: 6.064557
P diff: 0.002536, S diff: 6.016837
P diff: 0.002526, S diff: 5.970009
P diff: 0.002517, S diff: 5.924047
P diff: 0.002508, S diff: 5.878924
P diff: 0.002499, S diff: 5.834616
P diff: 0.002490, S diff: 5.791098
P diff: 0.002481, S diff: 5.748349
P diff: 0.002472, S diff: 5.706347
P diff: 0.002464, S diff: 5.665070
P diff: 0.002456, S diff: 5.624498
P diff: 0.002448, S diff: 5.584612
P diff: 0.002440, S diff: 5.545394
P diff: 0.002432, S diff: 5.506825
P diff: 0.002424, S diff: 5.468887
P diff: 0.002417, S diff: 5.431565
P diff: 0.002409, S diff: 5.394842
P diff: 0.002402, S diff: 5.358702
P diff: 0.002395, S diff: 5.323130
P diff: 0.002388, S diff: 5.288112
P diff: 0.002381, S diff: 5.253634
P diff: 0.002374, S diff: 5.219682
P diff: 0.002368, S diff: 5.186242
P diff: 0.002361, S diff: 5.153302
P diff: 0.002355, S diff: 5.120850
P diff: 0.002348, S diff: 5.088874
P diff: 0.002342, S diff: 5.057362
P diff: 0.002336, S diff: 5.026302
P diff: 0.002330, S diff: 4.995684
P diff: 0.002324, S diff: 4.965497
P diff: 0.002318, S diff: 4.935731
P diff: 0.002312, S diff: 4.906376
P diff: 0.002307, S diff: 4.877422
P diff: 0.002301, S diff: 4.848860
P diff: 0.002296, S diff: 4.820680
P diff: 0.002290, S diff: 4.792874
P diff: 0.002285, S diff: 4.765433
P diff: 0.002280, S diff: 4.738349
P diff: 0.002275, S diff: 4.711613
P diff: 0.002269, S diff: 4.685218
P diff: 0.002264, S diff: 4.659156
P diff: 0.002259, S diff: 4.633419
P diff: 0.002255, S diff: 4.608000
P diff: 0.002250, S diff: 4.582892
P diff: 0.002245, S diff: 4.558088
P diff: 0.002240, S diff: 4.533581
P diff: 0.002236, S diff: 4.509366
P diff: 0.002231, S diff: 4.485434
P diff: 0.002227, S diff: 4.461781
P diff: 0.002222, S diff: 4.438400
P diff: 0.002218, S diff: 4.415286
P diff: 0.002213, S diff: 4.392432
P diff: 0.002209, S diff: 4.369833
P diff: 0.002205, S diff: 4.347485
P diff: 0.002201, S diff: 4.325380
P diff: 0.002196, S diff: 4.303515
P diff: 0.002192, S diff: 4.281884
P diff: 0.002188, S diff: 4.260482
P diff: 0.002184, S diff: 4.239306
P diff: 0.002180, S diff: 4.218349
P diff: 0.002176, S diff: 4.197608
P diff: 0.002172, S diff: 4.177078
P diff: 0.002168, S diff: 4.156755
P diff: 0.002165, S diff: 4.136635
P diff: 0.002161, S diff: 4.116713
P diff: 0.002157, S diff: 4.096987
P diff: 0.002153, S diff: 4.077451
P diff: 0.002150, S diff: 4.058103
P diff: 0.002146, S diff: 4.038939
P diff: 0.002142, S diff: 4.019954
P diff: 0.002139, S diff: 4.001146
P diff: 0.002135, S diff: 3.982512
P diff: 0.002132, S diff: 3.964047
P diff: 0.002128, S diff: 3.945749
P diff: 0.002125, S diff: 3.927615
P diff: 0.002121, S diff: 3.909642
P diff: 0.002118, S diff: 3.891827
P diff: 0.002114, S diff: 3.874166
P diff: 0.002111, S diff: 3.856658
P diff: 0.002107, S diff: 3.839299
P diff: 0.002104, S diff: 3.822087
P diff: 0.002101, S diff: 3.805019
P diff: 0.002097, S diff: 3.788093
P diff: 0.002094, S diff: 3.771307
P diff: 0.002091, S diff: 3.754657
P diff: 0.002088, S diff: 3.738143
P diff: 0.002084, S diff: 3.721760
P diff: 0.002081, S diff: 3.705508
P diff: 0.002078, S diff: 3.689385
P diff: 0.002075, S diff: 3.673387
P diff: 0.002072, S diff: 3.657514
P diff: 0.002068, S diff: 3.641763
P diff: 0.002065, S diff: 3.626132
P diff: 0.002062, S diff: 3.610620
P diff: 0.002059, S diff: 3.595224

Sanity check: RMSE over the nonzero (positive) entries of $Y$


In [26]:
loss = 0.
Y_trndev_coo = Y_trndev.tocoo()
for row, col in zip(Y_trndev_coo.row, Y_trndev_coo.col):
    diff = S[row, :].dot(P[col, :]) - 1
    loss += diff * diff
loss /= Y_trndev_coo.nnz
print('RMSE:', np.sqrt(loss))


RMSE: 0.8348427075588326

Map song features to song latent factors

Learn an MLP that maps song features to the song latent factors learned above, so that new (cold-start) songs can be scored from their features.


In [27]:
tf.set_random_seed(0)
if dataset_name == 'aotm2011':
    batch_size = 8192
    n_hidden = 512
    n_epochs = 20
else:
    batch_size = 1024
    n_hidden = 512
    n_epochs = 40
    
input_shape = (batch_size, X_trndev.shape[1])
dimensions = D
model = tf.keras.Sequential()
model.add(
    tf.layers.Dense(
        units = n_hidden,
        input_dim = X_trndev.shape[1],
        activation = 'sigmoid'))
model.add(tf.layers.Dense(units = dimensions))
model.compile(loss=tf.keras.losses.MSE,
              optimizer=tf.keras.optimizers.Adam())
model.fit(X_trndev, S, epochs=n_epochs, batch_size=batch_size)


Epoch 1/40
40468/40468 [==============================] - 1s 19us/step - loss: 0.6266
Epoch 2/40
40468/40468 [==============================] - 1s 14us/step - loss: 0.5827
Epoch 3/40
40468/40468 [==============================] - 1s 14us/step - loss: 0.5780
Epoch 4/40
40468/40468 [==============================] - 1s 15us/step - loss: 0.5756
Epoch 5/40
40468/40468 [==============================] - 1s 15us/step - loss: 0.5739
Epoch 6/40
40468/40468 [==============================] - 1s 14us/step - loss: 0.5725
Epoch 7/40
40468/40468 [==============================] - 1s 15us/step - loss: 0.5716
Epoch 8/40
40468/40468 [==============================] - 1s 14us/step - loss: 0.5706
Epoch 9/40
40468/40468 [==============================] - 1s 15us/step - loss: 0.5701
Epoch 10/40
40468/40468 [==============================] - 1s 14us/step - loss: 0.5694
Epoch 11/40
40468/40468 [==============================] - 1s 15us/step - loss: 0.5689
Epoch 12/40
40468/40468 [==============================] - 1s 16us/step - loss: 0.5687
Epoch 13/40
40468/40468 [==============================] - 1s 16us/step - loss: 0.5682
Epoch 14/40
40468/40468 [==============================] - 1s 16us/step - loss: 0.5679
Epoch 15/40
40468/40468 [==============================] - 1s 15us/step - loss: 0.5676
Epoch 16/40
40468/40468 [==============================] - 1s 15us/step - loss: 0.5673
Epoch 17/40
40468/40468 [==============================] - 1s 15us/step - loss: 0.5671
Epoch 18/40
40468/40468 [==============================] - 1s 14us/step - loss: 0.5670
Epoch 19/40
40468/40468 [==============================] - 1s 15us/step - loss: 0.5667
Epoch 20/40
40468/40468 [==============================] - 1s 16us/step - loss: 0.5664
Epoch 21/40
40468/40468 [==============================] - 1s 16us/step - loss: 0.5663
Epoch 22/40
40468/40468 [==============================] - 1s 15us/step - loss: 0.5661
Epoch 23/40
40468/40468 [==============================] - 1s 15us/step - loss: 0.5660
Epoch 24/40
40468/40468 [==============================] - 1s 15us/step - loss: 0.5658
Epoch 25/40
40468/40468 [==============================] - 1s 14us/step - loss: 0.5655
Epoch 26/40
40468/40468 [==============================] - 1s 15us/step - loss: 0.5651
Epoch 27/40
40468/40468 [==============================] - 1s 15us/step - loss: 0.5650
Epoch 28/40
40468/40468 [==============================] - 1s 15us/step - loss: 0.5645
Epoch 29/40
40468/40468 [==============================] - 1s 16us/step - loss: 0.5643
Epoch 30/40
40468/40468 [==============================] - 1s 16us/step - loss: 0.5640
Epoch 31/40
40468/40468 [==============================] - 1s 15us/step - loss: 0.5635
Epoch 32/40
40468/40468 [==============================] - 1s 15us/step - loss: 0.5632
Epoch 33/40
40468/40468 [==============================] - 1s 15us/step - loss: 0.5628
Epoch 34/40
40468/40468 [==============================] - 1s 15us/step - loss: 0.5623
Epoch 35/40
40468/40468 [==============================] - 1s 16us/step - loss: 0.5618
Epoch 36/40
40468/40468 [==============================] - 1s 16us/step - loss: 0.5611
Epoch 37/40
40468/40468 [==============================] - 1s 15us/step - loss: 0.5607
Epoch 38/40
40468/40468 [==============================] - 1s 16us/step - loss: 0.5602
Epoch 39/40
40468/40468 [==============================] - 1s 16us/step - loss: 0.5595
Epoch 40/40
40468/40468 [==============================] - 1s 14us/step - loss: 0.5587
Out[27]:
<tensorflow.python.keras.callbacks.History at 0x7f6c8304a6d8>

In [28]:
X_test_factors = model.predict(X_test, batch_size=X_test.shape[0])
X_test_factors.shape


Out[28]:
(5000, 300)

In [29]:
rps_mf = []
hitrates_mf = {top: [] for top in TOPs}
aucs_mf = []
spreads_mf = []
novelties_mf = {top: dict() for top in TOPs}
ptops_mf = []
# artist_diversities_mf = {top: [] for top in TOPs}
# genre_diversities_mf = {top: [] for top in TOPs}
np.random.seed(0)

npos = Y_test.sum(axis=0).A.reshape(-1)
assert Y_test.shape[0] == len(test_songs)
for j in range(Y_test.shape[1]):
    if (j+1) % 100 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()

    if npos[j] < 1:
        continue
        
    y_true = Y_test[:, j].A.reshape(-1)
    y_pred = np.dot(X_test_factors, P[j])

    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_mf.append(rp)
    for top in TOPs:
        hitrates_mf[top].append(hr_dict[top])
    aucs_mf.append(auc)
    
    # spread
    y_pred_prob = softmax(y_pred)
    spreads_mf.append(-np.dot(y_pred_prob, np.log(y_pred_prob)))

    # novelty
    sortix = np.argsort(-y_pred)
    u = pl2u[j]
    for top in TOPs:
        nov = np.mean([-np.log2(song2pop[index2song_test[ix]]) for ix in sortix[:top]])
        try:
            novelties_mf[top][u].append(nov)
        except KeyError:
            novelties_mf[top][u] = [nov]
    
    # PTop: (#pos ranked above the top-ranked negative) / #pos
    assert y_true.dtype == np.bool
    negIx = (1 - y_true).astype(np.bool)
    negMax = y_pred[negIx].max()
    pt = (y_pred[y_true] > negMax).sum() / npos[j]
    ptops_mf.append(pt)
    
    # artist/genre diversity
#     for top in TOPs:
#         artist_vec = np.array([song2artist[index2song_test[ix]] if index2song_test[ix] in song2artist
#                                else str(np.random.rand()) for ix in sortix[:top]])
#         genre_vec = np.array([song2genre[index2song_test[ix]] if index2song_test[ix] in song2genre \
#                               else str(np.random.rand()) for ix in sortix[:top]])
#         artist_diversities_mf[top].append( diversity(artist_vec) )
#         genre_diversities_mf[top].append( diversity(genre_vec) )
    
print('\n%d / %d' % (len(rps_mf), Y_test.shape[1]))


17300 / 17342
8215 / 17342

In [30]:
perf_mf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_mf), 
                                  'Hit-Rate': {top: np.mean(hitrates_mf[top]) for top in TOPs},
                                  'AUC': np.mean(aucs_mf),
                                  'Spread': np.mean(spreads_mf),
                                  'Novelty': {t: np.mean([np.mean(novelties_mf[t][u]) for u in novelties_mf[t]]) 
                                              for t in TOPs},
                                  'PTop': np.mean(ptops_mf),
                                  # 'Artist-Diversity': {top: np.mean(artist_diversities_mf[top]) for top in TOPs},
                                  # 'Genre-Diversity': {top: np.mean(genre_diversities_mf[top]) for top in TOPs}},
                                  },
                          'Test_All': {'R-Precision': rps_mf,
                                       'Hit-Rate': {top: hitrates_mf[top] for top in TOPs},
                                       'AUC': aucs_mf,
                                       'Spread': spreads_mf,
                                       'Novelty': novelties_mf,
                                       'PTop': ptops_mf,
                                       # 'Artist-Diversity': artist_diversities_mf,
                                       # 'Genre-Diversity': genre_diversities_mf}}}
                                      }}}
perf_mf[dataset_name]['Test']


Out[30]:
{'R-Precision': 0.038166174531040105,
 'Hit-Rate': {5: 0.05960214133651911,
  10: 0.08612297379386138,
  20: 0.1225691088068307,
  30: 0.15236368778496293,
  50: 0.19903768753803333,
  100: 0.2785863033085597,
  200: 0.3830391347053173,
  300: 0.4538939740834353,
  500: 0.5594879010085065,
  700: 0.63103020036896,
  1000: 0.7077604622772226},
 'AUC': 0.8140231867027435,
 'Spread': 8.517191673556496,
 'Novelty': {5: -3.1346224230630986,
  10: -3.049614741081545,
  20: -2.97092830320734,
  30: -2.9210127681875586,
  50: -2.847655412643849,
  100: -2.728113108586504,
  200: -2.5778971099149577,
  300: -2.472568233532807,
  500: -2.3123196370735415,
  700: -2.1843251177716825,
  1000: -2.0267584321002774},
 'PTop': 0.023228090136761654}

In [31]:
fperf_mf = os.path.join(data_dir, 'perf-mf.pkl')
print(fperf_mf)
pkl.dump(perf_mf, open(fperf_mf, 'wb'))
pkl.load(open(fperf_mf, 'rb'))[dataset_name]['Test']


data/30music/coldstart/setting1/perf-mf.pkl
Out[31]:
{'R-Precision': 0.038166174531040105,
 'Hit-Rate': {5: 0.05960214133651911,
  10: 0.08612297379386138,
  20: 0.1225691088068307,
  30: 0.15236368778496293,
  50: 0.19903768753803333,
  100: 0.2785863033085597,
  200: 0.3830391347053173,
  300: 0.4538939740834353,
  500: 0.5594879010085065,
  700: 0.63103020036896,
  1000: 0.7077604622772226},
 'AUC': 0.8140231867027435,
 'Spread': 8.517191673556496,
 'Novelty': {5: -3.1346224230630986,
  10: -3.049614741081545,
  20: -2.97092830320734,
  30: -2.9210127681875586,
  50: -2.847655412643849,
  100: -2.728113108586504,
  200: -2.5778971099149577,
  300: -2.472568233532807,
  500: -2.3123196370735415,
  700: -2.1843251177716825,
  1000: -2.0267584321002774},
 'PTop': 0.023228090136761654}