In [1]:
%matplotlib inline
import os, sys, time, gzip
import pickle as pkl
import numpy as np
from scipy.sparse import lil_matrix, csr_matrix, issparse
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
# from tools import calc_RPrecision_HitRate
from tools import calc_metrics, diversity, pairwise_distance_hamming, softmax
In [3]:
# Make NumPy raise FloatingPointError for every floating-point error category
# (divide, over, under, invalid) instead of emitting a warning.
# As the last expression of the cell, the previous settings are displayed.
np.seterr(all='raise')
Out[3]:
In [4]:
# Cut-off ranks passed to calc_metrics() and used for the novelty@top scores below.
TOPs = [5, 10, 20, 30, 50, 100, 200, 300, 500, 700, 1000]
In [5]:
# Names of the available datasets; the chosen name selects the sub-directory of data/.
datasets = ['aotm2011', '30music']
In [6]:
# Pick the dataset evaluated in this run (index into `datasets`) and display its name.
dix = 1
dataset_name = datasets[dix]
dataset_name
Out[6]:
In [7]:
# Load the train+dev and test feature (X) / label (Y) matrices of cold-start setting 1.
# NOTE: pickle.load can execute arbitrary code -- only load trusted data files.
data_dir = 'data/%s/coldstart/setting1' % dataset_name


def _load_pkl_gz(fname):
    """Unpickle the gzip-compressed file `fname`, closing the handle afterwards."""
    with gzip.open(fname, 'rb') as fd:
        return pkl.load(fd)


X_trndev = _load_pkl_gz(os.path.join(data_dir, 'X_trndev.pkl.gz'))
Y_trndev = _load_pkl_gz(os.path.join(data_dir, 'Y_trndev.pkl.gz'))
X_test = _load_pkl_gz(os.path.join(data_dir, 'X_test.pkl.gz'))
Y_test = _load_pkl_gz(os.path.join(data_dir, 'Y_test.pkl.gz'))
In [8]:
# Load the train / dev / test song lists of setting 1; the file handle is
# closed as soon as the pickle has been read.
with gzip.open(os.path.join(data_dir, 'songs_train_dev_test_s1.pkl.gz'), 'rb') as fd:
    songs1 = pkl.load(fd)
train_songs = songs1['train_song_set']
dev_songs = songs1['dev_song_set']
test_songs = songs1['test_song_set']
In [9]:
# Lookup tables between song ids and their positions: train+dev songs are
# numbered consecutively (train first), test songs are numbered separately.
song2index_trndev = {
    song_id: pos for pos, (song_id, _) in enumerate(train_songs + dev_songs)
}
song2index_test = {
    song_id: pos for pos, (song_id, _) in enumerate(test_songs)
}
index2song_test = {
    pos: song_id for pos, (song_id, _) in enumerate(test_songs)
}
In [10]:
# Load the global song -> artist table and keep only the songs of this dataset
# split; songs missing from the table are simply absent from song2artist.
with gzip.open('data/msd/song2artist.pkl.gz', 'rb') as fd:
    _song2artist = pkl.load(fd)
song2artist = {sid: _song2artist[sid] for sid, _ in train_songs + dev_songs + test_songs if sid in _song2artist}
In [11]:
# Load all playlists of setting 1; each entry is unpacked below as a
# (list of song ids, <second field>) pair.
with gzip.open(os.path.join(data_dir, 'playlists_s1.pkl.gz'), 'rb') as fd:
    all_playlists = pkl.load(fd)
In [12]:
# Artist popularity: the number of playlist entries whose (non-test) song
# belongs to the artist.
artist2pop = dict()
# test_songs holds (song id, <second field>) pairs, so the set must be built
# from the ids themselves: a set of the raw pairs never matches a bare song id,
# which would let every test song through the `not in test_songset` filters.
test_songset = {sid for sid, _ in test_songs}
for pl, _ in all_playlists:
    for sid in pl:
        # skip held-out test songs and songs without a known artist
        if sid in test_songset or sid not in song2artist:
            continue
        aid = song2artist[sid]
        artist2pop[aid] = artist2pop.get(aid, 0) + 1
In [13]:
# Load the song -> genre table; the file handle is closed once it has been read.
with gzip.open('data/msd/song2genre.pkl.gz', 'rb') as fd:
    song2genre = pkl.load(fd)
In [14]:
# all_genre = set(song2genre.values())
# all_genre
In [15]:
# Load the cliques of the train+dev playlists (used below to map playlists to
# clique indices); the file handle is closed once it has been read.
with gzip.open(os.path.join(data_dir, 'cliques_trndev.pkl.gz'), 'rb') as fd:
    cliques_all = pkl.load(fd)
In [16]:
# Map every playlist (column of Y_test) to the index of the clique listing it.
# Entries not covered by any clique keep the initial value 0.
# NOTE(review): each clique presumably groups the playlists of one user -- confirm.
U = len(cliques_all)
pl2u = np.zeros(Y_test.shape[1], dtype=np.int32)
for u in range(U):
    clq = cliques_all[u]  # used as an index (array) into pl2u
    pl2u[clq] = u
In [17]:
# Y_user = np.zeros((Y_test.shape[0], U), dtype=np.int)
# for u in range(U):
# clq = cliques_all[u]
# Y_user[:, u] = Y_test[:, clq].sum(axis=1).A.reshape(-1).astype(np.bool).astype(np.int)
# Y_user = csr_matrix(Y_user)
# print(Y_user.shape)
In [18]:
# Load the song -> popularity table (used for the novelty scores below); the
# file handle is closed once it has been read.
with gzip.open(os.path.join(data_dir, 'song2pop.pkl.gz'), 'rb') as fd:
    song2pop = pkl.load(fd)
In [19]:
# Display the dimensions of the test feature matrix.
X_test.shape
Out[19]:
In [20]:
# Display the dimensions of the test label matrix (rows are checked against
# len(test_songs) further below).
Y_test.shape
Out[20]:
In [21]:
# Y_test_csr = Y_test.tocsr()
Note that p XOR q = (p AND NOT q) OR (NOT p AND q).
From this identity,
let $\mathbf{p}, \mathbf{q} \in \{0, 1\}^{n}$, then
$ \begin{aligned} & \text{Hamming_distance}(\mathbf{p}, \mathbf{q}) \\ & = \frac{1}{n} \sum_{i=1}^n p_i \ \text{XOR} \ q_i \\ & = \frac{1}{n} \sum_{i=1}^n \left( p_i (1 - q_i) + (1 - p_i) q_i \right) \\ & = \frac{1}{n} \left( \sum_{i=1}^n p_i (1 - q_i) + \sum_{i=1}^n (1 - p_i) q_i \right) \\ & = \frac{1}{n} \left( \mathbf{p}^\top (\mathbf{1} - \mathbf{q}) + (\mathbf{1} - \mathbf{p})^\top \mathbf{q} \right) \\ & = \frac{1}{n} \left( \text{sum}(\mathbf{p}) + \text{sum}(\mathbf{q}) - 2 \mathbf{p}^\top \mathbf{q} \right) \end{aligned} $
In [22]:
# Numerical check of the Hamming-distance identities derived above on a random
# binary matrix with 30% ones: both printed squared differences should be ~0.
N, D = 1000, 200
# `int` instead of the alias `np.int`, which was removed in NumPy 1.24
aa = np.zeros(N * D, dtype=int)
idx = np.random.permutation(N * D)[:int(N * D * .3)]
aa[idx] = 1
aa = aa.reshape(N, D)

# reference implementation
d1 = pairwise_distances(aa, metric='hamming', n_jobs=2)

# p^T (1 - q) + (1 - p)^T q, normalised by the dimension
d2 = (np.dot(aa, 1-aa.T) + np.dot(1-aa, aa.T)) / D

# sum(p) + sum(q) - 2 p^T q, normalised by the dimension
sum_vec = aa.sum(axis=1, keepdims=True)
d3 = (sum_vec + sum_vec.T - 2 * np.dot(aa, aa.T)) / D

diff = (d1 - d2).ravel()
print(np.dot(diff, diff))

diff2 = (d1 - d3).ravel()
print(np.dot(diff2, diff2))
In [23]:
# aa = Y_test_csr[:500, :].A
# aa = Y_user[:10, :].A
# aa_csr = csr_matrix(aa)
# t0 = time.time()
# d1 = pairwise_distances(aa, metric='hamming', n_jobs=2)
# t1 = time.time()
# d2 = pairwise_distance_hamming(aa_csr)
# t2 = time.time()
# diff = (d1 - d2.A).ravel()
# print(np.sqrt(np.dot(diff, diff)))
# print('%.3f sec, %.3f sec' % (t1 - t0, t2 - t1))
In [24]:
# def diversity(vec):
# assert vec.ndim == 1
# norm = len(vec) * (len(vec) - 1)
# sim_mat = vec[..., np.newaxis] == vec[np.newaxis, ...] # pairwise comparison
# # dist_mat = 1 - sim_mat
# # return (dist_mat.sum() - dist_mat.trace()) / norm # note that dist_mat.trace() = 0
# return (1 - sim_mat).sum() / norm
In [25]:
# Check that a label column is boolean (builtin `bool` replaces the alias
# `np.bool`, which was removed in NumPy 1.24).
Y_test[:, 3].A.reshape(-1).dtype == bool
Out[25]:
In [26]:
# Sum of the entries in column 3 of Y_test.
Y_test[:, 3].A.reshape(-1).sum()
Out[26]:
In [27]:
# Index the column with `1 - column` WITHOUT converting the index to boolean.
# NOTE(review): if the column is boolean, `1 - column` is an integer 0/1 array,
# so this is integer (fancy) indexing rather than a mask -- compare the shape
# with the boolean-mask version in the next cell.
Y_test[:, 3].A.reshape(-1)[(1 - Y_test[:, 3].A.reshape(-1))].shape
Out[27]:
In [28]:
# Select the entries of column 3 where the label is 0, using a boolean mask
# (builtin `bool` replaces the alias `np.bool`, which was removed in NumPy 1.24).
Y_test[:, 3].A.reshape(-1)[(1 - Y_test[:, 3].A.reshape(-1)).astype(bool)].shape
Out[28]:
In [29]:
# Popularity baseline: every test song is scored by the log of its artist's
# popularity, so all playlists share one ranking of the test songs. Each
# playlist with at least one positive test song is then evaluated.
rps_pop = []
hitrates_pop = {top: [] for top in TOPs}
aucs_pop = []
spreads_pop = []
novelties_pop = {top: dict() for top in TOPs}  # top -> {clique index: [novelty, ...]}
artist_diversities_pop = {top: [] for top in TOPs}  # stays empty: diversity is disabled
genre_diversities_pop = {top: [] for top in TOPs}   # stays empty: diversity is disabled
ptops_pop = []

np.random.seed(0)

# Songs without a known artist, or whose artist has no popularity count, keep score 0.
y_pred = np.zeros(len(test_songs))
for ix in range(len(test_songs)):
    sid = index2song_test[ix]
    if sid in song2artist:
        aid = song2artist[sid]
        if aid in artist2pop:
            y_pred[ix] = np.log(artist2pop[aid])

# y_pred is the same for every playlist, so everything that depends only on it
# is computed once here instead of once per playlist.
# NOTE(review): assumes calc_metrics() does not modify y_pred in place.
y_pred_prob = softmax(y_pred)
spread = -np.dot(y_pred_prob, np.log(y_pred_prob))  # -sum(p * log p) of the softmax-ed scores
sortix = np.argsort(-y_pred)                        # test songs by decreasing score
nov_at = {top: np.mean([-np.log2(song2pop[index2song_test[ix]]) for ix in sortix[:top]])
          for top in TOPs}

npos = Y_test.sum(axis=0).A.reshape(-1)
assert Y_test.shape[0] == len(test_songs)
for j in range(Y_test.shape[1]):
    if (j+1) % 100 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()
    # playlists without any positive test song cannot be evaluated
    if npos[j] < 1:
        continue
    y_true = Y_test[:, j].A.reshape(-1)

    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_pop.append(rp)
    for top in TOPs:
        hitrates_pop[top].append(hr_dict[top])
    aucs_pop.append(auc)

    # spread
    spreads_pop.append(spread)

    # novelty, grouped by the clique index of the playlist
    u = pl2u[j]
    for top in TOPs:
        novelties_pop[top].setdefault(u, []).append(nov_at[top])

    # PTop: (#pos ranked above the top-ranked negative) / #pos
    # (builtin `bool` replaces the alias `np.bool`, removed in NumPy 1.24)
    assert y_true.dtype == bool
    negIx = ~y_true
    negMax = y_pred[negIx].max()
    pt = (y_pred[y_true] > negMax).sum() / npos[j]
    ptops_pop.append(pt)

print('\n%d / %d' % (len(rps_pop), Y_test.shape[1]))
In [30]:
# fig = plt.figure(figsize=[20, 5])
# ax1 = plt.subplot(131)
# ax1.hist(rps_pop, bins=100)
# ax1.set_yscale('log')
# ax1.set_title('R-Precision')
# #ax.set_xlim(0, xmax)
# ax2 = plt.subplot(132)
# ax2.hist(aucs_pop, bins=100)
# ax2.set_yscale('log')
# ax2.set_title('AUC')
# pass
In [31]:
# Collect the results of the popularity baseline: 'Test' holds the averaged
# scores, 'Test_All' the raw per-playlist lists (novelty: per clique index).
# The averaged scores are displayed as the cell output.
pop_perf = {
    dataset_name: {
        'Test': {
            'R-Precision': np.mean(rps_pop),
            'Hit-Rate': {k: np.mean(hitrates_pop[k]) for k in TOPs},
            'AUC': np.mean(aucs_pop),
            'Spread': np.mean(spreads_pop),
            # mean over cliques of each clique's mean novelty
            'Novelty': {
                k: np.mean([np.mean(novs) for novs in novelties_pop[k].values()])
                for k in TOPs
            },
            'PTop': np.mean(ptops_pop),
        },
        'Test_All': {
            'R-Precision': rps_pop,
            'Hit-Rate': {k: hitrates_pop[k] for k in TOPs},
            'AUC': aucs_pop,
            'Spread': spreads_pop,
            'Novelty': novelties_pop,
            'PTop': ptops_pop,
        },
    },
}
pop_perf[dataset_name]['Test']
Out[31]:
In [32]:
# Save the results of the popularity baseline, then read the file back and
# display the averaged scores as a sanity check. Both file handles are closed
# explicitly so the data is flushed to disk before it is re-read.
fperf_pop = os.path.join(data_dir, 'perf-pop.pkl')
print(fperf_pop)
with open(fperf_pop, 'wb') as fd:
    pkl.dump(pop_perf, fd)
with open(fperf_pop, 'rb') as fd:
    pop_perf_reloaded = pkl.load(fd)
pop_perf_reloaded[dataset_name]['Test']
Out[32]:
Recommend according to the popularity of artists in listening history.
In [25]:
# "Same artists" baseline: for each playlist, a test song gets the log
# popularity of its artist if that artist already occurs among the playlist's
# non-test songs, and 0 otherwise.
rps_sagh = []
hitrates_sagh = {top: [] for top in TOPs}
aucs_sagh = []
spreads_sagh = []
novelties_sagh = {top: dict() for top in TOPs}  # top -> {clique index: [novelty, ...]}
ptops_sagh = []

np.random.seed(0)

npos = Y_test.sum(axis=0).A.reshape(-1)
assert Y_test.shape[0] == len(test_songs)
for j in range(Y_test.shape[1]):
    if (j+1) % 100 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()
    # playlists without any positive test song cannot be evaluated
    if npos[j] < 1:
        continue
    y_true = Y_test[:, j].A.reshape(-1)

    # artists of the playlist's non-test songs
    y_pred = np.zeros(y_true.shape)
    pl = all_playlists[j][0]
    artists = set([song2artist[sid] for sid in pl if (sid not in test_songset) and (sid in song2artist)])
    assert len(artists) > 0

    for ix in range(Y_test.shape[0]):
        sid = index2song_test[ix]
        if sid in song2artist:
            aid = song2artist[sid]
            if aid in artists and aid in artist2pop:
                y_pred[ix] = np.log(artist2pop[aid])

    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_sagh.append(rp)
    for top in TOPs:
        hitrates_sagh[top].append(hr_dict[top])
    aucs_sagh.append(auc)

    # spread: -sum(p * log p) of the softmax-ed scores
    y_pred_prob = softmax(y_pred)
    spreads_sagh.append(-np.dot(y_pred_prob, np.log(y_pred_prob)))

    # novelty of the top-ranked songs, grouped by the clique index of the playlist
    sortix = np.argsort(-y_pred)
    u = pl2u[j]
    for top in TOPs:
        nov = np.mean([-np.log2(song2pop[index2song_test[ix]]) for ix in sortix[:top]])
        novelties_sagh[top].setdefault(u, []).append(nov)

    # PTop: (#pos ranked above the top-ranked negative) / #pos
    # (builtin `bool` replaces the alias `np.bool`, removed in NumPy 1.24)
    assert y_true.dtype == bool
    negIx = ~y_true
    negMax = y_pred[negIx].max()
    pt = (y_pred[y_true] > negMax).sum() / npos[j]
    ptops_sagh.append(pt)

print('\n%d / %d' % (len(rps_sagh), Y_test.shape[1]))
In [26]:
# fig = plt.figure(figsize=[20, 5])
# ax1 = plt.subplot(131)
# ax1.hist(rps_sagh, bins=100)
# ax1.set_yscale('log')
# ax1.set_title('R-Precision')
# #ax.set_xlim(0, xmax)
# ax2 = plt.subplot(132)
# ax2.hist(aucs_sagh, bins=100)
# ax2.set_yscale('log')
# ax2.set_title('AUC')
# pass
In [27]:
# Collect the results of the SAGH baseline: 'Test' holds the averaged scores,
# 'Test_All' the raw per-playlist lists (novelty: per clique index).
# The averaged scores are displayed as the cell output.
sagh_perf = {
    dataset_name: {
        'Test': {
            'R-Precision': np.mean(rps_sagh),
            'Hit-Rate': {k: np.mean(hitrates_sagh[k]) for k in TOPs},
            'AUC': np.mean(aucs_sagh),
            'Spread': np.mean(spreads_sagh),
            # mean over cliques of each clique's mean novelty
            'Novelty': {
                k: np.mean([np.mean(novs) for novs in novelties_sagh[k].values()])
                for k in TOPs
            },
            'PTop': np.mean(ptops_sagh),
        },
        'Test_All': {
            'R-Precision': rps_sagh,
            'Hit-Rate': {k: hitrates_sagh[k] for k in TOPs},
            'AUC': aucs_sagh,
            'Spread': spreads_sagh,
            'Novelty': novelties_sagh,
            'PTop': ptops_sagh,
        },
    },
}
sagh_perf[dataset_name]['Test']
Out[27]:
In [28]:
# Save the results of the SAGH baseline, then read the file back and display
# the averaged scores as a sanity check. Both file handles are closed
# explicitly so the data is flushed to disk before it is re-read.
fperf_sagh = os.path.join(data_dir, 'perf-sagh.pkl')
print(fperf_sagh)
with open(fperf_sagh, 'wb') as fd:
    pkl.dump(sagh_perf, fd)
with open(fperf_sagh, 'rb') as fd:
    sagh_perf_reloaded = pkl.load(fd)
sagh_perf_reloaded[dataset_name]['Test']
Out[28]:
Compute the similarity of two artists $a_1$ and $a_2$ given a set of playlists $P$:
$$
\text{sim}(a_1, a_2)
= \frac{\sum_{p \in P} \delta(a_1, p) \times \delta(a_2, p)}
{\sqrt{\sum_{p \in P} \delta(a_1, p) \times \sum_{p \in P} \delta(a_2, p)}}
$$
where
$$
\delta(a, p)
= \begin{cases}
1, \ \text{at least one song in playlist $p$ is from artist $a$}, \\
0, \ \text{otherwise}.
\end{cases}
$$
Recommend according to the popularity of each song's artist, weighted by the similarity between the pairs (artist in user's listening history, artist of song).
In [25]:
# Sorted list of the distinct artists of all non-test songs (with a known
# artist) that occur in any playlist.
all_artist_trndev = sorted({
    song2artist[song_id]
    for songs, _ in all_playlists
    for song_id in songs
    if song_id in song2artist and song_id not in test_songset
})
In [26]:
# Position of every artist in the sorted artist list (row index of Delta below).
artist2index = dict(zip(all_artist_trndev, range(len(all_artist_trndev))))
In [27]:
# Artist-by-playlist indicator matrix: Delta[a, p] = 1 iff playlist p contains
# at least one non-test song of artist a.
Na = len(all_artist_trndev)
Np = len(all_playlists)
# builtin `float` replaces the alias `np.float`, which was removed in NumPy 1.24
Delta = lil_matrix((Na, Np), dtype=float)
for j in range(Np):
    pl_artist = sorted(set([song2artist[sid] for sid in all_playlists[j][0]
                            if (sid not in test_songset) and (sid in song2artist)]))
    ix = [artist2index[aid] for aid in pl_artist]
    # nothing to mark for a playlist without any known non-test artist
    if ix:
        Delta[ix, j] = 1
In [28]:
# Co-occurrence counts between artists: ColloMat[a, b] is the number of
# playlists containing both artists; the diagonal must therefore equal the
# number of playlists per artist (Dsum).
Delta = Delta.tocsr()
Dsum = np.asarray(Delta.sum(axis=1)).reshape(-1)
ColloMat = (Delta @ Delta.T).toarray()
assert np.allclose(ColloMat.diagonal(), Dsum)
In [29]:
# Sanity check: one row sum per artist, so both numbers should be equal.
print(len(Dsum), len(all_artist_trndev))
In [30]:
#type(ColloMat)
In [31]:
# Artist similarity as defined above: the co-occurrence count of two artists
# divided by the square root of the product of their playlist counts.
T1 = 1. / np.sqrt(Dsum)
NormMat = np.outer(T1, T1)         # NormMat[a, b] = 1 / sqrt(Dsum[a] * Dsum[b])
WeightMat = ColloMat * NormMat     # element-wise product
In [32]:
# "Collocated artists" baseline: for each playlist, a test song is scored by
# the log popularity of its artist multiplied by the summed similarity between
# that artist and the artists of the playlist's non-test songs.
rps_cagh = []
hitrates_cagh = {top: [] for top in TOPs}
aucs_cagh = []
spreads_cagh = []
novelties_cagh = {top: dict() for top in TOPs}  # top -> {clique index: [novelty, ...]}
ptops_cagh = []

np.random.seed(0)

npos = Y_test.sum(axis=0).A.reshape(-1)
assert Y_test.shape[0] == len(test_songs)
for j in range(Y_test.shape[1]):
    if (j+1) % 10 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()
    # playlists without any positive test song cannot be evaluated
    if npos[j] < 1:
        continue
    y_true = Y_test[:, j].A.reshape(-1)

    # artists of the playlist's non-test songs and their rows/columns in WeightMat
    y_pred = np.zeros(y_true.shape)
    pl = all_playlists[j][0]
    artists = set([song2artist[sid] for sid in pl if (sid not in test_songset) and (sid in song2artist)])
    assert len(artists) > 0
    artists_ix = [artist2index[aid] for aid in artists]

    for ix in range(Y_test.shape[0]):
        sid = index2song_test[ix]
        if sid in song2artist:
            aid = song2artist[sid]
            if aid in artist2pop:
                aix = artist2index[aid]
                y_pred[ix] = np.log(artist2pop[aid]) * WeightMat[aix, artists_ix].sum()

    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_cagh.append(rp)
    for top in TOPs:
        hitrates_cagh[top].append(hr_dict[top])
    aucs_cagh.append(auc)

    # spread: -sum(p * log p) of the softmax-ed scores
    y_pred_prob = softmax(y_pred)
    spreads_cagh.append(-np.dot(y_pred_prob, np.log(y_pred_prob)))

    # novelty of the top-ranked songs, grouped by the clique index of the playlist
    sortix = np.argsort(-y_pred)
    u = pl2u[j]
    for top in TOPs:
        nov = np.mean([-np.log2(song2pop[index2song_test[ix]]) for ix in sortix[:top]])
        novelties_cagh[top].setdefault(u, []).append(nov)

    # PTop: (#pos ranked above the top-ranked negative) / #pos
    # (builtin `bool` replaces the alias `np.bool`, removed in NumPy 1.24)
    assert y_true.dtype == bool
    negIx = ~y_true
    negMax = y_pred[negIx].max()
    pt = (y_pred[y_true] > negMax).sum() / npos[j]
    ptops_cagh.append(pt)

print('\n%d / %d' % (len(rps_cagh), Y_test.shape[1]))
In [33]:
# fig = plt.figure(figsize=[20, 5])
# ax1 = plt.subplot(131)
# ax1.hist(rps_cagh, bins=100)
# ax1.set_yscale('log')
# ax1.set_title('R-Precision')
# #ax.set_xlim(0, xmax)
# ax2 = plt.subplot(132)
# ax2.hist(aucs_cagh, bins=100)
# ax2.set_yscale('log')
# ax2.set_title('AUC')
# pass
In [34]:
# Collect the results of the CAGH baseline: 'Test' holds the averaged scores,
# 'Test_All' the raw per-playlist lists (novelty: per clique index).
# The averaged scores are displayed as the cell output.
cagh_perf = {
    dataset_name: {
        'Test': {
            'R-Precision': np.mean(rps_cagh),
            'Hit-Rate': {k: np.mean(hits) for k, hits in hitrates_cagh.items()},
            'AUC': np.mean(aucs_cagh),
            'Spread': np.mean(spreads_cagh),
            # mean over cliques of each clique's mean novelty
            'Novelty': {
                k: np.mean([np.mean(novs) for novs in novelties_cagh[k].values()])
                for k in TOPs
            },
            'PTop': np.mean(ptops_cagh),
        },
        'Test_All': {
            'R-Precision': rps_cagh,
            'Hit-Rate': {k: hitrates_cagh[k] for k in TOPs},
            'AUC': aucs_cagh,
            'Spread': spreads_cagh,
            'Novelty': novelties_cagh,
            'PTop': ptops_cagh,
        },
    },
}
cagh_perf[dataset_name]['Test']
Out[34]:
In [35]:
# Save the results of the CAGH baseline, then read the file back and display
# the averaged scores as a sanity check. Both file handles are closed
# explicitly so the data is flushed to disk before it is re-read.
fperf_cagh = os.path.join(data_dir, 'perf-cagh.pkl')
print(fperf_cagh)
with open(fperf_cagh, 'wb') as fd:
    pkl.dump(cagh_perf, fd)
with open(fperf_cagh, 'rb') as fd:
    cagh_perf_reloaded = pkl.load(fd)
cagh_perf_reloaded[dataset_name]['Test']
Out[35]:
Let $S \in \mathbb{R}^{M \times D}$ and $P \in \mathbb{R}^{N \times D}$ be the latent factors of songs and playlists, respectively, and let $Y \in \mathbb{R}^{M \times N}$ be the observed song-playlist matrix.
The optimisation objective:
$$
\begin{aligned}
J = \sum_{m=1}^M \sum_{n=1}^N \left( y_{m,n} - \mathbf{s}_m^\top \mathbf{p}_n \right)^2
+ C \left( \sum_{m=1}^M \mathbf{s}_m^\top \mathbf{s}_m + \sum_{n=1}^N \mathbf{p}_n^\top \mathbf{p}_n \right)
\end{aligned}
$$
Use alternating least squares optimisation method:
In [25]:
# Factorise Y_trndev (M by N) as S P^T by alternating least squares: each half
# step solves the regularised normal equations for one factor matrix while the
# other one is held fixed.
np.random.seed(0)
D = 300         # number of latent dimensions
C = 1e-5        # regularisation weight, added to the diagonal of the Gram matrices
n_sweeps = 200  # number of sweeps (one P update followed by one S update)
M, N = Y_trndev.shape
S = np.random.rand(M, D)
P = np.random.rand(N, D)

# alternating least squares
for sweep in range(n_sweeps):
    # fix S, optimise P
    SS = np.dot(S.T, S)  # D by D
    np.fill_diagonal(SS, C + SS.diagonal())  # SS <- S^T S + C * I
    P_new = np.dot(Y_trndev.transpose().dot(S), np.linalg.inv(SS).T)  # N by D
    pdiff = (P_new - P).ravel()
    P = P_new

    # fix P, optimise S
    PP = np.dot(P.T, P)  # D by D
    np.fill_diagonal(PP, C + PP.diagonal())  # PP <- P^T P + C * I
    S_new = np.dot(Y_trndev.dot(P), np.linalg.inv(PP).T)  # M by D
    sdiff = (S_new - S).ravel()
    S = S_new

    # Euclidean norm of the change of each factor matrix in this sweep
    print('P diff: {:8.6f}, S diff: {:8.6f}'.format(np.sqrt(pdiff.dot(pdiff)), np.sqrt(sdiff.dot(sdiff))))
Sanity check, RMSE
In [26]:
# Reconstruction error of the factorisation, measured only on the stored
# (non-zero) entries of Y_trndev: each one is compared against the target 1.
loss = 0.
Y_trndev_coo = Y_trndev.tocoo()
for row, col in zip(Y_trndev_coo.row, Y_trndev_coo.col):
    diff = S[row, :].dot(P[col, :]) - 1  # predicted score minus target 1
    loss += diff * diff
loss /= Y_trndev_coo.nnz  # mean squared error over the stored entries
print('RMSE:', np.sqrt(loss))
Learn an MLP to map song features to song latent factors, adapted from here
In [27]:
# Fit a one-hidden-layer MLP (sigmoid hidden units, linear output) that
# regresses the song factors S from the song features X_trndev with an MSE loss.
# NOTE(review): tf.set_random_seed and tf.layers look like TensorFlow 1.x
# APIs -- confirm the installed TensorFlow version before re-running.
tf.set_random_seed(0)

# per-dataset training hyper-parameters
if dataset_name == 'aotm2011':
    batch_size = 8192
    n_hidden = 512
    n_epochs = 20
else:
    batch_size = 1024
    n_hidden = 512
    n_epochs = 40

input_shape = (batch_size, X_trndev.shape[1])  # not referenced again in this cell
dimensions = D  # output size = number of latent dimensions

model = tf.keras.Sequential()
model.add(
    tf.layers.Dense(
        units = n_hidden,
        input_dim = X_trndev.shape[1],
        activation = 'sigmoid'))
model.add(tf.layers.Dense(units = dimensions))
model.compile(loss=tf.keras.losses.MSE,
              optimizer=tf.keras.optimizers.Adam())
model.fit(X_trndev, S, epochs=n_epochs, batch_size=batch_size)
Out[27]:
In [28]:
# Predict factors for the test songs from their features, in a single batch,
# and display the shape of the result.
X_test_factors = model.predict(X_test, batch_size=X_test.shape[0])
X_test_factors.shape
Out[28]:
In [29]:
# Matrix-factorisation baseline: for each playlist, a test song is scored by
# the dot product of its predicted factors with the playlist factors P[j].
rps_mf = []
hitrates_mf = {top: [] for top in TOPs}
aucs_mf = []
spreads_mf = []
novelties_mf = {top: dict() for top in TOPs}  # top -> {clique index: [novelty, ...]}
ptops_mf = []

np.random.seed(0)

npos = Y_test.sum(axis=0).A.reshape(-1)
assert Y_test.shape[0] == len(test_songs)
for j in range(Y_test.shape[1]):
    if (j+1) % 100 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()
    # playlists without any positive test song cannot be evaluated
    if npos[j] < 1:
        continue
    y_true = Y_test[:, j].A.reshape(-1)
    y_pred = np.dot(X_test_factors, P[j])

    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_mf.append(rp)
    for top in TOPs:
        hitrates_mf[top].append(hr_dict[top])
    aucs_mf.append(auc)

    # spread: -sum(p * log p) of the softmax-ed scores
    y_pred_prob = softmax(y_pred)
    spreads_mf.append(-np.dot(y_pred_prob, np.log(y_pred_prob)))

    # novelty of the top-ranked songs, grouped by the clique index of the playlist
    sortix = np.argsort(-y_pred)
    u = pl2u[j]
    for top in TOPs:
        nov = np.mean([-np.log2(song2pop[index2song_test[ix]]) for ix in sortix[:top]])
        novelties_mf[top].setdefault(u, []).append(nov)

    # PTop: (#pos ranked above the top-ranked negative) / #pos
    # (builtin `bool` replaces the alias `np.bool`, removed in NumPy 1.24)
    assert y_true.dtype == bool
    negIx = ~y_true
    negMax = y_pred[negIx].max()
    pt = (y_pred[y_true] > negMax).sum() / npos[j]
    ptops_mf.append(pt)

print('\n%d / %d' % (len(rps_mf), Y_test.shape[1]))
In [30]:
# Collect the results of the matrix-factorisation baseline: 'Test' holds the
# averaged scores, 'Test_All' the raw per-playlist lists (novelty: per clique
# index). The averaged scores are displayed as the cell output.
perf_mf = {
    dataset_name: {
        'Test': {
            'R-Precision': np.mean(rps_mf),
            'Hit-Rate': {k: np.mean(hitrates_mf[k]) for k in TOPs},
            'AUC': np.mean(aucs_mf),
            'Spread': np.mean(spreads_mf),
            # mean over cliques of each clique's mean novelty
            'Novelty': {
                k: np.mean([np.mean(novs) for novs in novelties_mf[k].values()])
                for k in TOPs
            },
            'PTop': np.mean(ptops_mf),
        },
        'Test_All': {
            'R-Precision': rps_mf,
            'Hit-Rate': {k: hitrates_mf[k] for k in TOPs},
            'AUC': aucs_mf,
            'Spread': spreads_mf,
            'Novelty': novelties_mf,
            'PTop': ptops_mf,
        },
    },
}
perf_mf[dataset_name]['Test']
Out[30]:
In [31]:
# Save the results of the matrix-factorisation baseline, then read the file
# back and display the averaged scores as a sanity check. Both file handles
# are closed explicitly so the data is flushed to disk before it is re-read.
fperf_mf = os.path.join(data_dir, 'perf-mf.pkl')
print(fperf_mf)
with open(fperf_mf, 'wb') as fd:
    pkl.dump(perf_mf, fd)
with open(fperf_mf, 'rb') as fd:
    perf_mf_reloaded = pkl.load(fd)
perf_mf_reloaded[dataset_name]['Test']
Out[31]: