In [1]:
%matplotlib inline
import os, sys, time, gzip
import pickle as pkl
import numpy as np
from scipy.sparse import lil_matrix, issparse, csc_matrix, csr_matrix
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
In [2]:
# from tools import calc_RPrecision_HitRate
from tools import calc_metrics, diversity, softmax
# from tools import calc_Precision_Recall
In [3]:
TOPs = [5, 10, 20, 30, 50, 100, 200, 300, 500, 700, 1000]
In [4]:
datasets = ['aotm2011', '30music']
In [5]:
dix = 1
dataset_name = datasets[dix]
dataset_name
Out[5]:
In [6]:
data_dir = 'data/%s/coldstart/setting3' % dataset_name
X = pkl.load(gzip.open(os.path.join(data_dir, 'X.pkl.gz'), 'rb'))
Y_train = pkl.load(gzip.open(os.path.join(data_dir, 'Y_train.pkl.gz'), 'rb'))
Y_test = pkl.load(gzip.open(os.path.join(data_dir, 'Y_test.pkl.gz'), 'rb'))
song2pop_train = pkl.load(gzip.open(os.path.join(data_dir, 'song2pop_train.pkl.gz'), 'rb'))
In [7]:
playlists3 = pkl.load(gzip.open(os.path.join(data_dir, 'playlists_train_test_s3.pkl.gz'), 'rb'))
train_playlists = playlists3['train_playlists']
test_playlists = playlists3['test_playlists']
user2songs = dict()
for pl, u in train_playlists:
try:
user2songs[u].update(set(pl))
except KeyError:
user2songs[u] = set(pl)
In [8]:
all_songs = pkl.load(gzip.open(os.path.join(data_dir, 'all_songs.pkl.gz'), 'rb'))
index2song = {ix: sid for ix, (sid, _) in enumerate(all_songs)}
In [9]:
song2index = {sid: ix for ix, (sid, _) in enumerate(all_songs)}
In [10]:
_song2artist = pkl.load(gzip.open('data/msd/song2artist.pkl.gz', 'rb'))
song2artist = {sid: _song2artist[sid] for sid, _ in all_songs if sid in _song2artist}
In [11]:
artist2songs = dict()
for sid in sorted(song2artist):
artist = song2artist[sid]
try:
artist2songs[artist].append(sid)
except KeyError:
artist2songs[artist] = [sid]
In [12]:
print('{:,} | {:,}'.format(len(song2artist), len(artist2songs)))
In [13]:
song2genre = pkl.load(gzip.open('data/msd/song2genre.pkl.gz', 'rb'))
In [14]:
song2pop = pkl.load(gzip.open(os.path.join(data_dir, 'song2pop.pkl.gz'), 'rb'))
Compute the similarity of two artist $a_1$ and $a_2$ given a set of playlist $P$:
$$
\text{sim}(a_1, a_2)
= \frac{\sum_{p \in P} \delta(a_1, p) \times \delta(a_2, p)}
{\sqrt{\sum_{p \in P} \delta(a_1, p) \times \sum_{p \in P} \delta(a_2, p)}}
$$
where
$$
\delta(a, p)
= \begin{cases}
1, \ \text{at least one song in playlist $p$ is from artist $a$}, \\
0, \ \text{otherwise}.
\end{cases}
$$
Recommend according to the popularity of songs, but weighted by similarity of (artist in user's listening history
, artist of song
).
In [15]:
all_artist = sorted(set([song2artist[sid] for pl, _ in train_playlists for sid in pl if sid in song2artist]))
In [16]:
artist2index = {aid: ix for ix, aid in enumerate(all_artist)}
In [17]:
Na = len(all_artist)
Np = len(train_playlists)
Delta = lil_matrix((Na, Np), dtype=np.float)
for j in range(Np):
pl_artist = sorted(set([song2artist[sid] for sid in train_playlists[j][0] if sid in song2artist]))
ix = [artist2index[aid] for aid in pl_artist]
Delta[ix, j] = 1
In [18]:
Delta = Delta.tocsr()
Dsum = Delta.sum(axis=1).A.reshape(-1)
ColloMat = Delta.dot(Delta.T).A
assert np.all(np.isclose(ColloMat.diagonal(), Dsum))
In [19]:
print(len(Dsum), len(all_artist))
In [20]:
#type(ColloMat)
In [21]:
T1 = 1. / np.sqrt(Dsum)
NormMat = np.dot(T1.reshape(Na, 1), T1.reshape(1, Na))
WeightMat = np.multiply(ColloMat, NormMat)
In [22]:
rps_cagh = []
hitrates_cagh = {top: [] for top in TOPs}
aucs_cagh = []
spreads_cagh = []
novelties_cagh = {top: dict() for top in TOPs}
ptops_cagh = []
# artist_diversities_cagh = {top: [] for top in TOPs}
# genre_diversities_cagh = {top: [] for top in TOPs}
np.random.seed(0)
assert Y_test.shape[1] == len(test_playlists)
sid_legal = [sid for sid, _ in all_songs if sid in song2artist]
aix_legal = [artist2index[song2artist[sid]] for sid in sid_legal]
pop_legal = np.asarray([song2pop_train[sid] for sid in sid_legal])
ix_legal = [song2index[sid] for sid in sid_legal]
prev_u = None
prev_y = None
for j in range(Y_test.shape[1]):
sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
sys.stdout.flush()
y_true = Y_test[:, j].A.reshape(-1)
u = test_playlists[j][1]
if prev_u is None or prev_u != u:
artists = sorted(set([song2artist[sid] for sid in user2songs[u] if sid in song2artist]))
artists_ix = [artist2index[aid] for aid in artists]
y_pred = np.zeros(y_true.shape)
y_pred[ix_legal] = np.log(pop_legal) * np.asarray([WeightMat[aix, artists_ix].sum() for aix in aix_legal])
# for ix in ix_legal:
# sid = index2song[ix]
# aix = artist2index[song2artist[sid]]
# pop = song2pop_test[sid]
# y_pred[ix] = pop * WeightMat[aix, artists_ix].sum()
prev_u = u
prev_y = y_pred
else:
y_pred = prev_y
# rp, hr_dict = calc_RPrecision_HitRate(y_true, y_pred, tops=TOPs)
rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
rps_cagh.append(rp)
for top in TOPs:
hitrates_cagh[top].append(hr_dict[top])
aucs_cagh.append(auc)
# spread
y_pred_prob = softmax(y_pred)
spreads_cagh.append(-np.dot(y_pred_prob, np.log(y_pred_prob)))
# novelty
sortix = np.argsort(-y_pred)
for top in TOPs:
nov = np.mean([-np.log2(song2pop[index2song[ix]]) for ix in sortix[:top]])
try:
novelties_cagh[top][u].append(nov)
except KeyError:
novelties_cagh[top][u] = [nov]
# PTop: (#pos ranked above the top-ranked negative) / #pos
assert y_true.dtype == np.bool
npos = y_true.sum()
assert npos > 0
negIx = (1 - y_true).astype(np.bool)
negMax = y_pred[negIx].max()
pt = (y_pred[y_true] > negMax).sum() / npos
ptops_cagh.append(pt)
# artist/genre diversity
# for top in TOPs:
# artist_vec = np.array([song2artist[index2song[ix]] if index2song[ix] in song2artist
# else str(np.random.rand()) for ix in sortix[:top]])
# genre_vec = np.array([song2genre[index2song[ix]] if index2song[ix] in song2genre \
# else str(np.random.rand()) for ix in sortix[:top]])
# artist_diversities_cagh[top].append( diversity(artist_vec) )
# genre_diversities_cagh[top].append( diversity(genre_vec) )
print('\n%d / %d' % (len(rps_cagh), Y_test.shape[1]))
In [23]:
# fig = plt.figure(figsize=[20, 5])
# ax1 = plt.subplot(131)
# ax1.hist(rps_cagh, bins=100)
# ax1.set_yscale('log')
# ax1.set_title('R-Precision')
# #ax.set_xlim(0, xmax)
# ax2 = plt.subplot(132)
# ax2.hist(aucs_cagh, bins=100)
# ax2.set_yscale('log')
# ax2.set_title('AUC')
# pass
In [24]:
cagh_perf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_cagh),
'Hit-Rate': {top: np.mean(hitrates_cagh[top]) for top in TOPs},
'AUC': np.mean(aucs_cagh),
'Spread': np.mean(spreads_cagh),
'Novelty': {t: np.mean([np.mean(novelties_cagh[t][u])
for u in novelties_cagh[t]]) for t in TOPs},
'PTop': np.mean(ptops_cagh),
# 'Artist-Diversity': {t: np.mean(artist_diversities_cagh[t]) for t in TOPs},
# 'Genre-Diversity': {t: np.mean(genre_diversities_cagh[t]) for t in TOPs}},
},
'Test_All': {'R-Precision': rps_cagh,
'Hit-Rate': {top: hitrates_cagh[top] for top in TOPs},
'AUC': aucs_cagh,
'Spread': spreads_cagh,
'Novelty': novelties_cagh,
'PTop': ptops_cagh,
# 'Artist-Diversity': artist_diversities_cagh,
# 'Genre-Diversity': genre_diversities_cagh}}}
}}}
cagh_perf[dataset_name]['Test']
Out[24]:
In [25]:
fperf_cagh = os.path.join(data_dir, 'perf-cagh.pkl')
print(fperf_cagh)
pkl.dump(cagh_perf, open(fperf_cagh, 'wb'))
pkl.load(open(fperf_cagh, 'rb'))[dataset_name]['Test']
Out[25]:
Recommend according to the popularity of songs of artists in listening history.
In [15]:
rps_sagh = []
hitrates_sagh = {top: [] for top in TOPs}
aucs_sagh = []
spreads_sagh = []
novelties_sagh = {top: dict() for top in TOPs}
ptops_sagh = []
# artist_diversities_sagh = {top: [] for top in TOPs}
# genre_diversities_sagh = {top: [] for top in TOPs}
np.random.seed(0)
assert Y_test.shape[1] == len(test_playlists)
for j in range(Y_test.shape[1]):
if (j+1) % 100 == 0:
sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
sys.stdout.flush()
y_true = Y_test[:, j].A.reshape(-1)
y_pred = np.zeros(y_true.shape)
u = test_playlists[j][1]
artists = sorted(set([song2artist[sid] for sid in user2songs[u] if sid in song2artist]))
candidates = []
for a in artists:
candidates += artist2songs[a]
candidates = sorted(set(candidates))
if len(candidates) > 0:
for sid in candidates:
ix = song2index[sid]
y_pred[ix] = np.log(song2pop_train[sid])
# rp, hr_dict = calc_RPrecision_HitRate(y_true, y_pred, tops=TOPs)
rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
rps_sagh.append(rp)
for top in TOPs:
hitrates_sagh[top].append(hr_dict[top])
aucs_sagh.append(auc)
# spread
y_pred_prob = softmax(y_pred)
spreads_sagh.append(-np.dot(y_pred_prob, np.log(y_pred_prob)))
# novelty
sortix = np.argsort(-y_pred)
for top in TOPs:
nov = np.mean([-np.log2(song2pop[index2song[ix]]) for ix in sortix[:top]])
try:
novelties_sagh[top][u].append(nov)
except KeyError:
novelties_sagh[top][u] = [nov]
# PTop: (#pos ranked above the top-ranked negative) / #pos
assert y_true.dtype == np.bool
npos = y_true.sum()
assert npos > 0
negIx = (1 - y_true).astype(np.bool)
negMax = y_pred[negIx].max()
pt = (y_pred[y_true] > negMax).sum() / npos
ptops_sagh.append(pt)
# artist/genre diversity
# for top in TOPs:
# artist_vec = np.array([song2artist[index2song[ix]] if index2song[ix] in song2artist
# else str(np.random.rand()) for ix in sortix[:top]])
# genre_vec = np.array([song2genre[index2song[ix]] if index2song[ix] in song2genre \
# else str(np.random.rand()) for ix in sortix[:top]])
# artist_diversities_sagh[top].append( diversity(artist_vec) )
# genre_diversities_sagh[top].append( diversity(genre_vec) )
print('\n%d / %d' % (len(rps_sagh), Y_test.shape[1]))
In [16]:
# fig = plt.figure(figsize=[20, 5])
# ax1 = plt.subplot(131)
# ax1.hist(rps_sagh, bins=100)
# ax1.set_yscale('log')
# ax1.set_title('R-Precision')
# #ax.set_xlim(0, xmax)
# ax2 = plt.subplot(132)
# ax2.hist(aucs_sagh, bins=100)
# ax2.set_yscale('log')
# ax2.set_title('AUC')
# pass
In [17]:
sagh_perf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_sagh),
'Hit-Rate': {top: np.mean(hitrates_sagh[top]) for top in TOPs},
'AUC': np.mean(aucs_sagh),
'Spread': np.mean(spreads_sagh),
'Novelty': {t: np.mean([np.mean(novelties_sagh[t][u])
for u in novelties_sagh[t]]) for t in TOPs},
'PTop': np.mean(ptops_sagh),
# 'Artist-Diversity': {t: np.mean(artist_diversities_sagh[t]) for t in TOPs},
# 'Genre-Diversity': {t: np.mean(genre_diversities_sagh[t]) for t in TOPs}},
},
'Test_All': {'R-Precision': rps_sagh,
'Hit-Rate': {top: hitrates_sagh[top] for top in TOPs},
'AUC': aucs_sagh,
'Spread': spreads_sagh,
'Novelty': novelties_sagh,
'PTop': ptops_sagh,
# 'Artist-Diversity': artist_diversities_sagh,
# 'Genre-Diversity': genre_diversities_sagh}}}
}}}
sagh_perf[dataset_name]['Test']
Out[17]:
In [18]:
fperf_sagh = os.path.join(data_dir, 'perf-sagh.pkl')
print(fperf_sagh)
pkl.dump(sagh_perf, open(fperf_sagh, 'wb'))
pkl.load(open(fperf_sagh, 'rb'))[dataset_name]['Test']
Out[18]:
In [15]:
rps_pop = []
hitrates_pop = {top: [] for top in TOPs}
aucs_pop = []
novelties_pop = {top: dict() for top in TOPs}
ptops_pop = []
# artist_diversities_pop = {top: [] for top in TOPs}
# genre_diversities_pop = {top: [] for top in TOPs}
np.random.seed(0)
y_pred = np.array([song2pop_train[index2song[ix]] for ix in range(len(all_songs))])
y_pred_prob = softmax(np.log(y_pred))
spread_pop = -np.dot(y_pred_prob, np.log(y_pred_prob))
sortix = np.argsort(-y_pred)
assert Y_test.shape[1] == len(test_playlists)
for j in range(Y_test.shape[1]):
if (j+1) % 10 == 0:
sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
sys.stdout.flush()
y_true = Y_test[:, j].A.reshape(-1)
# rp, hr_dict = calc_RPrecision_HitRate(y_true, y_pred, tops=TOPs)
rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
rps_pop.append(rp)
for top in TOPs:
hitrates_pop[top].append(hr_dict[top])
aucs_pop.append(auc)
# novelty
u = test_playlists[j][1]
for top in TOPs:
nov = np.mean([-np.log2(song2pop[index2song[ix]]) for ix in sortix[:top]])
try:
novelties_pop[top][u].append(nov)
except KeyError:
novelties_pop[top][u] = [nov]
# PTop: (#pos ranked above the top-ranked negative) / #pos
assert y_true.dtype == np.bool
npos = y_true.sum()
assert npos > 0
negIx = (1 - y_true).astype(np.bool)
negMax = y_pred[negIx].max()
pt = (y_pred[y_true] > negMax).sum() / npos
ptops_pop.append(pt)
# artist/genre diversity
# for top in TOPs:
# artist_vec = np.array([song2artist[index2song[ix]] if index2song[ix] in song2artist
# else str(np.random.rand()) for ix in sortix[:top]])
# genre_vec = np.array([song2genre[index2song[ix]] if index2song[ix] in song2genre \
# else str(np.random.rand()) for ix in sortix[:top]])
# artist_diversities_pop[top].append( diversity(artist_vec) )
# genre_diversities_pop[top].append( diversity(genre_vec) )
print('\n%d / %d' % (len(rps_pop), Y_test.shape[1]))
In [16]:
# fig = plt.figure(figsize=[20, 5])
# ax1 = plt.subplot(131)
# ax1.hist(rps_pop, bins=100)
# ax1.set_yscale('log')
# ax1.set_title('R-Precision')
# #ax.set_xlim(0, xmax)
# ax2 = plt.subplot(132)
# ax2.hist(aucs_pop, bins=100)
# ax2.set_yscale('log')
# ax2.set_title('AUC')
# pass
In [17]:
pop_perf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_pop),
'Hit-Rate': {top: np.mean(hitrates_pop[top]) for top in TOPs},
'AUC': np.mean(aucs_pop),
'Spread': spread_pop,
'Novelty': {t: np.mean([np.mean(novelties_pop[t][u]) for u in novelties_pop[t]])
for t in TOPs},
'PTop': np.mean(ptops_pop),
#'Artist-Diversity': {top: np.mean(artist_diversities_pop[top]) for top in TOPs},
#'Genre-Diversity': {top: np.mean(genre_diversities_pop[top]) for top in TOPs}},
},
'Test_All': {'R-Precision': rps_pop,
'Hit-Rate': {top: hitrates_pop[top] for top in TOPs},
'AUC': aucs_pop,
'Spread': spread_pop,
'Novelty': novelties_pop,
'PTop': ptops_pop,
# 'Artist-Diversity': artist_diversities_pop,
# 'Genre-Diversity': genre_diversities_pop}}}
}}}
pop_perf[dataset_name]['Test']
Out[17]:
In [18]:
fperf_pop = os.path.join(data_dir, 'perf-pop.pkl')
print(fperf_pop)
pkl.dump(pop_perf, open(fperf_pop, 'wb'))
pkl.load(open(fperf_pop, 'rb'))[dataset_name]['Test']
Out[18]:
Let $S \in \mathbb{R}^{M \times D}, V \in \mathbb{R}^{U \times D}$ be the latent factors of songs and users, respectively. Let $R \in \mathbb{R}^{M \times U}$ be the play-count of songs for users, and $T = \mathbf{1}(R > 0) \in \{0,1\}^{M \times U}$ be a binary matrix.
The optimisation objective: $ \begin{aligned} J = \sum{m=1}^M \sum{u=1}^U q{m, u} \left( t{m,u} - \mathbf{s}_m^\top \mathbf{v}_u \right)^2
+ C \left( \sum_{m=1}^M \mathbf{s}_m^\top \mathbf{s}_m + \sum_{u=1}^U \mathbf{v}_n^\top \mathbf{v}_n \right)
\end{aligned} $ where $\alpha, \epsilon$ are hyper-parameters, and $q{m, u} = 1 + \alpha \log(1 + \epsilon^{-1} r{m,u})$.
Use alternating least squares optimisation method:
In [15]:
users = sorted({u for _, u in train_playlists})
user2index = {u: uix for uix, u in enumerate(users)}
print(len(users))
In [16]:
M = X.shape[0]
U = len(users)
R = np.zeros((M, U))
for pix, (_, u) in tqdm(enumerate(train_playlists)):
R[:, user2index[u]] += Y_train[:, pix].A.reshape(-1)
R.shape
Out[16]:
In [17]:
T = (R > 0).astype(np.float32)
T.shape
Out[17]:
In [18]:
assert torch.cuda.is_available()
device = torch.device('cuda:0')
In [19]:
D = 100
C = 1e-5
alpha = 40
eps = 1e-8
torch.manual_seed(0)
S = torch.rand(M, D).to(device)
V = torch.rand(U, D).to(device)
Q = 1 + alpha * (np.log(1 / eps) + np.log(eps + R)) # M by U, dense
QT = np.multiply(Q, T).astype(np.float32) # M by U
QT_csc = csc_matrix(QT)
QT_csr = csr_matrix(QT)
Q = torch.from_numpy(Q.astype(np.float32)).to(device)
del QT
In [20]:
n_sweeps = 20
# alternating least squares
for sweep in range(n_sweeps):
t0 = time.time()
vdiff2 = 0.
sdiff2 = 0.
# fix S, optimise V
S_cpu = S.cpu().numpy()
for uix in range(U):
if (uix + 1) % 100 == 0:
sys.stdout.write('\r%d / %d' % (uix+1, U))
sys.stdout.flush()
QSu = torch.mm((Q[:, uix].reshape(M, 1) * S).t(), S) # D by D
QSu[range(D), range(D)] = C + QSu.diag()
QTuS = torch.from_numpy(QT_csc[:, uix].transpose().dot(S_cpu).reshape(D, 1)).to(device)
v_u = torch.mm(torch.inverse(QSu), QTuS).reshape(-1)
diff = v_u - V[uix, :]
vdiff2 += torch.mm(diff.reshape(1, -1), diff.reshape(-1, 1)).cpu().item()
V[uix, :] = v_u
print()
# fix V, optimise S
V_cpu = V.cpu().numpy()
for m in range(M):
if (m + 1) % 100 == 0:
sys.stdout.write('\r%d / %d' % (m+1, M))
sys.stdout.flush()
QVm = torch.mm((Q[m, :].reshape(U, 1) * V).t(), V) # D by D
QVm[range(D), range(D)] = C + QVm.diag()
QTmV = torch.from_numpy(QT_csr[m, :].dot(V_cpu).reshape(D, 1)).to(device)
s_m = torch.mm(torch.inverse(QVm), QTmV).reshape(-1)
diff = s_m - S[m, :]
sdiff2 += torch.mm(diff.reshape(1, -1), diff.reshape(-1, 1)).cpu().item()
S[m, :] = s_m
print('\nV diff: {:8.6f}, S diff: {:8.6f} in {:.1f} seconds.'
.format(np.sqrt(vdiff2), np.sqrt(sdiff2), time.time() - t0))
Sanity check: compute optimisation objective.
In [21]:
cost = np.multiply(Q.cpu().numpy(), np.square(T - torch.mm(S, V.t()).cpu().numpy())).sum() \
+ C * (torch.mul(S, S).sum().cpu().item() + torch.mul(V, V).sum().cpu().item())
print('Cost: %g' % cost)
In [22]:
# del QT
# T = torch.from_numpy(T).to(device)
# cost = torch.mul(Q, (T - torch.mm(S, V.t())) ** 2).sum() + C * (torch.mul(S, S).sum() + torch.mul(V, V).sum())
# print('Cost: %g' % cost.cpu().item())
In [23]:
del T
Performance on test set
In [25]:
rps_mf = []
hitrates_mf = {top: [] for top in TOPs}
aucs_mf = []
spreads_mf = []
novelties_mf = {top: dict() for top in TOPs}
ptops_mf = []
# artist_diversities_mf = {top: [] for top in TOPs}
# genre_diversities_mf = {top: [] for top in TOPs}
np.random.seed(0)
npos = Y_test.sum(axis=0).A.reshape(-1)
for j in range(Y_test.shape[1]):
if (j+1) % 100 == 0:
sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
sys.stdout.flush()
assert npos[j] > 0
y_true = Y_test[:, j].A.reshape(-1)
u = test_playlists[j][1]
uix = user2index[u]
y_pred = torch.mm(S, V[uix, :].reshape(D, 1)).reshape(-1).cpu().numpy()
rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
rps_mf.append(rp)
for top in TOPs:
hitrates_mf[top].append(hr_dict[top])
# auc = roc_auc_score(y_true, y_pred)
aucs_mf.append(auc)
# spread
y_pred_prob = softmax(y_pred)
spreads_mf.append(-np.dot(y_pred_prob, np.log(y_pred_prob)))
# novelty
sortix = np.argsort(-y_pred)
for top in TOPs:
nov = np.mean([-np.log2(song2pop[index2song[ix]]) for ix in sortix[:top]])
try:
novelties_mf[top][u].append(nov)
except KeyError:
novelties_mf[top][u] = [nov]
# PTop: (#pos ranked above the top-ranked negative) / #pos
assert y_true.dtype == np.bool
negIx = (1 - y_true).astype(np.bool)
negMax = y_pred[negIx].max()
pt = (y_pred[y_true] > negMax).sum() / npos[j]
ptops_mf.append(pt)
# artist/genre diversity
# for top in TOPs:
# artist_vec = np.array([song2artist[index2song[ix]] if index2song[ix] in song2artist
# else str(np.random.rand()) for ix in sortix[:top]])
# genre_vec = np.array([song2genre[index2song[ix]] if index2song[ix] in song2genre \
# else str(np.random.rand()) for ix in sortix[:top]])
# artist_diversities_mf[top].append( diversity(artist_vec) )
# genre_diversities_mf[top].append( diversity(genre_vec) )
print('\n%d / %d' % (len(rps_mf), Y_test.shape[1]))
In [26]:
perf_mf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_mf),
'Hit-Rate': {top: np.mean(hitrates_mf[top]) for top in TOPs},
'AUC': np.mean(aucs_mf),
'Spread': np.mean(spreads_mf),
'Novelty': {t: np.mean([np.mean(novelties_mf[t][u]) for u in novelties_mf[t]])
for t in TOPs},
'PTop': np.mean(ptops_mf),
# 'Artist-Diversity': {top: np.mean(artist_diversities_mf[top]) for top in TOPs},
# 'Genre-Diversity': {top: np.mean(genre_diversities_mf[top]) for top in TOPs}},
},
'Test_All': {'R-Precision': rps_mf,
'Hit-Rate': {top: hitrates_mf[top] for top in TOPs},
'AUC': aucs_mf,
'Spread': spreads_mf,
'Novelty': novelties_mf,
'PTop': ptops_mf,
# 'Artist-Diversity': artist_diversities_mf,
# 'Genre-Diversity': genre_diversities_mf}}}
}}}
perf_mf[dataset_name]['Test']
Out[26]:
In [27]:
fperf_mf = os.path.join(data_dir, 'perf-mf.pkl')
print(fperf_mf)
pkl.dump(perf_mf, open(fperf_mf, 'wb'))
pkl.load(open(fperf_mf, 'rb'))[dataset_name]['Test']
Out[27]: