In [1]:
%matplotlib inline
import os, sys, time, gzip
import pickle as pkl
import numpy as np
import pandas as pd
from scipy.sparse import lil_matrix, issparse, csc_matrix, csr_matrix
import torch
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
# from tools import calc_RPrecision_HitRate
from tools import calc_metrics, diversity, softmax
In [3]:
TOPs = [5, 10, 20, 30, 50, 100, 200, 300, 500, 700, 1000]
In [4]:
datasets = ['aotm2011', '30music']
In [5]:
dix = 1
dataset_name = datasets[dix]
dataset_name
Out[5]:
In [6]:
data_dir = 'data/%s/coldstart/setting4' % dataset_name
X = pkl.load(gzip.open(os.path.join(data_dir, 'X.pkl.gz'), 'rb'))
Y_train = pkl.load(gzip.open(os.path.join(data_dir, 'Y_train.pkl.gz'), 'rb'))
Y_test = pkl.load(gzip.open(os.path.join(data_dir, 'Y_test.pkl.gz'), 'rb'))
song2pop_train = pkl.load(gzip.open(os.path.join(data_dir, 'song2pop_train.pkl.gz'), 'rb'))
In [7]:
playlists3 = pkl.load(gzip.open(os.path.join(data_dir, 'playlists_train_test_s4.pkl.gz'), 'rb'))
train_playlists = playlists3['train_playlists']
test_playlists = playlists3['test_playlists']
In [8]:
all_songs = pkl.load(gzip.open(os.path.join(data_dir, 'all_songs.pkl.gz'), 'rb'))
index2song = {ix: sid for ix, (sid, _) in enumerate(all_songs)}
In [9]:
song2index = {sid: ix for ix, (sid, _) in enumerate(all_songs)}
In [10]:
_song2artist = pkl.load(gzip.open('data/msd/song2artist.pkl.gz', 'rb'))
song2artist = {sid: _song2artist[sid] for sid, _ in all_songs if sid in _song2artist}
In [11]:
artist2songs = dict()
for sid in sorted(song2artist):
artist = song2artist[sid]
try:
artist2songs[artist].append(sid)
except KeyError:
artist2songs[artist] = [sid]
In [12]:
print('{:,} | {:,}'.format(len(song2artist), len(artist2songs)))
In [13]:
artist2pop = dict()
for pl, _ in train_playlists:
for sid in pl:
if sid in song2artist:
aid = song2artist[sid]
try:
artist2pop[aid] += 1
except KeyError:
artist2pop[aid] = 1
In [14]:
print(len(artist2pop))
In [15]:
song2genre = pkl.load(gzip.open('data/msd/song2genre.pkl.gz', 'rb'))
In [16]:
song2pop = pkl.load(gzip.open(os.path.join(data_dir, 'song2pop.pkl.gz'), 'rb'))
Compute the similarity of two artists $a_1$ and $a_2$ given a set of playlists $P$:
$$
\text{sim}(a_1, a_2)
= \frac{\sum_{p \in P} \delta(a_1, p) \times \delta(a_2, p)}
{\sqrt{\sum_{p \in P} \delta(a_1, p) \times \sum_{p \in P} \delta(a_2, p)}}
$$
where
$$
\delta(a, p)
= \begin{cases}
1, & \text{if at least one song in playlist $p$ is by artist $a$}, \\
0, & \text{otherwise}.
\end{cases}
$$
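A minimal pairwise sketch of this similarity (assuming `playlists` is a list of song-ID lists and reusing `song2artist` from above; the helper and its arguments are illustrative only, since the cells below compute all pairs at once via the sparse indicator matrix `Delta`):

import numpy as np

def artist_sim(a1, a2, playlists, song2artist):
    # delta(a, p) = 1 iff at least one song in playlist p is by artist a
    d1 = np.array([any(song2artist.get(s) == a1 for s in p) for p in playlists], dtype=float)
    d2 = np.array([any(song2artist.get(s) == a2 for s in p) for p in playlists], dtype=float)
    denom = np.sqrt(d1.sum() * d2.sum())
    return float(d1 @ d2) / denom if denom > 0 else 0.0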
Recommend songs according to their popularity, weighted by the similarity between each song's artist and the top 10 most popular artists.
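Concretely, the scoring cell below computes
$$
\text{score}(s) = \log\big(\text{pop}(s)\big) \times \sum_{a \in A_{10}} \text{sim}\big(\text{artist}(s), a\big),
$$
where $A_{10}$ is the set of the 10 most popular artists in the training playlists and $\text{pop}(s)$ is the training play-count of song $s$.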
In [17]:
all_artist = sorted(set([song2artist[sid] for pl, _ in train_playlists for sid in pl if sid in song2artist]))
In [18]:
artist2index = {aid: ix for ix, aid in enumerate(all_artist)}
In [19]:
Na = len(all_artist)
Np = len(train_playlists)
Delta = lil_matrix((Na, Np), dtype=np.float64)  # np.float is a removed NumPy alias; use np.float64
for j in range(Np):
pl_artist = sorted(set([song2artist[sid] for sid in train_playlists[j][0] if sid in song2artist]))
ix = [artist2index[aid] for aid in pl_artist]
Delta[ix, j] = 1
In [20]:
Delta = Delta.tocsr()
Dsum = Delta.sum(axis=1).A.reshape(-1)
ColloMat = Delta.dot(Delta.T).A
assert np.all(np.isclose(ColloMat.diagonal(), Dsum))
In [21]:
print(len(Dsum), len(all_artist))
In [22]:
#type(ColloMat)
In [23]:
T1 = 1. / np.sqrt(Dsum)  # per-artist normaliser: 1 / sqrt(sum_p delta(a, p))
NormMat = np.dot(T1.reshape(Na, 1), T1.reshape(1, Na))  # outer product of normalisers
WeightMat = np.multiply(ColloMat, NormMat)  # WeightMat[i, j] = sim(a_i, a_j)
In [24]:
rps_cagh = []
hitrates_cagh = {top: [] for top in TOPs}
aucs_cagh = []
novelties_cagh = {top: dict() for top in TOPs}
ptops_cagh = []
# artist_diversities_cagh = {top: [] for top in TOPs}
# genre_diversities_cagh = {top: [] for top in TOPs}
np.random.seed(0)
assert Y_test.shape[1] == len(test_playlists)
sid_legal = [sid for sid, _ in all_songs if sid in song2artist]
aix_legal = [artist2index[song2artist[sid]] for sid in sid_legal]
pop_legal = np.asarray([song2pop_train[sid] for sid in sid_legal])
ix_legal = [song2index[sid] for sid in sid_legal]
top10_artists = sorted(artist2pop, key=lambda aid: artist2pop[aid])[-10:]
top10_artists_ix = [artist2index[aid] for aid in top10_artists]
y_pred = np.zeros(Y_test.shape[0])
y_pred[ix_legal] = np.log(pop_legal) * np.asarray([WeightMat[aix, top10_artists_ix].sum() for aix in aix_legal])
y_pred_prob = softmax(y_pred)
spread_cagh = -np.dot(y_pred_prob, np.log(y_pred_prob))
sortix = np.argsort(-y_pred)
for j in range(Y_test.shape[1]):
if (j + 1) % 10 == 0:
sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
sys.stdout.flush()
y_true = Y_test[:, j].A.reshape(-1)
# rp, hr_dict = calc_RPrecision_HitRate(y_true, y_pred, tops=TOPs)
rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
rps_cagh.append(rp)
for top in TOPs:
hitrates_cagh[top].append(hr_dict[top])
aucs_cagh.append(auc)
# novelty
u = test_playlists[j][1]
for top in TOPs:
nov = np.mean([-np.log2(song2pop[index2song[ix]]) for ix in sortix[:top]])
try:
novelties_cagh[top][u].append(nov)
except KeyError:
novelties_cagh[top][u] = [nov]
# PTop: (#pos ranked above the top-ranked negative) / #pos
assert y_true.dtype == bool
npos = y_true.sum()
assert npos > 0
negIx = ~y_true  # boolean mask of negatives; (1 - bool_array) and np.bool no longer work in NumPy
negMax = y_pred[negIx].max()
pt = (y_pred[y_true] > negMax).sum() / npos
ptops_cagh.append(pt)
# artist/genre diversity
# for top in TOPs:
# artist_vec = np.array([song2artist[index2song[ix]] if index2song[ix] in song2artist
# else str(np.random.rand()) for ix in sortix[:top]])
# genre_vec = np.array([song2genre[index2song[ix]] if index2song[ix] in song2genre \
# else str(np.random.rand()) for ix in sortix[:top]])
# artist_diversities_cagh[top].append( diversity(artist_vec) )
# genre_diversities_cagh[top].append( diversity(genre_vec) )
print('\n%d / %d' % (len(rps_cagh), Y_test.shape[1]))
In [25]:
# fig = plt.figure(figsize=[20, 5])
# ax1 = plt.subplot(131)
# ax1.hist(rps_cagh, bins=100)
# ax1.set_yscale('log')
# ax1.set_title('R-Precision')
# #ax.set_xlim(0, xmax)
# ax2 = plt.subplot(132)
# ax2.hist(aucs_cagh, bins=100)
# ax2.set_yscale('log')
# ax2.set_title('AUC')
# pass
In [26]:
cagh = {dataset_name: {'Test': {'R-Precision': np.mean(rps_cagh),
'Hit-Rate': {top: np.mean(hitrates_cagh[top]) for top in TOPs},
'AUC': np.mean(aucs_cagh),
'Spread': spread_cagh,
'Novelty': {t: np.mean([np.mean(novelties_cagh[t][u])
for u in novelties_cagh[t]]) for t in TOPs},
'PTop': np.mean(ptops_cagh),
# 'Artist-Diversity': {t: np.mean(artist_diversities_cagh[t]) for t in TOPs},
# 'Genre-Diversity': {t: np.mean(genre_diversities_cagh[t]) for t in TOPs}},
},
'Test_All': {'R-Precision': rps_cagh,
'Hit-Rate': {top: hitrates_cagh[top] for top in TOPs},
'AUC': aucs_cagh,
'Spread': spread_cagh,
'Novelty': novelties_cagh,
'PTop': ptops_cagh,
# 'Artist-Diversity': artist_diversities_cagh,
# 'Genre-Diversity': genre_diversities_cagh}}}
}}}
cagh[dataset_name]['Test']
Out[26]:
In [27]:
fperf_cagh = os.path.join(data_dir, 'perf-cagh.pkl')
print(fperf_cagh)
pkl.dump(cagh, open(fperf_cagh, 'wb'))
pkl.load(open(fperf_cagh, 'rb'))[dataset_name]['Test']
Out[27]:
Recommend according to the popularity of songs by the top 10 most popular artists in the training data.
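In concrete terms, the scoring below assigns $\text{score}(s) = \log\big(\text{pop}(s)\big)$ if $\text{artist}(s)$ is one of the top 10 most popular artists, and $0$ otherwise.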
In [17]:
rps_sagh = []
hitrates_sagh = {top: [] for top in TOPs}
aucs_sagh = []
novelties_sagh = {top: dict() for top in TOPs}
ptops_sagh = []
# artist_diversities_sagh = {top: [] for top in TOPs}
# genre_diversities_sagh = {top: [] for top in TOPs}
np.random.seed(0)
top10_artists = sorted(artist2pop, key=lambda aid: artist2pop[aid])[-10:]
candidates = []
for aid in top10_artists:
    candidates += artist2songs[aid]
candidates = sorted(set(candidates))
assert len(candidates) > 0
y_pred = np.zeros(Y_test.shape[0])
for sid in candidates:
ix = song2index[sid]
y_pred[ix] = np.log(song2pop_train[sid])
y_pred_prob = softmax(y_pred)
spread_sagh = -np.dot(y_pred_prob, np.log(y_pred_prob))
sortix = np.argsort(-y_pred)
assert Y_test.shape[1] == len(test_playlists)
for j in range(Y_test.shape[1]):
if (j+1) % 100 == 0:
sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
sys.stdout.flush()
y_true = Y_test[:, j].A.reshape(-1)
# rp, hr_dict = calc_RPrecision_HitRate(y_true, y_pred, tops=TOPs)
rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
rps_sagh.append(rp)
for top in TOPs:
hitrates_sagh[top].append(hr_dict[top])
aucs_sagh.append(auc)
# novelty
u = test_playlists[j][1]
for top in TOPs:
nov = np.mean([-np.log2(song2pop[index2song[ix]]) for ix in sortix[:top]])
try:
novelties_sagh[top][u].append(nov)
except KeyError:
novelties_sagh[top][u] = [nov]
# PTop: (#pos ranked above the top-ranked negative) / #pos
assert y_true.dtype == bool
npos = y_true.sum()
assert npos > 0
negIx = ~y_true  # boolean mask of negatives
negMax = y_pred[negIx].max()
pt = (y_pred[y_true] > negMax).sum() / npos
ptops_sagh.append(pt)
# artist/genre diversity
# for top in TOPs:
# artist_vec = np.array([song2artist[index2song[ix]] if index2song[ix] in song2artist
# else str(np.random.rand()) for ix in sortix[:top]])
# genre_vec = np.array([song2genre[index2song[ix]] if index2song[ix] in song2genre \
# else str(np.random.rand()) for ix in sortix[:top]])
# artist_diversities_sagh[top].append( diversity(artist_vec) )
# genre_diversities_sagh[top].append( diversity(genre_vec) )
print('\n%d / %d' % (len(rps_sagh), Y_test.shape[1]))
In [18]:
# fig = plt.figure(figsize=[20, 5])
# ax1 = plt.subplot(131)
# ax1.hist(rps_sagh, bins=100)
# ax1.set_yscale('log')
# ax1.set_title('R-Precision')
# #ax.set_xlim(0, xmax)
# ax2 = plt.subplot(132)
# ax2.hist(aucs_sagh, bins=100)
# ax2.set_yscale('log')
# ax2.set_title('AUC')
# pass
In [19]:
sagh = {dataset_name: {'Test': {'R-Precision': np.mean(rps_sagh),
'Hit-Rate': {top: np.mean(hitrates_sagh[top]) for top in TOPs},
'AUC': np.mean(aucs_sagh),
'Spread': spread_sagh,
'Novelty': {t: np.mean([np.mean(novelties_sagh[t][u])
for u in novelties_sagh[t]]) for t in TOPs},
'PTop': np.mean(ptops_sagh),
# 'Artist-Diversity': {t: np.mean(artist_diversities_sagh[t]) for t in TOPs},
# 'Genre-Diversity': {t: np.mean(genre_diversities_sagh[t]) for t in TOPs}},
},
'Test_All': {'R-Precision': rps_sagh,
'Hit-Rate': {top: hitrates_sagh[top] for top in TOPs},
'AUC': aucs_sagh,
'Spread': spread_sagh,
'Novelty': novelties_sagh,
'PTop': ptops_sagh,
# 'Artist-Diversity': artist_diversities_sagh,
# 'Genre-Diversity': genre_diversities_sagh}}}
}}}
sagh[dataset_name]['Test']
Out[19]:
In [20]:
fperf_sagh = os.path.join(data_dir, 'perf-sagh.pkl')
print(fperf_sagh)
pkl.dump(sagh, open(fperf_sagh, 'wb'))
pkl.load(open(fperf_sagh, 'rb'))[dataset_name]['Test']
Out[20]:
In [17]:
rps_pop = []
hitrates_pop = {top: [] for top in TOPs}
aucs_pop = []
novelties_pop = {top: dict() for top in TOPs}
ptops_pop = []
# artist_diversities_pop = {top: [] for top in TOPs}
# genre_diversities_pop = {top: [] for top in TOPs}
np.random.seed(0)
y_pred = np.array([song2pop_train[index2song[ix]] for ix in range(len(all_songs))])
y_pred_prob = softmax(np.log(y_pred))
spread_pop = -np.dot(y_pred_prob, np.log(y_pred_prob))
sortix = np.argsort(-y_pred)
assert Y_test.shape[1] == len(test_playlists)
for j in range(Y_test.shape[1]):
if (j+1) % 100 == 0:
sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
sys.stdout.flush()
y_true = Y_test[:, j].A.reshape(-1)
# rp, hr_dict = calc_RPrecision_HitRate(y_true, y_pred, tops=TOPs)
rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
rps_pop.append(rp)
for top in TOPs:
hitrates_pop[top].append(hr_dict[top])
aucs_pop.append(auc)
# novelty
u = test_playlists[j][1]
for top in TOPs:
nov = np.mean([-np.log2(song2pop[index2song[ix]]) for ix in sortix[:top]])
try:
novelties_pop[top][u].append(nov)
except KeyError:
novelties_pop[top][u] = [nov]
# PTop: (#pos ranked above the top-ranked negative) / #pos
assert y_true.dtype == bool
npos = y_true.sum()
assert npos > 0
negIx = ~y_true  # boolean mask of negatives
negMax = y_pred[negIx].max()
pt = (y_pred[y_true] > negMax).sum() / npos
ptops_pop.append(pt)
# artist/genre diversity
# for top in TOPs:
# artist_vec = np.array([song2artist[index2song[ix]] if index2song[ix] in song2artist
# else str(np.random.rand()) for ix in sortix[:top]])
# genre_vec = np.array([song2genre[index2song[ix]] if index2song[ix] in song2genre \
# else str(np.random.rand()) for ix in sortix[:top]])
# artist_diversities_pop[top].append( diversity(artist_vec) )
# genre_diversities_pop[top].append( diversity(genre_vec) )
print('\n%d / %d' % (len(rps_pop), Y_test.shape[1]))
In [18]:
# fig = plt.figure(figsize=[20, 5])
# ax1 = plt.subplot(131)
# ax1.hist(rps_pop, bins=100)
# ax1.set_yscale('log')
# ax1.set_title('R-Precision')
# #ax.set_xlim(0, xmax)
# ax2 = plt.subplot(132)
# ax2.hist(aucs_pop, bins=100)
# ax2.set_yscale('log')
# ax2.set_title('AUC')
# pass
In [19]:
pop_perf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_pop),
'Hit-Rate': {top: np.mean(hitrates_pop[top]) for top in TOPs},
'AUC': np.mean(aucs_pop),
'Spread': spread_pop,
'Novelty': {t: np.mean([np.mean(novelties_pop[t][u]) for u in novelties_pop[t]])
for t in TOPs},
'PTop': np.mean(ptops_pop),
#'Artist-Diversity': {top: np.mean(artist_diversities_pop[top]) for top in TOPs},
#'Genre-Diversity': {top: np.mean(genre_diversities_pop[top]) for top in TOPs}},
},
'Test_All': {'R-Precision': rps_pop,
'Hit-Rate': {top: hitrates_pop[top] for top in TOPs},
'AUC': aucs_pop,
'Spread': spread_pop,
'Novelty': novelties_pop,
'PTop': ptops_pop,
# 'Artist-Diversity': artist_diversities_pop,
# 'Genre-Diversity': genre_diversities_pop}}}
}}}
pop_perf[dataset_name]['Test']
Out[19]:
In [20]:
fperf_pop = os.path.join(data_dir, 'perf-pop.pkl')
print(fperf_pop)
pkl.dump(pop_perf, open(fperf_pop, 'wb'))
pkl.load(open(fperf_pop, 'rb'))[dataset_name]['Test']
Out[20]:
Let $S \in \mathbb{R}^{M \times D}$ and $V \in \mathbb{R}^{U \times D}$ be the latent factors of songs and users (in the training set), respectively. Let $R \in \mathbb{R}^{M \times U}$ be the play-counts of songs for users (in the training set), and let $T = \mathbf{1}(R > 0) \in \{0,1\}^{M \times U}$ be the corresponding binary matrix.
The optimisation objective:
$$
J = \sum_{m=1}^M \sum_{u=1}^U q_{m,u} \left( t_{m,u} - \mathbf{s}_m^\top \mathbf{v}_u \right)^2
+ C \left( \sum_{m=1}^M \mathbf{s}_m^\top \mathbf{s}_m + \sum_{u=1}^U \mathbf{v}_u^\top \mathbf{v}_u \right)
$$
where $\alpha, \epsilon$ are hyper-parameters, and $q_{m,u} = 1 + \alpha \log(1 + \epsilon^{-1} r_{m,u})$.
Use the alternating least squares (ALS) optimisation method:
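Setting $\partial J / \partial \mathbf{v}_u = 0$ with $S$ fixed gives a closed-form update (and symmetrically for $\mathbf{s}_m$ with $V$ fixed):
$$
\mathbf{v}_u = \left( S^\top \mathrm{diag}(\mathbf{q}_u) \, S + C I \right)^{-1} S^\top \left( \mathbf{q}_u \circ \mathbf{t}_u \right),
$$
where $\mathbf{q}_u$ and $\mathbf{t}_u$ denote the $u$-th columns of $Q$ and $T$, and $\circ$ is the element-wise product; the `QSu` and `QTuS` terms in the loop below are exactly these two factors.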
In [17]:
assert dataset_name == '30music'
In [18]:
train_users = sorted({u for _, u in train_playlists})
train_user2index = {u: uix for uix, u in enumerate(train_users)}
print(len(train_users))
In [19]:
M = X.shape[0]
U = len(train_users)
R = np.zeros((M, U))
for pix, (_, u) in tqdm(enumerate(train_playlists)):
R[:, train_user2index[u]] += Y_train[:, pix].A.reshape(-1)
R.shape
Out[19]:
In [20]:
T = (R > 0).astype(np.float32)
T.shape
Out[20]:
In [21]:
assert torch.cuda.is_available()
device = torch.device('cuda:0')
In [22]:
D = 100
C = 1e-5
alpha = 40
eps = 1e-8
torch.manual_seed(0)
S = torch.rand(M, D).to(device)
V = torch.rand(U, D).to(device)
Q = 1 + alpha * (np.log(1 / eps) + np.log(eps + R))  # q = 1 + alpha * log(1 + R / eps); M by U, dense
QT = np.multiply(Q, T).astype(np.float32) # M by U
QT_csc = csc_matrix(QT)
QT_csr = csr_matrix(QT)
Q = torch.from_numpy(Q.astype(np.float32)).to(device)
del QT
In [23]:
n_sweeps = 20
# alternating least squares
for sweep in range(n_sweeps):
t0 = time.time()
vdiff2 = 0.
sdiff2 = 0.
# fix S, optimise V
S_cpu = S.cpu().numpy()
for uix in range(U):
if (uix + 1) % 100 == 0:
sys.stdout.write('\r%d / %d' % (uix+1, U))
sys.stdout.flush()
QSu = torch.mm((Q[:, uix].reshape(M, 1) * S).t(), S) # D by D
QSu[range(D), range(D)] = C + QSu.diag()
QTuS = torch.from_numpy(QT_csc[:, uix].transpose().dot(S_cpu).reshape(D, 1)).to(device)
v_u = torch.mm(torch.inverse(QSu), QTuS).reshape(-1)
diff = v_u - V[uix, :]
vdiff2 += torch.mm(diff.reshape(1, -1), diff.reshape(-1, 1)).cpu().item()
V[uix, :] = v_u
print()
# fix V, optimise S
V_cpu = V.cpu().numpy()
for m in range(M):
if (m + 1) % 100 == 0:
sys.stdout.write('\r%d / %d' % (m+1, M))
sys.stdout.flush()
QVm = torch.mm((Q[m, :].reshape(U, 1) * V).t(), V) # D by D
QVm[range(D), range(D)] = C + QVm.diag()
QTmV = torch.from_numpy(QT_csr[m, :].dot(V_cpu).reshape(D, 1)).to(device)
s_m = torch.mm(torch.inverse(QVm), QTmV).reshape(-1)
diff = s_m - S[m, :]
sdiff2 += torch.mm(diff.reshape(1, -1), diff.reshape(-1, 1)).cpu().item()
S[m, :] = s_m
print('\nV diff: {:8.6f}, S diff: {:8.6f} in {:.1f} seconds.'
.format(np.sqrt(vdiff2), np.sqrt(sdiff2), time.time() - t0))
Sanity check: compute the optimisation objective.
In [24]:
cost = np.multiply(Q.cpu().numpy(), np.square(T - torch.mm(S, V.t()).cpu().numpy())).sum() \
+ C * (torch.mul(S, S).sum().cpu().item() + torch.mul(V, V).sum().cpu().item())
del T
print('Cost: %g' % cost)
In [25]:
all_users = pd.read_csv('data/30music/users.csv', index_col='ID', sep=';')
all_users.head()
Out[25]:
In [26]:
train_users_df = all_users.loc[train_users]
train_users_df.shape
Out[26]:
Countries
In [27]:
country_set = set(train_users_df['Country'])
# country_set
In [28]:
country_set.remove(np.nan)
In [29]:
countries = sorted(country_set)
country2index = {c: ix for ix, c in enumerate(countries)}
Gender
In [30]:
gender_set = set(train_users_df['Gender'])
# gender_set
In [31]:
gender_set.remove(np.nan)
In [32]:
genders = sorted(gender_set)
gender2index = {g: ix for ix, g in enumerate(genders)}
Subscriber type
In [33]:
subscribe_set = set(train_users_df['Subscribertype'])
# subscribe_set
In [34]:
subscribes = sorted(subscribe_set)
subscribe2index = {s: ix for ix, s in enumerate(subscribes)}
Features: Age, Country, Gender and Subscribertype, where we standardise Age and one-hot encode Country, Gender and Subscribertype.
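For the one-hot part, an equivalent construction with pandas would be (a sketch for the training side only; the test features must reuse the training categories, as the explicit loops below do, and `pd.get_dummies` likewise maps a missing Country/Gender to an all-zero row):

onehot = pd.get_dummies(train_users_df[['Country', 'Gender', 'Subscribertype']])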
In [35]:
# `#Playlists`
#num_playlists = np.log10(train_users_df['#Playlists'])
#num_playlists_mean = np.mean(num_playlists)
#num_playlists_std = np.std(num_playlists)
#num_playlists = (num_playlists - num_playlists_mean) / num_playlists_std
# Age
ages = np.array([0 if np.isnan(x) or x < 1 else x for x in train_users_df['Age']])
ages_mean = np.mean(ages)
ages_std = np.std(ages)
ages = (ages - ages_mean) / ages_std
# Country
country_mat = np.zeros((len(train_users), len(countries)))
for uix, u in enumerate(train_users):
c = train_users_df.loc[u, 'Country']
if c in country2index:
cix = country2index[c]
country_mat[uix, cix] = 1
# Gender
gender_mat = np.zeros((len(train_users), len(genders)))
for uix, u in enumerate(train_users):
g = train_users_df.loc[u, 'Gender']
if g in gender2index:
gix = gender2index[g]
gender_mat[uix, gix] = 1
# Playcount
#playcounts = np.array([0 if np.isnan(x) or x < 1 else np.log10(x) for x in train_users_df['Playcount']])
#playcounts_mean = np.mean(playcounts)
#playcounts_std = np.std(playcounts)
#playcounts = (playcounts - playcounts_mean) / playcounts_std
# Subscribertype
subscribe_mat = np.zeros((len(train_users), len(subscribes)))
for uix, u in enumerate(train_users):
s = train_users_df.loc[u, 'Subscribertype']
six = subscribe2index[s]
subscribe_mat[uix, six] = 1
In [36]:
user_features = np.hstack([ages.reshape(-1, 1), country_mat, gender_mat, subscribe_mat])
user_features.shape
Out[36]:
In [37]:
test_users = {u for _, u in test_playlists}
test_users = sorted(test_users & set(all_users.index))  # keep only test users that appear in users.csv
test_user2index = {u: uix for uix, u in enumerate(test_users)}
In [38]:
# set(test_users) - set(all_users.index)
In [39]:
test_users_df = all_users.loc[test_users]
test_users_df.shape
Out[39]:
In [40]:
# `#Playlists`
#num_playlists_test = np.log10(test_users_df['#Playlists'])
#num_playlists_test = (num_playlists_test - num_playlists_mean) / num_playlists_std
# Age
ages_test = np.array([0 if np.isnan(x) or x < 1 else x for x in test_users_df['Age']])
ages_test = (ages_test - ages_mean) / ages_std
# Country
country_mat_test = np.zeros((len(test_users), len(countries)))
for uix, u in enumerate(test_users):
c = test_users_df.loc[u, 'Country']
if c in country2index:
cix = country2index[c]
country_mat_test[uix, cix] = 1
# Gender
gender_mat_test = np.zeros((len(test_users), len(genders)))
for uix, u in enumerate(test_users):
g = test_users_df.loc[u, 'Gender']
if g in gender2index:
gix = gender2index[g]
gender_mat_test[uix, gix] = 1
# Playcount
#playcounts_test = np.array([0 if np.isnan(x) or x < 1 else np.log10(x) for x in test_users_df['Playcount']])
#playcounts_test = (playcounts_test - playcounts_mean) / playcounts_std
# Subscribertype
subscribe_mat_test = np.zeros((len(test_users), len(subscribes)))
for uix, u in enumerate(test_users):
s = test_users_df.loc[u, 'Subscribertype']
six = subscribe2index[s]
subscribe_mat_test[uix, six] = 1
In [41]:
user_features_test = np.hstack([ages_test.reshape(-1, 1), country_mat_test, gender_mat_test, subscribe_mat_test])
user_features_test.shape
Out[41]:
Cosine similarity between test user features and train user features
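The cells below compute this from scratch; an equivalent one-liner (a sketch, assuming scikit-learn is available) is:

from sklearn.metrics.pairwise import cosine_similarity
cosine_similarities = cosine_similarity(user_features_test, user_features)  # shape: (n_test_users, n_train_users)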
In [42]:
dot_prod = np.dot(user_features_test, user_features.T)
dot_prod.shape
Out[42]:
In [43]:
norm_train = np.sqrt(np.multiply(user_features, user_features).sum(axis=1)).reshape(-1, 1)  # L2 norms of train users
norm_test = np.sqrt(np.multiply(user_features_test, user_features_test).sum(axis=1)).reshape(-1, 1)  # L2 norms of test users
norm_mat = np.dot(norm_test, norm_train.T)  # outer product of L2 norms
norm_mat.shape
Out[43]:
In [44]:
print(np.sum(norm_mat.ravel() > 0), '=?', np.prod(norm_mat.shape))
In [45]:
cosine_similarities = np.divide(dot_prod, norm_mat)
In [47]:
pkl.dump([train_user2index, test_user2index, cosine_similarities], gzip.open('%s/user_sim.pkl.gz' % data_dir, 'wb'))
Use the average latent factor of the $k$ nearest neighbours of a test user ($k = 200$ below).
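The neighbour lookup relies on `np.argpartition`, which finds the indices of the $k$ largest entries without a full sort; a tiny illustration:

import numpy as np
x = np.array([0.1, 0.9, 0.4, 0.7])
top2 = np.argpartition(x, -2)[-2:]  # indices of the 2 largest values, in no particular order: {1, 3}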
In [67]:
k = 200
rps_mf = []
hitrates_mf = {top: [] for top in TOPs}
aucs_mf = []
spreads_mf = []
novelties_mf = {top: dict() for top in TOPs}
ptops_mf = []
# artist_diversities_mf = {top: [] for top in TOPs}
# genre_diversities_mf = {top: [] for top in TOPs}
np.random.seed(0)
npos = Y_test.sum(axis=0).A.reshape(-1)
for j in range(Y_test.shape[1]):
if (j+1) % 100 == 0:
sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
sys.stdout.flush()
assert npos[j] > 0
y_true = Y_test[:, j].A.reshape(-1)
u = test_playlists[j][1]
if u not in test_users:
continue
# average latent factor of the k nearest neighbours
uix = test_user2index[u]
factor_ix = np.argpartition(cosine_similarities[uix, :].reshape(-1), -k)[-k:]
y_pred = torch.mm(S, V[factor_ix, :].mean(dim=0).reshape(D, 1)).reshape(-1).cpu().numpy()
rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
rps_mf.append(rp)
for top in TOPs:
hitrates_mf[top].append(hr_dict[top])
# auc = roc_auc_score(y_true, y_pred)
aucs_mf.append(auc)
# spread
y_pred_prob = softmax(y_pred)
spreads_mf.append(-np.dot(y_pred_prob, np.log(y_pred_prob)))
# novelty
sortix = np.argsort(-y_pred)
for top in TOPs:
nov = np.mean([-np.log2(song2pop[index2song[ix]]) for ix in sortix[:top]])
try:
novelties_mf[top][u].append(nov)
except KeyError:
novelties_mf[top][u] = [nov]
# PTop: (#pos ranked above the top-ranked negative) / #pos
assert y_true.dtype == bool
negIx = ~y_true  # boolean mask of negatives
negMax = y_pred[negIx].max()
pt = (y_pred[y_true] > negMax).sum() / npos[j]
ptops_mf.append(pt)
# artist/genre diversity
# for top in TOPs:
# artist_vec = np.array([song2artist[index2song[ix]] if index2song[ix] in song2artist
# else str(np.random.rand()) for ix in sortix[:top]])
# genre_vec = np.array([song2genre[index2song[ix]] if index2song[ix] in song2genre \
# else str(np.random.rand()) for ix in sortix[:top]])
# artist_diversities_mf[top].append( diversity(artist_vec) )
# genre_diversities_mf[top].append( diversity(genre_vec) )
print('\n%d / %d' % (len(rps_mf), Y_test.shape[1]))
In [68]:
# print('AUC:', np.mean(aucs_mf))
# print({top: np.mean(hitrates_mf[top]) for top in TOPs})
In [69]:
perf_mf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_mf),
'Hit-Rate': {top: np.mean(hitrates_mf[top]) for top in TOPs},
'AUC': np.mean(aucs_mf),
'Spread': np.mean(spreads_mf),
'Novelty': {t: np.mean([np.mean(novelties_mf[t][u]) for u in novelties_mf[t]])
for t in TOPs},
'PTop': np.mean(ptops_mf),
# 'Artist-Diversity': {top: np.mean(artist_diversities_mf[top]) for top in TOPs},
# 'Genre-Diversity': {top: np.mean(genre_diversities_mf[top]) for top in TOPs}},
},
'Test_All': {'R-Precision': rps_mf,
'Hit-Rate': {top: hitrates_mf[top] for top in TOPs},
'AUC': aucs_mf,
'Spread': spreads_mf,
'Novelty': novelties_mf,
'PTop': ptops_mf,
# 'Artist-Diversity': artist_diversities_mf,
# 'Genre-Diversity': genre_diversities_mf}}}
}}}
perf_mf[dataset_name]['Test']
Out[69]:
In [9]:
fperf_mf = os.path.join(data_dir, 'perf-mf.pkl')
print(fperf_mf)
pkl.dump(perf_mf, open(fperf_mf, 'wb'))
pkl.load(open(fperf_mf, 'rb'))[dataset_name]['Test']
Out[9]: