In [ ]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os, sys, time, gzip
import pickle as pkl
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from scipy.sparse import lil_matrix, issparse
import matplotlib.pyplot as plt
import seaborn as sns
In [ ]:
from models import MTC
from sklearn.linear_model import LogisticRegression
from tools import calc_metrics
In [ ]:
TOPs = [5, 10, 20, 30, 50, 100, 200, 300, 500, 1000]
In [ ]:
datasets = ['aotm2011', '30music']
In [ ]:
dix = 0
dataset_name = datasets[dix]
dataset_name
In [ ]:
n_seed = 1
data_dir = 'data/%s/setting2' % dataset_name
X = pkl.load(gzip.open(os.path.join(data_dir, 'X_trndev_%d.pkl.gz' % n_seed), 'rb'))
Y = pkl.load(gzip.open(os.path.join(data_dir, 'Y.pkl.gz'), 'rb'))
PU_test = pkl.load(gzip.open(os.path.join(data_dir, 'PU_test_%d.pkl.gz' % n_seed), 'rb'))
song2pop = pkl.load(gzip.open(os.path.join(data_dir, 'song2pop.pkl.gz'), 'rb'))
playlists2 = pkl.load(gzip.open(os.path.join(data_dir, 'playlists_train_dev_test_s2_%d.pkl.gz' % n_seed), 'rb'))
In [ ]:
Y_test = Y[:, -PU_test.shape[1]:]
print(Y_test.shape)
In [ ]:
all_songs = pkl.load(gzip.open(os.path.join(data_dir, 'all_songs.pkl.gz'), 'rb'))
index2song = {ix: sid for ix, (sid, _) in enumerate(all_songs)}
In [ ]:
song2index = {sid: ix for ix, (sid, _) in enumerate(all_songs)}
In [ ]:
# remove the held-out songs from test-time popularity counts
song2pop_test = song2pop.copy()
for ppl in playlists2['test_playlists_held']:
    for sid in ppl:
        song2pop_test[sid] -= 1
In [ ]:
# standardise song popularity as a single feature
X_train = np.asarray([song2pop[sid] for sid, _ in all_songs], dtype=np.float64).reshape(len(all_songs), 1)
X_train_mean = np.mean(X_train, axis=0)
X_train_std = np.std(X_train, axis=0) + 1e-6
X_train -= X_train_mean
X_train /= X_train_std
pkl.dump(X_train, gzip.open(os.path.join(data_dir, 'X_trndev_pop_%d.pkl.gz' % n_seed), 'wb'))
In [ ]:
song2artist = pkl.load(gzip.open('data/msd/song2artist.pkl.gz', 'rb'))
In [ ]:
# invert the song -> artist mapping to artist -> list of songs
artist2song = dict()
for sid in sorted(song2artist):
    artist = song2artist[sid]
    try:
        artist2song[artist].append(sid)
    except KeyError:
        artist2song[artist] = [sid]
In [ ]:
print('{:,} | {:,}'.format(len(song2artist), len(artist2song)))
Compute the similarity of two artists $a_1$ and $a_2$ given a set of playlists $P$:
$$
\text{sim}(a_1, a_2)
= \frac{\sum_{p \in P} \delta(a_1, p) \times \delta(a_2, p)}
       {\sqrt{\sum_{p \in P} \delta(a_1, p) \times \sum_{p \in P} \delta(a_2, p)}}
$$
where
$$
\delta(a, p)
= \begin{cases}
1, & \text{if at least one song in playlist $p$ is from artist $a$}, \\
0, & \text{otherwise}.
\end{cases}
$$
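As a quick sanity check of this formula, here is a minimal sketch with a hypothetical 2-artist, 3-playlist membership matrix (`Delta_toy` is illustrative, not from the data):
In [ ]:
# artist a1 appears in playlists p1, p2; artist a2 in playlists p1, p3
Delta_toy = np.array([[1., 1., 0.],
                      [1., 0., 1.]])
num = (Delta_toy[0] * Delta_toy[1]).sum()               # playlists shared by a1 and a2
den = np.sqrt(Delta_toy[0].sum() * Delta_toy[1].sum())  # sqrt(2 * 2)
print(num / den)                                        # sim(a1, a2) = 0.5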
In [ ]:
all_playlist = pkl.load(gzip.open('data/%s/%s-playlist.pkl.gz' % (dataset_name, dataset_name), 'rb'))
In [ ]:
all_artist = sorted(set([song2artist[sid] for pl, _ in all_playlist for sid in pl if sid in song2artist]))
In [ ]:
artist2index = {aid: ix for ix, aid in enumerate(all_artist)}
In [ ]:
Na = len(all_artist)
Np = len(all_playlist)
Delta = lil_matrix((Na, Np), dtype=np.float64)
for j in range(Np):
    pl_artist = sorted(set([song2artist[sid] for sid in all_playlist[j][0] if sid in song2artist]))
    ix = [artist2index[aid] for aid in pl_artist]
    Delta[ix, j] = 1
In [ ]:
Delta = Delta.tocsr()
Dsum = Delta.sum(axis=1).A.reshape(-1)
ColloMat = Delta.dot(Delta.T).A
assert np.all(np.isclose(ColloMat.diagonal(), Dsum))
In [ ]:
print(len(Dsum), len(all_artist))
In [ ]:
T1 = 1. / np.sqrt(Dsum)
NormMat = np.dot(T1.reshape(Na, 1), T1.reshape(1, Na))  # outer product: 1 / sqrt(Dsum_i * Dsum_j)
WeightMat = np.multiply(ColloMat, NormMat)              # cosine-style artist similarity
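Since the diagonal of `ColloMat` equals `Dsum`, each diagonal entry of `WeightMat` is $\text{Dsum}_a / \text{Dsum}_a = 1$, i.e. every artist is perfectly similar to itself. A minimal sanity check, assuming the matrices computed above:
In [ ]:
assert np.allclose(WeightMat.diagonal(), 1.0)
Recommend according to the popularity of songs weighted by the aggregated similarity between each song's artist and the artists in the listening history (the CAGH baseline).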
In [ ]:
rps_cagh = []
hitrates_cagh = {top: [] for top in TOPs}
aucs_cagh = []
assert Y_test.shape == PU_test.shape
for j in range(Y_test.shape[1]):
    sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
    sys.stdout.flush()
    y1 = Y_test[:, j].toarray().reshape(-1)
    y2 = PU_test[:, j].toarray().reshape(-1)
    indices = np.where(y2 == 0)[0]  # candidate songs: those not in the observed seed
    y_true = y1[indices]
    seeds = [index2song[ix] for ix in np.where(y2 > 0)[0]]
    artists = sorted(set([song2artist[sid] for sid in seeds if sid in song2artist]))
    artists_ix = [artist2index[aid] for aid in artists]
    y_pred = np.zeros(y1.shape)
    ix_legal = [ix for ix in indices if index2song[ix] in song2artist]
    sid_legal = [index2song[ix] for ix in ix_legal]
    aix_legal = [artist2index[song2artist[sid]] for sid in sid_legal]
    pop_legal = np.asarray([song2pop_test[sid] for sid in sid_legal])
    # popularity weighted by total similarity to the seed artists
    y_pred[ix_legal] = pop_legal * np.asarray([WeightMat[aix, artists_ix].sum() for aix in aix_legal])
    y_pred = y_pred[indices]
    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_cagh.append(rp)
    for top in TOPs:
        hitrates_cagh[top].append(hr_dict[top])
    aucs_cagh.append(auc)
print('\n%d / %d' % (len(rps_cagh), PU_test.shape[1]))
In [ ]:
fig = plt.figure(figsize=[20, 5])
ax1 = plt.subplot(131)
ax1.hist(rps_cagh, bins=100)
ax1.set_yscale('log')
ax1.set_title('R-Precision')
ax2 = plt.subplot(132)
ax2.hist(aucs_cagh, bins=100)
ax2.set_yscale('log')
ax2.set_title('AUC')
pass
In [ ]:
cagh_perf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_cagh),
                                     'Hit-Rate': {top: np.mean(hitrates_cagh[top]) for top in hitrates_cagh},
                                     'AUC': np.mean(aucs_cagh)}}}
cagh_perf
In [ ]:
fperf_cagh = os.path.join(data_dir, 'perf-cagh-%d.pkl' % n_seed)
print(fperf_cagh)
pkl.dump(cagh_perf, open(fperf_cagh, 'wb'))
pkl.load(open(fperf_cagh, 'rb'))
Recommend according to the popularity of songs by artists that appear in the listening history (the SAGH baseline).
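A minimal sketch of this scoring rule on hypothetical toy mappings (`artist2song_toy` and `pop_toy` are illustrative, not from the data):
In [ ]:
artist2song_toy = {'a1': ['s1', 's2'], 'a2': ['s3']}
pop_toy = {'s1': 10, 's2': 3, 's3': 7, 's4': 99}
seed_artists = ['a1']  # artists observed in the listening history
scores = {sid: pop_toy[sid] for a in seed_artists for sid in artist2song_toy[a]}
print(sorted(scores.items(), key=lambda kv: -kv[1]))  # [('s1', 10), ('s2', 3)]; 's4' is never recommended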
In [ ]:
rps_sagh = []
hitrates_sagh = {top: [] for top in TOPs}
aucs_sagh = []
assert Y_test.shape == PU_test.shape
for j in range(Y_test.shape[1]):
    if (j+1) % 10 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()
    y1 = Y_test[:, j].toarray().reshape(-1)
    y2 = PU_test[:, j].toarray().reshape(-1)
    indices = np.where(y2 == 0)[0]
    y_true = y1[indices]
    seeds = [index2song[ix] for ix in np.where(y2 > 0)[0]]
    artists = sorted(set([song2artist[sid] for sid in seeds if sid in song2artist]))
    y_pred = np.zeros(y1.shape)
    candidates = []
    for a in artists:
        candidates += artist2song[a]
    # only score candidate songs by the seed artists; everything else stays at 0
    candidates = set(candidates) & set([index2song[ix] for ix in indices])
    for sid in candidates:
        y_pred[song2index[sid]] = song2pop_test[sid]
    y_pred = y_pred[indices]
    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_sagh.append(rp)
    for top in TOPs:
        hitrates_sagh[top].append(hr_dict[top])
    aucs_sagh.append(auc)
print('\n%d / %d' % (len(rps_sagh), PU_test.shape[1]))
In [ ]:
fig = plt.figure(figsize=[20, 5])
ax1 = plt.subplot(131)
ax1.hist(rps_sagh, bins=100)
ax1.set_yscale('log')
ax1.set_title('R-Precision')
ax2 = plt.subplot(132)
ax2.hist(aucs_sagh, bins=100)
ax2.set_yscale('log')
ax2.set_title('AUC')
pass
In [ ]:
sagh_perf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_sagh),
                                     'Hit-Rate': {top: np.mean(hitrates_sagh[top]) for top in hitrates_sagh},
                                     'AUC': np.mean(aucs_sagh)}}}
sagh_perf
In [ ]:
fperf_sagh = os.path.join(data_dir, 'perf-sagh-%d.pkl' % n_seed)
print(fperf_sagh)
pkl.dump(sagh_perf, open(fperf_sagh, 'wb'))
pkl.load(open(fperf_sagh, 'rb'))
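Train a logistic regression per playlist on standardised song popularity, using the seed songs as positives, and rank the remaining songs by the classifier's decision values (a monotone transform of the predicted probability, so the ranking is identical).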
In [ ]:
rps_lrpop = []
hitrates_lrpop = {top: [] for top in TOPs}
aucs_lrpop = []
X_train = np.asarray([song2pop_test[sid] for sid, _ in all_songs], dtype=np.float64).reshape(len(all_songs), 1)
X_train_mean = np.mean(X_train, axis=0)
X_train_std = np.std(X_train, axis=0) + 1e-6
X_train -= X_train_mean
X_train /= X_train_std
for j in range(Y_test.shape[1]):
    if (j+1) % 10 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()
    y1 = Y_test[:, j].toarray().reshape(-1)
    y2 = PU_test[:, j].toarray().reshape(-1)
    indices = np.where(y2 == 0)[0]
    y_true = y1[indices]
    clf = LogisticRegression()
    clf.fit(X_train, y2)
    X_test = X_train  # all songs share the single standardised popularity feature
    y_pred = clf.decision_function(X_test)[indices]
    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_lrpop.append(rp)
    for top in TOPs:
        hitrates_lrpop[top].append(hr_dict[top])
    aucs_lrpop.append(auc)
print('\n%d / %d' % (len(rps_lrpop), Y_test.shape[1]))
In [ ]:
fig = plt.figure(figsize=[20, 5])
ax1 = plt.subplot(131)
ax1.hist(rps_lrpop, bins=100)
ax1.set_yscale('log')
ax1.set_title('R-Precision')
ax2 = plt.subplot(132)
ax2.hist(aucs_lrpop, bins=100)
ax2.set_yscale('log')
ax2.set_title('AUC')
pass
In [ ]:
lrpop_perf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_lrpop),
                                      'Hit-Rate': {top: np.mean(hitrates_lrpop[top]) for top in hitrates_lrpop},
                                      'AUC': np.mean(aucs_lrpop)}}}
lrpop_perf
In [ ]:
fperf_lrpop = os.path.join(data_dir, 'perf-lrpop-%d.pkl' % n_seed)
print(fperf_lrpop)
pkl.dump(lrpop_perf, open(fperf_lrpop, 'wb'))
pkl.load(open(fperf_lrpop, 'rb'))
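Rank the candidate songs by their overall popularity, ignoring the listening history entirely (the popularity baseline).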
In [ ]:
rps_pop = []
hitrates_pop = {top: [] for top in TOPs}
aucs_pop = []
assert Y_test.shape == PU_test.shape
for j in range(Y_test.shape[1]):
    if (j+1) % 10 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()
    y1 = Y_test[:, j].toarray().reshape(-1)
    y2 = PU_test[:, j].toarray().reshape(-1)
    indices = np.where(y2 == 0)[0]
    y_true = y1[indices]
    y_pred = np.array([song2pop_test[index2song[ix]] for ix in indices])
    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_pop.append(rp)
    for top in TOPs:
        hitrates_pop[top].append(hr_dict[top])
    aucs_pop.append(auc)
print('\n%d / %d' % (len(rps_pop), PU_test.shape[1]))
In [ ]:
fig = plt.figure(figsize=[20, 5])
ax1 = plt.subplot(131)
ax1.hist(rps_pop, bins=100)
ax1.set_yscale('log')
ax1.set_title('R-Precision')
ax2 = plt.subplot(132)
ax2.hist(aucs_pop, bins=100)
ax2.set_yscale('log')
ax2.set_title('AUC')
pass
In [ ]:
pop_perf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_pop),
                                    'Hit-Rate': {top: np.mean(hitrates_pop[top]) for top in hitrates_pop},
                                    'AUC': np.mean(aucs_pop)}}}
pop_perf
In [ ]:
fperf_pop = os.path.join(data_dir, 'perf-pop-%d.pkl' % n_seed)
print(fperf_pop)
pkl.dump(pop_perf, open(fperf_pop, 'wb'))
pkl.load(open(fperf_pop, 'rb'))