In [ ]:
%matplotlib inline
import os
import sys
import gzip
import numpy as np
import pickle as pkl
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [ ]:
plt.style.use('seaborn')  # renamed to 'seaborn-v0_8' in Matplotlib >= 3.6
In [ ]:
datasets = [('30music', '30Music'), ('aotm2011', 'AotM-2011')]
TOPs = [5, 10, 20, 30, 50, 100, 200, 300, 500, 1000]
In [ ]:
metrics = [('Hit-Rate', 'HitRate@100'), ('AUC', 'AUC')]
In [ ]:
task = 1
base_dir = 'setting%d' % task
In [ ]:
algos = [('nsr', 'Multitask Classification'),
         ('br1', 'Logistic Regression'),
         ('pop', 'Popularity Ranking'),
         ('cagh', 'CAGH'), ('sagh', 'SAGH')]
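The plotting cells below index each `perf-<algo>.pkl` file as a nested dictionary of per-playlist test scores. An optional check of that layout (a minimal sketch, assuming only the structure implied by the indexing further down) is:
In [ ]:
# Optional sanity check: inspect one performance file, if present.
# Assumed layout (inferred from the plotting cells below):
#   perf[<dataset key>]['Test_All']['AUC']            -> per-playlist AUC scores
#   perf[<dataset key>]['Test_All']['Hit-Rate'][100]  -> per-playlist Hit-Rate@100 scores
fperf_example = os.path.join('data', datasets[0][0], base_dir, 'perf-%s.pkl' % algos[0][0])
if os.path.exists(fperf_example):
    with open(fperf_example, 'rb') as fd:
        perf_example = pkl.load(fd)
    print(sorted(perf_example.keys()))
    print(sorted(perf_example[datasets[0][0]]['Test_All'].keys()))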
In [ ]:
metric = metrics[0]
In [ ]:
nrows, ncols = len(algos), len(datasets)
colors = ["#2ecc71", "#9b59b6", "#3498db", "#34495e", "#ff1006", "#e74c3c"]
fig = plt.figure(figsize=[10, 20])
for j in range(len(datasets)):
    dataset = datasets[j]
    data_dir = 'data/%s/%s' % (dataset[0], base_dir)
    cliques = pkl.load(gzip.open(os.path.join(data_dir, 'cliques_trndev.pkl.gz'), 'rb'))
    Y_test = pkl.load(gzip.open(os.path.join(data_dir, 'Y_test.pkl.gz'), 'rb'))
    fperfs = [os.path.join(data_dir, 'perf-%s.pkl' % algo) for algo, _ in algos]
    perf_dicts = [pkl.load(open(fperf, 'rb')) if os.path.exists(fperf) else None for fperf in fperfs]
    npos = Y_test.sum(axis=0).A.reshape(-1)
    nz_ix = sorted(np.nonzero(npos)[0].tolist())  # test playlists with at least one positive
    # print(nz_ix)
    U = len(cliques)
    # map each user to its playlists, and each playlist (column of Y_test) back to its owning user
    u2pl = dict()
    pl2u = np.zeros(Y_test.shape[1], dtype=int)
    for u in range(U):
        clq = cliques[u]
        u2pl[u] = clq
        pl2u[clq] = u
    u2perf_dicts = []
    for i in range(len(perf_dicts)):
        perf = perf_dicts[i]
        if perf is None:
            continue  # skip algorithms whose results are not available
        assert len(perf[dataset[0]]['Test_All']['AUC']) == len(nz_ix)
        # group per-playlist scores by user
        u2perf = dict()
        for k in range(len(nz_ix)):
            u = pl2u[nz_ix[k]]
            num = perf[dataset[0]]['Test_All']['AUC'][k] \
                if metric[0] == 'AUC' else perf[dataset[0]]['Test_All']['Hit-Rate'][100][k]
            try:
                u2perf[u].append(num)
            except KeyError:
                u2perf[u] = [num]
        u2perf_dicts.append(u2perf)
        npl_user = [len(u2pl[u]) for u in sorted(u2perf)]
        mean_num = [np.mean(u2perf[u]) for u in sorted(u2perf)]
        ax = plt.subplot(nrows, ncols, i * len(datasets) + j + 1)
        ax.scatter(npl_user, mean_num, color=colors[i], alpha=0.5, s=20)
        lim = [-0.03, 1.03]
        ax.set_ylim(lim)
        if i == len(algos) - 1:
            ax.set_xlabel('#Playlists per User for Training')
        if j == 0:
            ax.set_ylabel('Mean %s per User' % metric[1])
        ax.set_title('%s (%s)' % (algos[i][1], dataset[1]))
plt.savefig('%s_per_user%d.svg' % (metric[0].replace('-', '').lower(), 0 if task == 1 else task - 2))
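The clique inversion above (building `pl2u` from `cliques`) is easiest to see on a toy example; the sketch below assumes, as the loop above does, that each entry of `cliques` holds the playlist indices owned by one user.
In [ ]:
# Toy illustration (not part of the experiment): 5 playlists owned by 2 users.
toy_cliques = [np.array([0, 2, 4]), np.array([1, 3])]
toy_pl2u = np.zeros(5, dtype=int)
for u, clq in enumerate(toy_cliques):
    toy_pl2u[clq] = u   # every playlist in the clique is assigned to user u
print(toy_pl2u)  # -> [0 1 0 1 0]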
In [ ]:
task = 3
base_dir = 'setting%d' % task
In [ ]:
algos = [('mtc', 'Multitask Classification'),
         ('pop', 'Popularity Ranking'),
         ('cagh', 'CAGH'), ('sagh', 'SAGH')]
In [ ]:
metric = metrics[0]
In [ ]:
nrows, ncols = len(algos), len(datasets)
colors = ["#2ecc71", "#9b59b6", "#3498db", "#34495e", "#ff1006", "#e74c3c"]
fig = plt.figure(figsize=[10, 20])
for j in range(len(datasets)):
    dataset = datasets[j]
    data_dir = 'data/%s/%s' % (dataset[0], base_dir)
    Y_train = pkl.load(gzip.open(os.path.join(data_dir, 'Y_train.pkl.gz'), 'rb'))
    Y_test = pkl.load(gzip.open(os.path.join(data_dir, 'Y_test.pkl.gz'), 'rb'))
    cliques_train = pkl.load(gzip.open(os.path.join(data_dir, 'cliques_train.pkl.gz'), 'rb'))
    cliques_all = pkl.load(gzip.open(os.path.join(data_dir, 'cliques_all.pkl.gz'), 'rb'))
    fperfs = [os.path.join(data_dir, 'perf-%s.pkl' % algo) for algo, _ in algos]
    perf_dicts = [pkl.load(open(fperf, 'rb')) if os.path.exists(fperf) else None for fperf in fperfs]
    # map every playlist (training and test columns) back to its owning user
    pl2u_train = np.zeros(Y_train.shape[1], dtype=int)
    pl2u_all = np.zeros(Y_train.shape[1] + Y_test.shape[1], dtype=int)
    U = len(cliques_all)
    assert U == len(cliques_train)
    for u in range(U):
        pl2u_train[cliques_train[u]] = u
        pl2u_all[cliques_all[u]] = u
    assert np.all(pl2u_train == pl2u_all[:Y_train.shape[1]])
    u2perf_dicts = []
    offset = Y_train.shape[1]  # test playlists are indexed after the training playlists
    for i in range(len(perf_dicts)):
        perf = perf_dicts[i]
        if perf is None:
            continue  # skip algorithms whose results are not available
        assert len(perf[dataset[0]]['Test_All']['AUC']) == Y_test.shape[1]
        # group per-playlist scores by user
        u2perf = dict()
        for k in range(Y_test.shape[1]):
            u = pl2u_all[k + offset]
            num = perf[dataset[0]]['Test_All']['AUC'][k] \
                if metric[0] == 'AUC' else perf[dataset[0]]['Test_All']['Hit-Rate'][100][k]
            try:
                u2perf[u].append(num)
            except KeyError:
                u2perf[u] = [num]
        u2perf_dicts.append(u2perf)
        npl_user = [len(cliques_train[u]) for u in sorted(u2perf)]
        mean_num = [np.mean(u2perf[u]) for u in sorted(u2perf)]
        ax = plt.subplot(nrows, ncols, i * len(datasets) + j + 1)
        ax.scatter(npl_user, mean_num, color=colors[i], alpha=0.5, s=20)
        lim = [-0.03, 1.03]
        ax.set_ylim(lim)
        if i == len(algos) - 1:
            ax.set_xlabel('#Playlists per User for Training')
        if j == 0:
            ax.set_ylabel('Mean %s per User' % metric[1])
        ax.set_title('%s (%s)' % (algos[i][1], dataset[1]))
plt.savefig('%s_per_user%d.svg' % (metric[0].replace('-', '').lower(), 0 if task == 1 else task - 2))