In [ ]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os, sys, time, gzip
import pickle as pkl
import numpy as np
import pandas as pd
from scipy.sparse import lil_matrix, issparse, hstack, vstack
import matplotlib.pyplot as plt
import seaborn as sns
In [ ]:
from models import MTC
from sklearn.linear_model import LogisticRegression
from tools import calc_metrics
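`calc_metrics` comes from this repository's `tools` module; judging by how its result is unpacked below, it is assumed to return a tuple of (R-Precision, a Hit-Rate dict keyed by the cut-offs in `tops`, AUC, NDCG). A minimal sketch of that assumed contract, for reference only (not the repository's implementation):
In [ ]:
from sklearn.metrics import roc_auc_score

def calc_metrics_sketch(y_true, y_pred, tops):
    """Assumed behaviour of tools.calc_metrics for one playlist (binary relevance)."""
    order = np.argsort(-y_pred)                    # songs ranked by predicted score, best first
    n_pos = int(y_true.sum())                      # number of relevant songs
    rp = y_true[order[:n_pos]].sum() / n_pos       # R-Precision: hits within the top-R
    hr = {top: y_true[order[:top]].sum() / n_pos for top in tops}  # Hit-Rate@K: hits@K / #relevant
    auc = roc_auc_score(y_true, y_pred)
    dcg = np.sum(y_true[order] / np.log2(np.arange(2, len(y_true) + 2)))
    idcg = np.sum(1. / np.log2(np.arange(2, n_pos + 2)))
    return rp, hr, auc, dcg / idcg                 # NDCG = DCG / ideal DCG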
In [ ]:
TOPs = [5, 10, 20, 30, 50, 100, 200, 300, 500, 1000]
In [ ]:
datasets = ['aotm2011', '30music']
In [ ]:
dix = 1
dataset_name = datasets[dix]
dataset_name
In [ ]:
data_dir = 'data/%s/setting1' % dataset_name
Y_trndev = pkl.load(gzip.open(os.path.join(data_dir, 'Y_train_dev.pkl.gz'), 'rb'))
Y_test = pkl.load(gzip.open(os.path.join(data_dir, 'Y_test.pkl.gz'), 'rb'))
song2pop = pkl.load(gzip.open('data/%s/setting2/song2pop.pkl.gz' % dataset_name, 'rb'))
songsets = pkl.load(gzip.open(os.path.join(data_dir, 'songs_train_dev_test_s1.pkl.gz'), 'rb'))
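Here `Y_trndev` and `Y_test` are assumed to be sparse songs-by-playlists indicator matrices (rows index songs, columns index playlists), so a column sum gives a playlist's length; `song2pop` maps a song ID to its popularity count.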
In [ ]:
songset_trndev = songsets['train_song_set'] + songsets['dev_song_set']
songset_test = songsets['test_song_set']
Recommend the longest playlist for each test song, breaking ties at random when more than one playlist is tied for the longest length.
In [ ]:
pl_indices = np.where(Y_test.sum(axis=0).A.reshape(-1) > 0)[0]  # playlists with at least one test song
lengths = Y_trndev.sum(axis=0).A.reshape(-1)[pl_indices]        # playlist lengths in train + dev
Y_pred = lil_matrix(Y_test.shape, dtype=float)
np.random.seed(1234567890)
for ix in range(len(songset_test)):
    sort_ix = np.argsort(-lengths)
    long_ix = [sort_ix[0]]
    longest = lengths[sort_ix[0]]
    # collect every playlist tied for the longest length
    for i in range(1, sort_ix.shape[0]):
        if lengths[sort_ix[i]] < longest:
            break
        else:
            long_ix.append(sort_ix[i])
    long_ix = np.random.permutation(long_ix)
    rec_ix = long_ix[0]
    Y_pred[ix, pl_indices[rec_ix]] = 1
    # count the recommendation towards the playlist's length so ties rotate
    lengths[rec_ix] += 1
Y_pred = Y_pred.tocsc()
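For reference, the tie-collecting scan over `sort_ix` above is equivalent to a single vectorised lookup (illustrative only; `tie_ix` is a hypothetical name, not used elsewhere):
In [ ]:
# all playlists currently tied for the maximum length, in one line
tie_ix = np.flatnonzero(lengths == lengths.max())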
In [ ]:
rps_longest = []
hitrates_longest = {top: [] for top in TOPs}
aucs_longest = []
ndcgs_longest = []
for j in range(Y_test.shape[1]):
    if (j+1) % 100 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()
    y_true = Y_test[:, j].toarray().reshape(-1)
    if y_true.sum() < 1:
        continue  # skip playlists with no song in the test set
    y_pred = Y_pred[:, j].A.reshape(-1)
    rp, hr_dict, auc, ndcg = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_longest.append(rp)
    for top in TOPs:
        hitrates_longest[top].append(hr_dict[top])
    aucs_longest.append(auc)
    ndcgs_longest.append(ndcg)
print('\n%d / %d' % (len(rps_longest), Y_test.shape[1]))
In [ ]:
longest_perf = {dataset_name:
                {'Test': {'R-Precision': np.mean(rps_longest),
                          'Hit-Rate': {top: np.mean(hitrates_longest[top]) for top in hitrates_longest},
                          'AUC': np.mean(aucs_longest),
                          'NDCG': np.mean(ndcgs_longest)}}}
longest_perf
In [ ]:
fperf_longest = os.path.join(data_dir, 'perf-longest.pkl')
print(fperf_longest)
with open(fperf_longest, 'wb') as fd:
    pkl.dump(longest_perf, fd)
pkl.load(open(fperf_longest, 'rb'))
Recommend the shortest playlist for each test song, breaking ties at random when more than one playlist is tied for the shortest length.
In [ ]:
pl_indices = np.where(Y_test.sum(axis=0).A.reshape(-1) > 0)[0]  # playlists with at least one test song
lengths = Y_trndev.sum(axis=0).A.reshape(-1)[pl_indices]        # playlist lengths in train + dev
Y_pred = lil_matrix(Y_test.shape, dtype=float)
np.random.seed(1234567890)
for ix in range(len(songset_test)):
    sort_ix = np.argsort(lengths)
    short_ix = [sort_ix[0]]
    shortest = lengths[sort_ix[0]]
    # collect every playlist tied for the shortest length
    for i in range(1, sort_ix.shape[0]):
        if lengths[sort_ix[i]] > shortest:
            break
        else:
            short_ix.append(sort_ix[i])
    short_ix = np.random.permutation(short_ix)
    rec_ix = short_ix[0]
    Y_pred[ix, pl_indices[rec_ix]] = 1
    # count the recommendation towards the playlist's length so ties rotate
    lengths[rec_ix] += 1
Y_pred = Y_pred.tocsc()
In [ ]:
rps_shortest = []
hitrates_shortest = {top: [] for top in TOPs}
aucs_shortest = []
ndcgs_shortest = []
for j in range(Y_test.shape[1]):
    if (j+1) % 100 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()
    y_true = Y_test[:, j].toarray().reshape(-1)
    if y_true.sum() < 1:
        continue
    y_pred = Y_pred[:, j].A.reshape(-1)
    rp, hr_dict, auc, ndcg = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_shortest.append(rp)
    for top in TOPs:
        hitrates_shortest[top].append(hr_dict[top])
    aucs_shortest.append(auc)
    ndcgs_shortest.append(ndcg)
print('\n%d / %d' % (len(rps_shortest), Y_test.shape[1]))
In [ ]:
shortest_perf = {dataset_name:
                 {'Test': {'R-Precision': np.mean(rps_shortest),
                           'Hit-Rate': {top: np.mean(hitrates_shortest[top]) for top in hitrates_shortest},
                           'AUC': np.mean(aucs_shortest),
                           'NDCG': np.mean(ndcgs_shortest)}}}
shortest_perf
In [ ]:
fperf_shortest = os.path.join(data_dir, 'perf-shortest.pkl')
print(fperf_shortest)
with open(fperf_shortest, 'wb') as fd:
    pkl.dump(shortest_perf, fd)
pkl.load(open(fperf_shortest, 'rb'))
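Recommend by song popularity: every playlist receives the same ranking of test songs, each scored by its overall popularity.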
In [ ]:
rps_poptest = []
hitrates_poptest = {top: [] for top in TOPs}
aucs_poptest = []
ndcgs_poptest = []
# popularity scores do not depend on the playlist, so compute them once
y_pred = np.asarray([song2pop[sid] for sid, _ in songset_test])
for j in range(Y_test.shape[1]):
    if (j+1) % 100 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()
    y_true = Y_test[:, j].toarray().reshape(-1)
    if y_true.sum() < 1:
        continue
    rp, hr_dict, auc, ndcg = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_poptest.append(rp)
    for top in TOPs:
        hitrates_poptest[top].append(hr_dict[top])
    aucs_poptest.append(auc)
    ndcgs_poptest.append(ndcg)
print('\n%d / %d' % (len(rps_poptest), Y_test.shape[1]))
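Distributions of the per-playlist R-Precision, AUC and NDCG scores for the popularity baseline: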
In [ ]:
fig = plt.figure(figsize=[20, 5])
ax1 = plt.subplot(131)
ax1.hist(rps_poptest, bins=100)
ax1.set_yscale('log')
ax1.set_title('R-Precision')
ax2 = plt.subplot(132)
ax2.hist(aucs_poptest, bins=100)
ax2.set_yscale('log')
ax2.set_title('AUC')
ax3 = plt.subplot(133)
ax3.hist(ndcgs_poptest, bins=100)
ax3.set_yscale('log')
ax3.set_title('NDCG')
pass
In [ ]:
poptest_perf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_poptest),
                                        'Hit-Rate': {top: np.mean(hitrates_poptest[top]) for top in hitrates_poptest},
                                        'AUC': np.mean(aucs_poptest),
                                        'NDCG': np.mean(ndcgs_poptest)}}}
poptest_perf
In [ ]:
fperf_poptest = os.path.join(data_dir, 'perf-poptest.pkl')
print(fperf_poptest)
with open(fperf_poptest, 'wb') as fd:
    pkl.dump(poptest_perf, fd)
pkl.load(open(fperf_poptest, 'rb'))
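Logistic regression on song popularity: for each playlist, fit a one-feature logistic regression on the train + dev songs and rank the test songs by its decision function.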
In [ ]:
rps_lrpop = []
hitrates_lrpop = {top: [] for top in TOPs}
aucs_lrpop = []
ndcgs_lrpop = []
nsong_trndev = len(songset_trndev)
nsong_test = len(songset_test)
# popularity features do not depend on the playlist, so compute them once
X_train = np.asarray([song2pop[sid] for sid, _ in songset_trndev]).reshape(nsong_trndev, 1)
X_test = np.asarray([song2pop[sid] for sid, _ in songset_test]).reshape(nsong_test, 1)
for j in range(Y_test.shape[1]):
    if (j+1) % 10 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()
    y_true = Y_test[:, j].toarray().reshape(-1)
    if y_true.sum() < 1:
        continue
    Y_train = Y_trndev[:, j].A.reshape(-1)
    if Y_train.sum() < 1:
        continue  # LogisticRegression needs both classes in the training labels
    clf = LogisticRegression()
    clf.fit(X_train, Y_train)
    y_pred = clf.decision_function(X_test).reshape(-1)
    rp, hr_dict, auc, ndcg = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_lrpop.append(rp)
    for top in TOPs:
        hitrates_lrpop[top].append(hr_dict[top])
    aucs_lrpop.append(auc)
    ndcgs_lrpop.append(ndcg)
print('\n%d / %d' % (len(rps_lrpop), Y_test.shape[1]))
In [ ]:
lrpop_perf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_lrpop),
                                      'Hit-Rate': {top: np.mean(hitrates_lrpop[top]) for top in hitrates_lrpop},
                                      'AUC': np.mean(aucs_lrpop),
                                      'NDCG': np.mean(ndcgs_lrpop)}}}
lrpop_perf
In [ ]:
fperf_lrpop = os.path.join(data_dir, 'perf-lrpop.pkl')
print(fperf_lrpop)
with open(fperf_lrpop, 'wb') as fd:
    pkl.dump(lrpop_perf, fd)
pkl.load(open(fperf_lrpop, 'rb'))