Playlist augmentation baselines


In [ ]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import os, sys, time, gzip
import pickle as pkl
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from scipy.sparse import lil_matrix, issparse

import matplotlib.pyplot as plt
import seaborn as sns

In [ ]:
from models import MTC
from sklearn.linear_model import LogisticRegression
# from tools import calc_RPrecision_HitRate
from tools import calc_metrics

In [ ]:
# Cut-off ranks at which Hit-Rate is evaluated.
TOPs = [5, 10, 20, 30, 50, 100, 200, 300, 500, 1000]

In [ ]:
# Available datasets; one is selected by index in the next cell.
datasets = ['aotm2011', '30music']

In [ ]:
# Index of the dataset to evaluate (0 -> 'aotm2011', 1 -> '30music').
dix = 0
dataset_name = datasets[dix]
dataset_name

In [ ]:
# X = pkl.load(gzip.open(os.path.join(base_dir, 'X_train_dev.pkl.gz'), 'rb'))
# Y = pkl.load(gzip.open(os.path.join(base_dir, 'Y.pkl.gz'), 'rb'))
# PU_test = pkl.load(gzip.open(os.path.join(base_dir, 'PU_test.pkl.gz'), 'rb'))
# playlists2 = pkl.load(gzip.open(os.path.join(base_dir, 'playlists_train_dev_test_s2.pkl.gz'), 'rb'))

In [ ]:
# Load the pre-processed "setting 2" data for the chosen dataset.
# n_seed selects which pre-generated split/seeding to use -- presumably the
# number of seed songs revealed per test playlist; TODO confirm against the
# data-generation script.
n_seed = 1
data_dir = 'data/%s/setting2' % dataset_name
X = pkl.load(gzip.open(os.path.join(data_dir, 'X_trndev_%d.pkl.gz' % n_seed), 'rb'))      # song features (train+dev)
Y = pkl.load(gzip.open(os.path.join(data_dir, 'Y.pkl.gz'), 'rb'))                          # song-playlist membership matrix
PU_test = pkl.load(gzip.open(os.path.join(data_dir, 'PU_test_%d.pkl.gz' % n_seed), 'rb'))  # seed (observed) songs of test playlists
song2pop = pkl.load(gzip.open(os.path.join(data_dir, 'song2pop.pkl.gz'), 'rb'))            # song id -> popularity count
playlists2 = pkl.load(gzip.open(os.path.join(data_dir, 'playlists_train_dev_test_s2_%d.pkl.gz' % n_seed), 'rb'))

In [ ]:
# Test playlists occupy the last columns of Y (aligned with PU_test columns).
Y_test = Y[:, -PU_test.shape[1]:]
print(Y_test.shape)
#Y_test.sum(axis=0)

In [ ]:
# All songs of the dataset (list of (song_id, ...) tuples), plus the
# row-index -> song-id lookup used throughout the evaluation loops.
all_songs = pkl.load(gzip.open(os.path.join(data_dir, 'all_songs.pkl.gz'), 'rb'))
index2song = dict(enumerate(sid for sid, _ in all_songs))

In [ ]:
# Reverse lookup: song id -> row index in Y / X_train.
song2index = {entry[0]: position for position, entry in enumerate(all_songs)}

In [ ]:
# Test-time popularity: start from the global counts and subtract one for
# every occurrence of a song in a held-out test playlist, so the held-out
# songs do not leak their own popularity into the baselines.
song2pop_test = song2pop.copy()

for held_playlist in playlists2['test_playlists_held']:
    for song_id in held_playlist:
        song2pop_test[song_id] -= 1

In [ ]:
# Per-song popularity as a single standardised (z-score) feature.
# FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24; the builtin
# float is the equivalent dtype (float64).
X_train = np.asarray([song2pop[sid] for sid, _ in all_songs], dtype=float).reshape(len(all_songs), 1)
X_train_mean = np.mean(X_train, axis=0)
X_train_std = np.std(X_train, axis=0) + 1e-6   # epsilon guards against zero variance
X_train -= X_train_mean
X_train /= X_train_std

# FIX: close the gzip handle deterministically -- an unclosed gzip writer can
# leave a truncated/corrupt file.
with gzip.open(os.path.join(data_dir, 'X_trndev_pop_%d.pkl.gz' % n_seed), 'wb') as fd:
    pkl.dump(X_train, fd)

In [ ]:
# Song -> artist mapping from the MSD (Million Song Dataset) metadata.
song2artist = pkl.load(gzip.open('data/msd/song2artist.pkl.gz', 'rb'))

In [ ]:
# Invert song2artist into artist -> [song ids]; iterating song ids in sorted
# order keeps each artist's song list sorted as well.
artist2song = dict()

for sid in sorted(song2artist):
    artist2song.setdefault(song2artist[sid], []).append(sid)

In [ ]:
# Sanity check: number of songs with a known artist | number of distinct artists.
print('{:,} | {:,}'.format(len(song2artist), len(artist2song)))

Collocated Artists - Greatest Hits (CAGH)

Compute the similarity of two artists $a_1$ and $a_2$ given a set of playlists $P$:
$$ \text{sim}(a_1, a_2) = \frac{\sum_{p \in P} \delta(a_1, p) \times \delta(a_2, p)} {\sqrt{\sum_{p \in P} \delta(a_1, p) \times \sum_{p \in P} \delta(a_2, p)}} $$ where $$ \delta(a, p) = \begin{cases} 1, \ \text{at least one song in playlist $p$ is from artist $a$}, \\ 0, \ \text{otherwise}. \end{cases} $$


In [ ]:
# All playlists of the dataset; each entry is a tuple whose first element is
# the list of song ids (the second element is unused here -- presumably the
# playlist owner/user; TODO confirm).
all_playlist = pkl.load(gzip.open('data/%s/%s-playlist.pkl.gz' % (dataset_name, dataset_name), 'rb'))

In [ ]:
# Distinct artists appearing in any playlist (songs without artist info are skipped).
all_artist = sorted({song2artist[sid]
                     for pl, _ in all_playlist
                     for sid in pl if sid in song2artist})

In [ ]:
# Position of each artist in the sorted artist list (row index into Delta).
artist2index = {artist: position for position, artist in enumerate(all_artist)}

In [ ]:
Na = len(all_artist)    # number of distinct artists
Np = len(all_playlist)  # number of playlists
# Delta[a, p] = 1 iff at least one song of artist a occurs in playlist p
# (the delta(a, p) indicator from the formula above).
# lil_matrix allows efficient incremental assignment while building.
# FIX: np.float was removed in NumPy 1.24; the builtin float (float64) is equivalent.
Delta = lil_matrix((Na, Np), dtype=float)
for j in range(Np):
    pl_artist = sorted(set([song2artist[sid] for sid in all_playlist[j][0] if sid in song2artist]))
    ix = [artist2index[aid] for aid in pl_artist]
    Delta[ix, j] = 1

In [ ]:
# Convert to CSR for fast matrix products, then compute the artist
# co-occurrence counts: ColloMat[a1, a2] = number of playlists containing
# songs of both artists.
Delta = Delta.tocsr()
Dsum = np.asarray(Delta.sum(axis=1)).ravel()   # playlists per artist
ColloMat = (Delta @ Delta.T).toarray()

# The diagonal counts an artist with itself, so it must equal Dsum.
assert np.allclose(ColloMat.diagonal(), Dsum)

In [ ]:
# Sanity check: one entry of Dsum per artist.
print(len(Dsum), len(all_artist))

In [ ]:
#type(ColloMat)

In [ ]:
# Normalisation term 1/sqrt(count_a1 * count_a2) for every artist pair --
# the denominator of the similarity formula above.
T1 = 1. / np.sqrt(Dsum)
NormMat = np.outer(T1, T1)

# Element-wise product yields sim(a1, a2) for all artist pairs.
WeightMat = ColloMat * NormMat

In [ ]:
# CAGH evaluation: for each test playlist, score every candidate song by its
# test-time popularity weighted by the summed collocation similarity between
# the song's artist and the playlist's seed artists.
rps_cagh = []
hitrates_cagh = {top: [] for top in TOPs}
aucs_cagh = []

assert Y_test.shape == PU_test.shape
for j in range(Y_test.shape[1]):
    # progress indicator (overwrites the same console line)
    sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
    sys.stdout.flush()
    y1 = Y_test[:, j].toarray().reshape(-1)   # ground-truth held-out songs of playlist j
    y2 = PU_test[:, j].toarray().reshape(-1)  # seed (observed) songs of playlist j
    indices = np.where(0 == y2)[0]            # candidate songs = everything not given as a seed
    y_true = y1[indices]
    
    # artists of the seed songs (songs without artist info are skipped)
    seeds = [index2song[ix] for ix in np.where(y2 > 0)[0]]
    artists = sorted(set([song2artist[sid] for sid in seeds if sid in song2artist]))
    artists_ix = [artist2index[aid] for aid in artists]
    
    y_pred = np.zeros(y1.shape)
    # only candidates with a known artist can receive a non-zero score
    ix_legal = [ix for ix in indices if index2song[ix] in song2artist]
    sid_legal = [index2song[ix] for ix in ix_legal]
    aix_legal = [artist2index[song2artist[sid]] for sid in sid_legal]
    pop_legal = np.asarray([song2pop_test[sid] for sid in sid_legal])
    # score = popularity * sum of artist-artist similarities w.r.t. the seed artists
    y_pred[ix_legal] = pop_legal * np.asarray([WeightMat[aix, artists_ix].sum() for aix in aix_legal])
    
#     for ix in ix_legal:
#         sid = index2song[ix]
#         aix = artist2index[song2artist[sid]]
#         pop = song2pop_test[sid]
#         y_pred[ix] = pop * WeightMat[aix, artists_ix].sum()
    
    # restrict scores to the candidate set (same positions as y_true)
    y_pred = y_pred[indices]
    
    # rp, hr_dict = calc_RPrecision_HitRate(y_true, y_pred, tops=TOPs)
    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_cagh.append(rp)
    for top in TOPs:
        hitrates_cagh[top].append(hr_dict[top])
    aucs_cagh.append(auc)
    
print('\n%d / %d' % (len(rps_cagh), PU_test.shape[1]))

In [ ]:
# Distribution of per-playlist R-Precision and AUC for CAGH (log-scaled counts).
fig = plt.figure(figsize=[20, 5])
ax1 = fig.add_subplot(1, 3, 1)
ax1.hist(rps_cagh, bins=100)
ax1.set_yscale('log')
ax1.set_title('R-Precision')
ax2 = fig.add_subplot(1, 3, 2)
ax2.hist(aucs_cagh, bins=100)
ax2.set_yscale('log')
ax2.set_title('AUC')
pass

In [ ]:
# Aggregate CAGH test performance: mean over all test playlists.
cagh_perf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_cagh), 
                                     'Hit-Rate': {top: np.mean(hitrates_cagh[top]) for top in hitrates_cagh},
                                     'AUC': np.mean(aucs_cagh),}}}
cagh_perf

In [ ]:
# Persist the CAGH results and read them back as a sanity check.
fperf_cagh = os.path.join(data_dir, 'perf-cagh-%d.pkl' % n_seed)
print(fperf_cagh)
# FIX: use context managers -- the original open(...) handles were never closed.
with open(fperf_cagh, 'wb') as fd:
    pkl.dump(cagh_perf, fd)
with open(fperf_cagh, 'rb') as fd:
    loaded_perf = pkl.load(fd)
loaded_perf

Same Artists - Greatest Hits (SAGH)

Recommend songs by the artists appearing in the listening history (the seed songs), ranked by song popularity.


In [ ]:
# SAGH evaluation: candidates are (non-seed) songs by the seed artists,
# ranked by test-time popularity; all other songs score zero.
rps_sagh = []
hitrates_sagh = {top: [] for top in TOPs}
aucs_sagh = []

assert Y_test.shape == PU_test.shape
for j in range(Y_test.shape[1]):
    if (j+1) % 10 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()
    y1 = Y_test[:, j].toarray().reshape(-1)   # ground-truth held-out songs
    y2 = PU_test[:, j].toarray().reshape(-1)  # seed (observed) songs
    indices = np.where(0 == y2)[0]            # candidate songs = non-seed songs
    y_true = y1[indices]
    
    # artists of the seed songs (songs without artist info are skipped)
    seeds = [index2song[ix] for ix in np.where(y2 > 0)[0]]
    artists = sorted(set([song2artist[sid] for sid in seeds if sid in song2artist]))
    y_pred = np.zeros(y1.shape)
    candidates = []
    for a in artists:
        candidates += artist2song[a]
    # keep only songs that exist in this dataset and are not seeds
    candidates = set(candidates) & set([index2song[ix] for ix in indices])
    
    if len(candidates) > 0:
        for sid in candidates:
            ix = song2index[sid]
            y_pred[ix] = song2pop_test[sid]  # rank by test-time popularity

    y_pred = y_pred[indices]
    # rp, hr_dict = calc_RPrecision_HitRate(y_true, y_pred, tops=TOPs)
    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_sagh.append(rp)
    for top in TOPs:
        hitrates_sagh[top].append(hr_dict[top])
    aucs_sagh.append(auc)
    
print('\n%d / %d' % (len(rps_sagh), PU_test.shape[1]))

In [ ]:
# Distribution of per-playlist R-Precision and AUC for SAGH (log-scaled counts).
fig = plt.figure(figsize=[20, 5])
ax1 = fig.add_subplot(1, 3, 1)
ax1.hist(rps_sagh, bins=100)
ax1.set_yscale('log')
ax1.set_title('R-Precision')
ax2 = fig.add_subplot(1, 3, 2)
ax2.hist(aucs_sagh, bins=100)
ax2.set_yscale('log')
ax2.set_title('AUC')
pass

In [ ]:
# Aggregate SAGH test performance: mean over all test playlists.
sagh_perf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_sagh), 
                                     'Hit-Rate': {top: np.mean(hitrates_sagh[top]) for top in hitrates_sagh},
                                     'AUC': np.mean(aucs_sagh),}}}
sagh_perf

In [ ]:
# Persist the SAGH results and read them back as a sanity check.
fperf_sagh = os.path.join(data_dir, 'perf-sagh-%d.pkl' % n_seed)
print(fperf_sagh)
# FIX: use context managers -- the original open(...) handles were never closed.
with open(fperf_sagh, 'wb') as fd:
    pkl.dump(sagh_perf, fd)
with open(fperf_sagh, 'rb') as fd:
    loaded_perf = pkl.load(fd)
loaded_perf

Logistic Regression using song popularity as the only feature


In [ ]:
# Logistic-regression baseline: one standardised feature per song (its
# test-time popularity); a fresh classifier is fit per playlist with the
# seed songs as positives, and candidates are ranked by decision value.
rps_lrpop = []
hitrates_lrpop = {top: [] for top in TOPs}
aucs_lrpop = []

# FIX: np.float was removed in NumPy 1.24; the builtin float (float64) is equivalent.
X_train = np.asarray([song2pop_test[sid] for sid, _ in all_songs], dtype=float).reshape(len(all_songs), 1)
X_train_mean = np.mean(X_train, axis=0)
X_train_std = np.std(X_train, axis=0) + 1e-6   # epsilon guards against zero variance
X_train -= X_train_mean
X_train /= X_train_std
for j in range(Y_test.shape[1]):
    if (j+1) % 10 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()
    y1 = Y_test[:, j].toarray().reshape(-1)   # ground-truth held-out songs
    y2 = PU_test[:, j].toarray().reshape(-1)  # seed (observed) songs
    indices = np.where(0 == y2)[0]            # candidate songs = non-seed songs
    y_true = y1[indices]
    
    # seeds are the positive class; everything else is negative
    clf = LogisticRegression()
    clf.fit(X_train, y2)
    X_test = X_train   # scoring reuses the same popularity features
    y_pred = clf.decision_function(X_test)[indices]
    
    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_lrpop.append(rp)
    for top in TOPs:
        hitrates_lrpop[top].append(hr_dict[top])
    aucs_lrpop.append(auc)
    
print('\n%d / %d' % (len(rps_lrpop), Y_test.shape[1]))

In [ ]:
# Distribution of per-playlist R-Precision and AUC for LR-pop (log-scaled counts).
fig = plt.figure(figsize=[20, 5])
ax1 = fig.add_subplot(1, 3, 1)
ax1.hist(rps_lrpop, bins=100)
ax1.set_yscale('log')
ax1.set_title('R-Precision')
ax2 = fig.add_subplot(1, 3, 2)
ax2.hist(aucs_lrpop, bins=100)
ax2.set_yscale('log')
ax2.set_title('AUC')
pass

In [ ]:
# Aggregate LR-pop test performance: mean over all test playlists.
lrpop_perf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_lrpop), 
                                      'Hit-Rate': {top: np.mean(hitrates_lrpop[top]) for top in hitrates_lrpop},
                                      'AUC': np.mean(aucs_lrpop),}}}
lrpop_perf

In [ ]:
# Persist the LR-pop results and read them back as a sanity check.
fperf_lrpop = os.path.join(data_dir, 'perf-lrpop-%d.pkl' % n_seed)
print(fperf_lrpop)
# FIX: use context managers -- the original open(...) handles were never closed.
with open(fperf_lrpop, 'wb') as fd:
    pkl.dump(lrpop_perf, fd)
with open(fperf_lrpop, 'rb') as fd:
    loaded_perf = pkl.load(fd)
loaded_perf

Popularity based recommendation


In [ ]:
# Popularity baseline: every candidate song is scored by its test-time
# popularity, independent of the playlist's seed songs.
rps_pop = []
hitrates_pop = {top: [] for top in TOPs}
aucs_pop = []

assert Y_test.shape == PU_test.shape
n_playlists = Y_test.shape[1]
for col in range(n_playlists):
    if (col + 1) % 10 == 0:
        sys.stdout.write('\r%d / %d' % (col + 1, n_playlists))
        sys.stdout.flush()
    truth_col = Y_test[:, col].toarray().reshape(-1)  # ground-truth held-out songs
    seed_col = PU_test[:, col].toarray().reshape(-1)  # seed (observed) songs
    candidate_ix = np.where(seed_col == 0)[0]         # candidates = non-seed songs
    y_true = truth_col[candidate_ix]

    y_pred = np.array([song2pop_test[index2song[ix]] for ix in candidate_ix])

    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_pop.append(rp)
    for top in TOPs:
        hitrates_pop[top].append(hr_dict[top])
    aucs_pop.append(auc)

print('\n%d / %d' % (len(rps_pop), PU_test.shape[1]))

In [ ]:
# Distribution of per-playlist R-Precision and AUC for the popularity
# baseline (log-scaled counts).
fig = plt.figure(figsize=[20, 5])
ax1 = fig.add_subplot(1, 3, 1)
ax1.hist(rps_pop, bins=100)
ax1.set_yscale('log')
ax1.set_title('R-Precision')
ax2 = fig.add_subplot(1, 3, 2)
ax2.hist(aucs_pop, bins=100)
ax2.set_yscale('log')
ax2.set_title('AUC')
pass

In [ ]:
# Aggregate popularity-baseline test performance: mean over all test playlists.
pop_perf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_pop), 
                                    'Hit-Rate': {top: np.mean(hitrates_pop[top]) for top in hitrates_pop},
                                    'AUC': np.mean(aucs_pop),}}}
pop_perf

In [ ]:
# Persist the popularity-baseline results and read them back as a sanity check.
fperf_pop = os.path.join(data_dir, 'perf-pop-%d.pkl' % n_seed)
print(fperf_pop)
# FIX: use context managers -- the original open(...) handles were never closed.
with open(fperf_pop, 'wb') as fd:
    pkl.dump(pop_perf, fd)
with open(fperf_pop, 'rb') as fd:
    loaded_perf = pkl.load(fd)
loaded_perf