Baselines - playlist generation for known users


In [1]:
%matplotlib inline

import os, sys, time, gzip
import pickle as pkl
import numpy as np
from scipy.sparse import lil_matrix, issparse, csc_matrix, csr_matrix
import torch

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [2]:
# from tools import calc_RPrecision_HitRate
from tools import calc_metrics, diversity, softmax
# from tools import calc_Precision_Recall

In [3]:
TOPs = [5, 10, 20, 30, 50, 100, 200, 300, 500, 700, 1000]

In [4]:
datasets = ['aotm2011', '30music']

In [5]:
dix = 1
dataset_name = datasets[dix]
dataset_name


Out[5]:
'30music'

In [6]:
data_dir = 'data/%s/coldstart/setting3' % dataset_name
X = pkl.load(gzip.open(os.path.join(data_dir, 'X.pkl.gz'), 'rb'))
Y_train = pkl.load(gzip.open(os.path.join(data_dir, 'Y_train.pkl.gz'), 'rb'))
Y_test = pkl.load(gzip.open(os.path.join(data_dir, 'Y_test.pkl.gz'), 'rb'))
song2pop_train = pkl.load(gzip.open(os.path.join(data_dir, 'song2pop_train.pkl.gz'), 'rb'))

In [7]:
playlists3 = pkl.load(gzip.open(os.path.join(data_dir, 'playlists_train_test_s3.pkl.gz'), 'rb'))
train_playlists = playlists3['train_playlists']
test_playlists = playlists3['test_playlists']
user2songs = dict()

for pl, u in train_playlists:
    try:
        user2songs[u].update(set(pl))
    except KeyError:
        user2songs[u] = set(pl)

In [8]:
all_songs = pkl.load(gzip.open(os.path.join(data_dir, 'all_songs.pkl.gz'), 'rb'))
index2song = {ix: sid for ix, (sid, _) in enumerate(all_songs)}

In [9]:
song2index = {sid: ix for ix, (sid, _) in enumerate(all_songs)}

In [10]:
_song2artist = pkl.load(gzip.open('data/msd/song2artist.pkl.gz', 'rb'))
song2artist = {sid: _song2artist[sid] for sid, _ in all_songs if sid in _song2artist}

In [11]:
artist2songs = dict()

for sid in sorted(song2artist):
    artist = song2artist[sid]
    try:
        artist2songs[artist].append(sid)
    except KeyError:
        artist2songs[artist] = [sid]

In [12]:
print('{:,} | {:,}'.format(len(song2artist), len(artist2songs)))


45,468 | 9,981

In [13]:
song2genre = pkl.load(gzip.open('data/msd/song2genre.pkl.gz', 'rb'))

In [14]:
song2pop = pkl.load(gzip.open(os.path.join(data_dir, 'song2pop.pkl.gz'), 'rb'))

Collocated Artists - Greatest Hits (CAGH)

Compute the similarity of two artists $a_1$ and $a_2$ given a set of playlists $P$:
$$ \text{sim}(a_1, a_2) = \frac{\sum_{p \in P} \delta(a_1, p) \times \delta(a_2, p)} {\sqrt{\sum_{p \in P} \delta(a_1, p) \times \sum_{p \in P} \delta(a_2, p)}} $$ where $$ \delta(a, p) = \begin{cases} 1, \ \text{at least one song in playlist $p$ is from artist $a$}, \\ 0, \ \text{otherwise}. \end{cases} $$

Recommend songs according to their popularity, weighted by the similarity between each song's artist and the artists in the user's listening history.
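
To make the formula concrete, here is a minimal sketch on made-up toy data (`Delta_toy` and `artist_sim` are hypothetical names; the `Delta` matrix built in the cells below plays the same role at scale):

import numpy as np

# Toy indicator matrix: Delta_toy[a, p] = 1 iff artist a appears in playlist p
# (3 hypothetical artists, 4 hypothetical playlists).
Delta_toy = np.array([[1, 1, 0, 1],
                      [1, 0, 0, 1],
                      [0, 1, 1, 0]], dtype=float)

def artist_sim(Delta, a1, a2):
    co = np.dot(Delta[a1], Delta[a2])  # number of playlists containing both artists
    return co / np.sqrt(Delta[a1].sum() * Delta[a2].sum())

print(artist_sim(Delta_toy, 0, 1))  # 2 / sqrt(3 * 2) ≈ 0.8165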


In [15]:
all_artist = sorted(set([song2artist[sid] for pl, _ in train_playlists for sid in pl if sid in song2artist]))

In [16]:
artist2index = {aid: ix for ix, aid in enumerate(all_artist)}

In [17]:
Na = len(all_artist)
Np = len(train_playlists)
Delta = lil_matrix((Na, Np), dtype=float)  # Delta[a, j] = 1 iff artist a appears in playlist j
for j in range(Np):
    pl_artist = sorted(set([song2artist[sid] for sid in train_playlists[j][0] if sid in song2artist]))
    ix = [artist2index[aid] for aid in pl_artist]
    Delta[ix, j] = 1

In [18]:
Delta = Delta.tocsr()
Dsum = Delta.sum(axis=1).A.reshape(-1)  # Dsum[a]: number of playlists containing artist a
ColloMat = Delta.dot(Delta.T).A         # ColloMat[a1, a2]: number of playlists containing both artists

assert np.all(np.isclose(ColloMat.diagonal(), Dsum))

In [19]:
print(len(Dsum), len(all_artist))


9981 9981

In [20]:
#type(ColloMat)

In [21]:
T1 = 1. / np.sqrt(Dsum)
NormMat = np.dot(T1.reshape(Na, 1), T1.reshape(1, Na))  # 1 / sqrt(Dsum[a1] * Dsum[a2])

WeightMat = np.multiply(ColloMat, NormMat)  # WeightMat[a1, a2] = sim(a1, a2)

In [22]:
rps_cagh = []
hitrates_cagh = {top: [] for top in TOPs}
aucs_cagh = []
spreads_cagh = []
novelties_cagh = {top: dict() for top in TOPs}
ptops_cagh = []
# artist_diversities_cagh = {top: [] for top in TOPs}
# genre_diversities_cagh = {top: [] for top in TOPs}
np.random.seed(0)

assert Y_test.shape[1] == len(test_playlists)

sid_legal = [sid for sid, _ in all_songs if sid in song2artist]
aix_legal = [artist2index[song2artist[sid]] for sid in sid_legal]
pop_legal = np.asarray([song2pop_train[sid] for sid in sid_legal])
ix_legal = [song2index[sid] for sid in sid_legal]

prev_u = None
prev_y = None

for j in range(Y_test.shape[1]):
    sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
    sys.stdout.flush()
    y_true = Y_test[:, j].A.reshape(-1)
    
    u = test_playlists[j][1]
    if prev_u is None or prev_u != u:
        artists = sorted(set([song2artist[sid] for sid in user2songs[u] if sid in song2artist]))
        artists_ix = [artist2index[aid] for aid in artists]
        y_pred = np.zeros(y_true.shape)
        y_pred[ix_legal] = np.log(pop_legal) * np.asarray([WeightMat[aix, artists_ix].sum() for aix in aix_legal])

        # for ix in ix_legal:
        #     sid = index2song[ix]
        #     aix = artist2index[song2artist[sid]]
        #     pop = song2pop_test[sid]
        #     y_pred[ix] = pop * WeightMat[aix, artists_ix].sum()
        
        prev_u = u
        prev_y = y_pred
    else:
        y_pred = prev_y

    # rp, hr_dict = calc_RPrecision_HitRate(y_true, y_pred, tops=TOPs)
    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_cagh.append(rp)
    for top in TOPs:
        hitrates_cagh[top].append(hr_dict[top])
    aucs_cagh.append(auc)
    
    # spread
    y_pred_prob = softmax(y_pred)
    spreads_cagh.append(-np.dot(y_pred_prob, np.log(y_pred_prob)))

    # novelty
    sortix = np.argsort(-y_pred)
    for top in TOPs:
        nov = np.mean([-np.log2(song2pop[index2song[ix]]) for ix in sortix[:top]])
        try:
            novelties_cagh[top][u].append(nov)
        except KeyError:
            novelties_cagh[top][u] = [nov]
    
    # PTop: (#pos ranked above the top-ranked negative) / #pos
    assert y_true.dtype == bool
    npos = y_true.sum()
    assert npos > 0
    negIx = ~y_true
    negMax = y_pred[negIx].max()
    pt = (y_pred[y_true] > negMax).sum() / npos
    ptops_cagh.append(pt)
    
    # artist/genre diversity
#     for top in TOPs:
#         artist_vec = np.array([song2artist[index2song[ix]] if index2song[ix] in song2artist
#                                else str(np.random.rand()) for ix in sortix[:top]])
#         genre_vec = np.array([song2genre[index2song[ix]] if index2song[ix] in song2genre \
#                               else str(np.random.rand()) for ix in sortix[:top]])
#         artist_diversities_cagh[top].append( diversity(artist_vec) )
#         genre_diversities_cagh[top].append( diversity(genre_vec) )

print('\n%d / %d' % (len(rps_cagh), Y_test.shape[1]))


2195 / 2195
2195 / 2195

In [23]:
# fig = plt.figure(figsize=[20, 5])
# ax1 = plt.subplot(131)
# ax1.hist(rps_cagh, bins=100)
# ax1.set_yscale('log')
# ax1.set_title('R-Precision')
# #ax.set_xlim(0, xmax)
# ax2 = plt.subplot(132)
# ax2.hist(aucs_cagh, bins=100)
# ax2.set_yscale('log')
# ax2.set_title('AUC')
# pass

In [24]:
cagh_perf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_cagh), 
                                     'Hit-Rate': {top: np.mean(hitrates_cagh[top]) for top in TOPs},
                                     'AUC': np.mean(aucs_cagh),
                                     'Spread': np.mean(spreads_cagh),
                                     'Novelty': {t: np.mean([np.mean(novelties_cagh[t][u]) 
                                                             for u in novelties_cagh[t]]) for t in TOPs},
                                     'PTop': np.mean(ptops_cagh),
                                     # 'Artist-Diversity': {t: np.mean(artist_diversities_cagh[t]) for t in TOPs},
                                     # 'Genre-Diversity': {t: np.mean(genre_diversities_cagh[t]) for t in TOPs}},
                                    },
                            'Test_All': {'R-Precision': rps_cagh, 
                                         'Hit-Rate': {top: hitrates_cagh[top] for top in TOPs},
                                         'AUC': aucs_cagh,
                                         'Spread': spreads_cagh,
                                         'Novelty': novelties_cagh,
                                         'PTop': ptops_cagh,
                                         # 'Artist-Diversity': artist_diversities_cagh,
                                         # 'Genre-Diversity': genre_diversities_cagh}}}
                                        }}}
cagh_perf[dataset_name]['Test']


Out[24]:
{'R-Precision': 0.04319798205721797,
 'Hit-Rate': {5: 0.02521471715758077,
  10: 0.043921921531783446,
  20: 0.07366269688790775,
  30: 0.09581443548438519,
  50: 0.13063921817169946,
  100: 0.20099355056502818,
  200: 0.2939449086419079,
  300: 0.3547684460173996,
  500: 0.44579655354978615,
  700: 0.5075424610930664,
  1000: 0.576546531767024},
 'AUC': 0.9483727023751518,
 'Spread': 5.790900154856364,
 'Novelty': {5: -7.058046504828398,
  10: -6.725432177526543,
  20: -6.3156666454542805,
  30: -6.0401671802802674,
  50: -5.740252835295074,
  100: -5.5421073371922995,
  200: -5.457428058333383,
  300: -5.390313360005151,
  500: -5.266346105989974,
  700: -5.154919662139392,
  1000: -5.008269281341808},
 'PTop': 0.009496450024095335}

In [25]:
fperf_cagh = os.path.join(data_dir, 'perf-cagh.pkl')
print(fperf_cagh)
pkl.dump(cagh_perf, open(fperf_cagh, 'wb'))
pkl.load(open(fperf_cagh, 'rb'))[dataset_name]['Test']


data/30music/coldstart/setting3/perf-cagh.pkl
Out[25]:
{'R-Precision': 0.04319798205721797,
 'Hit-Rate': {5: 0.02521471715758077,
  10: 0.043921921531783446,
  20: 0.07366269688790775,
  30: 0.09581443548438519,
  50: 0.13063921817169946,
  100: 0.20099355056502818,
  200: 0.2939449086419079,
  300: 0.3547684460173996,
  500: 0.44579655354978615,
  700: 0.5075424610930664,
  1000: 0.576546531767024},
 'AUC': 0.9483727023751518,
 'Spread': 5.790900154856364,
 'Novelty': {5: -7.058046504828398,
  10: -6.725432177526543,
  20: -6.3156666454542805,
  30: -6.0401671802802674,
  50: -5.740252835295074,
  100: -5.5421073371922995,
  200: -5.457428058333383,
  300: -5.390313360005151,
  500: -5.266346105989974,
  700: -5.154919662139392,
  1000: -5.008269281341808},
 'PTop': 0.009496450024095335}

Same Artists - Greatest Hits (SAGH)

Recommend the songs of artists appearing in the user's listening history, ranked by song popularity.
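
A minimal sketch of this scoring rule on made-up toy data (`artist2songs_toy`, `song2pop_toy` and `sagh_scores` are hypothetical stand-ins for the notebook's `artist2songs` and `song2pop_train`):

import numpy as np

artist2songs_toy = {'A': ['s1', 's2'], 'B': ['s3']}
song2pop_toy = {'s1': 100, 's2': 5, 's3': 42, 's4': 900}

def sagh_scores(history_artists, all_song_ids):
    # log-popularity for songs by artists in the listening history, 0 for every other song
    scores = {sid: 0. for sid in all_song_ids}
    for a in history_artists:
        for sid in artist2songs_toy.get(a, []):
            scores[sid] = np.log(song2pop_toy[sid])
    return scores

# a user who listened to artist 'A': only s1 and s2 get non-zero scores,
# even though s4 is globally the most popular song
print(sagh_scores({'A'}, ['s1', 's2', 's3', 's4']))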


In [15]:
rps_sagh = []
hitrates_sagh = {top: [] for top in TOPs}
aucs_sagh = []
spreads_sagh = []
novelties_sagh = {top: dict() for top in TOPs}
ptops_sagh = []
# artist_diversities_sagh = {top: [] for top in TOPs}
# genre_diversities_sagh = {top: [] for top in TOPs}
np.random.seed(0)

assert Y_test.shape[1] == len(test_playlists)
for j in range(Y_test.shape[1]):
    if (j+1) % 100 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()
    y_true = Y_test[:, j].A.reshape(-1)
    y_pred = np.zeros(y_true.shape)
    
    u = test_playlists[j][1]
    artists = sorted(set([song2artist[sid] for sid in user2songs[u] if sid in song2artist]))
    candidates = []
    for a in artists:
        candidates += artist2songs[a]
    candidates = sorted(set(candidates))
    if len(candidates) > 0:
        for sid in candidates:
            ix = song2index[sid]
            y_pred[ix] = np.log(song2pop_train[sid])

    # rp, hr_dict = calc_RPrecision_HitRate(y_true, y_pred, tops=TOPs)
    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_sagh.append(rp)
    for top in TOPs:
        hitrates_sagh[top].append(hr_dict[top])
    aucs_sagh.append(auc)
    
    # spread
    y_pred_prob = softmax(y_pred)
    spreads_sagh.append(-np.dot(y_pred_prob, np.log(y_pred_prob)))

    # novelty
    sortix = np.argsort(-y_pred)
    for top in TOPs:
        nov = np.mean([-np.log2(song2pop[index2song[ix]]) for ix in sortix[:top]])
        try:
            novelties_sagh[top][u].append(nov)
        except KeyError:
            novelties_sagh[top][u] = [nov]
            
    # PTop: (#pos ranked above the top-ranked negative) / #pos
    assert y_true.dtype == bool
    npos = y_true.sum()
    assert npos > 0
    negIx = ~y_true
    negMax = y_pred[negIx].max()
    pt = (y_pred[y_true] > negMax).sum() / npos
    ptops_sagh.append(pt)
    
    # artist/genre diversity
#     for top in TOPs:
#         artist_vec = np.array([song2artist[index2song[ix]] if index2song[ix] in song2artist
#                                else str(np.random.rand()) for ix in sortix[:top]])
#         genre_vec = np.array([song2genre[index2song[ix]] if index2song[ix] in song2genre \
#                               else str(np.random.rand()) for ix in sortix[:top]])
#         artist_diversities_sagh[top].append( diversity(artist_vec) )
#         genre_diversities_sagh[top].append( diversity(genre_vec) )
print('\n%d / %d' % (len(rps_sagh), Y_test.shape[1]))


2100 / 2195
2195 / 2195

In [16]:
# fig = plt.figure(figsize=[20, 5])
# ax1 = plt.subplot(131)
# ax1.hist(rps_sagh, bins=100)
# ax1.set_yscale('log')
# ax1.set_title('R-Precision')
# #ax.set_xlim(0, xmax)
# ax2 = plt.subplot(132)
# ax2.hist(aucs_sagh, bins=100)
# ax2.set_yscale('log')
# ax2.set_title('AUC')
# pass

In [17]:
sagh_perf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_sagh), 
                                     'Hit-Rate': {top: np.mean(hitrates_sagh[top]) for top in TOPs},
                                     'AUC': np.mean(aucs_sagh),
                                     'Spread': np.mean(spreads_sagh),
                                     'Novelty': {t: np.mean([np.mean(novelties_sagh[t][u]) 
                                                             for u in novelties_sagh[t]]) for t in TOPs},
                                     'PTop': np.mean(ptops_sagh),
                                     # 'Artist-Diversity': {t: np.mean(artist_diversities_sagh[t]) for t in TOPs},
                                     # 'Genre-Diversity': {t: np.mean(genre_diversities_sagh[t]) for t in TOPs}},
                                    },
                            'Test_All': {'R-Precision': rps_sagh, 
                                         'Hit-Rate': {top: hitrates_sagh[top] for top in TOPs},
                                         'AUC': aucs_sagh,
                                         'Spread': spreads_sagh,
                                         'Novelty': novelties_sagh,
                                         'PTop': ptops_sagh,
                                         # 'Artist-Diversity': artist_diversities_sagh,
                                         # 'Genre-Diversity': genre_diversities_sagh}}}
                                        }}}
sagh_perf[dataset_name]['Test']


Out[17]:
{'R-Precision': 0.0452645392008858,
 'Hit-Rate': {5: 0.02635875113186022,
  10: 0.045496088620980986,
  20: 0.07606360445870414,
  30: 0.09861250709516567,
  50: 0.13239572961049764,
  100: 0.1813284612206776,
  200: 0.22717142363225395,
  300: 0.25187588947116424,
  500: 0.2757905842759898,
  700: 0.28873514860458266,
  1000: 0.2995276029801112},
 'AUC': 0.6450527914435169,
 'Spread': 10.326562735197296,
 'Novelty': {5: -7.264256802379904,
  10: -6.905920319602588,
  20: -6.423426350202923,
  30: -6.072672093643402,
  50: -5.540663129776558,
  100: -4.685427723970891,
  200: -3.7278987461349784,
  300: -3.18750649526425,
  500: -2.5793628921778686,
  700: -2.2859353866779566,
  1000: -2.033143122718425},
 'PTop': 0.008706126423116696}

In [18]:
fperf_sagh = os.path.join(data_dir, 'perf-sagh.pkl')
print(fperf_sagh)
pkl.dump(sagh_perf, open(fperf_sagh, 'wb'))
pkl.load(open(fperf_sagh, 'rb'))[dataset_name]['Test']


data/30music/coldstart/setting3/perf-sagh.pkl
Out[18]:
{'R-Precision': 0.0452645392008858,
 'Hit-Rate': {5: 0.02635875113186022,
  10: 0.045496088620980986,
  20: 0.07606360445870414,
  30: 0.09861250709516567,
  50: 0.13239572961049764,
  100: 0.1813284612206776,
  200: 0.22717142363225395,
  300: 0.25187588947116424,
  500: 0.2757905842759898,
  700: 0.28873514860458266,
  1000: 0.2995276029801112},
 'AUC': 0.6450527914435169,
 'Spread': 10.326562735197296,
 'Novelty': {5: -7.264256802379904,
  10: -6.905920319602588,
  20: -6.423426350202923,
  30: -6.072672093643402,
  50: -5.540663129776558,
  100: -4.685427723970891,
  200: -3.7278987461349784,
  300: -3.18750649526425,
  500: -2.5793628921778686,
  700: -2.2859353866779566,
  1000: -2.033143122718425},
 'PTop': 0.008706126423116696}

Popularity-based recommendation

Recommend the same ranking to every test playlist: all songs ordered by their play counts in the training set.

In [15]:
rps_pop = []
hitrates_pop = {top: [] for top in TOPs}
aucs_pop = []
novelties_pop = {top: dict() for top in TOPs}
ptops_pop = []
# artist_diversities_pop = {top: [] for top in TOPs}
# genre_diversities_pop = {top: [] for top in TOPs}
np.random.seed(0)

y_pred = np.array([song2pop_train[index2song[ix]] for ix in range(len(all_songs))])
y_pred_prob = softmax(np.log(y_pred))
spread_pop = -np.dot(y_pred_prob, np.log(y_pred_prob))  # predictions are identical for all playlists, so spread is computed once
sortix = np.argsort(-y_pred)

assert Y_test.shape[1] == len(test_playlists)
for j in range(Y_test.shape[1]):
    if (j+1) % 10 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()
    y_true = Y_test[:, j].A.reshape(-1)
    
    # rp, hr_dict = calc_RPrecision_HitRate(y_true, y_pred, tops=TOPs)
    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_pop.append(rp)
    for top in TOPs:
        hitrates_pop[top].append(hr_dict[top])
    aucs_pop.append(auc)

    # novelty
    u = test_playlists[j][1]
    for top in TOPs:
        nov = np.mean([-np.log2(song2pop[index2song[ix]]) for ix in sortix[:top]])
        try:
            novelties_pop[top][u].append(nov)
        except KeyError:
            novelties_pop[top][u] = [nov]
            
    # PTop: (#pos ranked above the top-ranked negative) / #pos
    assert y_true.dtype == bool
    npos = y_true.sum()
    assert npos > 0
    negIx = ~y_true
    negMax = y_pred[negIx].max()
    pt = (y_pred[y_true] > negMax).sum() / npos
    ptops_pop.append(pt)

    # artist/genre diversity
#     for top in TOPs:
#         artist_vec = np.array([song2artist[index2song[ix]] if index2song[ix] in song2artist
#                                else str(np.random.rand()) for ix in sortix[:top]])
#         genre_vec = np.array([song2genre[index2song[ix]] if index2song[ix] in song2genre \
#                               else str(np.random.rand()) for ix in sortix[:top]])
#         artist_diversities_pop[top].append( diversity(artist_vec) )
#         genre_diversities_pop[top].append( diversity(genre_vec) )
    
print('\n%d / %d' % (len(rps_pop), Y_test.shape[1]))


2190 / 2195
2195 / 2195

In [16]:
# fig = plt.figure(figsize=[20, 5])
# ax1 = plt.subplot(131)
# ax1.hist(rps_pop, bins=100)
# ax1.set_yscale('log')
# ax1.set_title('R-Precision')
# #ax.set_xlim(0, xmax)
# ax2 = plt.subplot(132)
# ax2.hist(aucs_pop, bins=100)
# ax2.set_yscale('log')
# ax2.set_title('AUC')
# pass

In [17]:
pop_perf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_pop), 
                                    'Hit-Rate': {top: np.mean(hitrates_pop[top]) for top in TOPs},
                                    'AUC': np.mean(aucs_pop),
                                    'Spread': spread_pop,
                                    'Novelty': {t: np.mean([np.mean(novelties_pop[t][u]) for u in novelties_pop[t]]) 
                                                for t in TOPs},
                                    'PTop': np.mean(ptops_pop),
                                    #'Artist-Diversity': {top: np.mean(artist_diversities_pop[top]) for top in TOPs},
                                    #'Genre-Diversity': {top: np.mean(genre_diversities_pop[top]) for top in TOPs}},
                                   },
                           'Test_All': {'R-Precision': rps_pop, 
                                        'Hit-Rate': {top: hitrates_pop[top] for top in TOPs},
                                        'AUC': aucs_pop,
                                        'Spread': spread_pop,
                                        'Novelty': novelties_pop,
                                        'PTop': ptops_pop,
                                        # 'Artist-Diversity': artist_diversities_pop,
                                        # 'Genre-Diversity': genre_diversities_pop}}}
                                       }}}
pop_perf[dataset_name]['Test']


Out[17]:
{'R-Precision': 0.021789486717889057,
 'Hit-Rate': {5: 0.010145439453664305,
  10: 0.0190906029791239,
  20: 0.035023385784414565,
  30: 0.04652897408134314,
  50: 0.06902868603298959,
  100: 0.11953569058765297,
  200: 0.1921951014338093,
  300: 0.24737848161229312,
  500: 0.32612280951457073,
  700: 0.3905026295790617,
  1000: 0.462820533654615},
 'AUC': 0.9402710379314311,
 'Spread': 9.800781689204657,
 'Novelty': {5: -8.406420692558143,
  10: -8.2740968572769,
  20: -8.127371393547135,
  30: -8.010500208389617,
  50: -7.856553487793756,
  100: -7.6026583564350805,
  200: -7.283651706691955,
  300: -7.052911271895382,
  500: -6.728155958468345,
  700: -6.4973974242965795,
  1000: -6.231012799438416},
 'PTop': 0.0027028189198567555}

In [18]:
fperf_pop = os.path.join(data_dir, 'perf-pop.pkl')
print(fperf_pop)
pkl.dump(pop_perf, open(fperf_pop, 'wb'))
pkl.load(open(fperf_pop, 'rb'))[dataset_name]['Test']


data/30music/coldstart/setting3/perf-pop.pkl
Out[18]:
{'R-Precision': 0.021789486717889057,
 'Hit-Rate': {5: 0.010145439453664305,
  10: 0.0190906029791239,
  20: 0.035023385784414565,
  30: 0.04652897408134314,
  50: 0.06902868603298959,
  100: 0.11953569058765297,
  200: 0.1921951014338093,
  300: 0.24737848161229312,
  500: 0.32612280951457073,
  700: 0.3905026295790617,
  1000: 0.462820533654615},
 'AUC': 0.9402710379314311,
 'Spread': 9.800781689204657,
 'Novelty': {5: -8.406420692558143,
  10: -8.2740968572769,
  20: -8.127371393547135,
  30: -8.010500208389617,
  50: -7.856553487793756,
  100: -7.6026583564350805,
  200: -7.283651706691955,
  300: -7.052911271895382,
  500: -6.728155958468345,
  700: -6.4973974242965795,
  1000: -6.231012799438416},
 'PTop': 0.0027028189198567555}

Matrix Factorisation

Let $S \in \mathbb{R}^{M \times D}$ and $V \in \mathbb{R}^{U \times D}$ be the latent factors of songs and users, respectively. Let $R \in \mathbb{R}^{M \times U}$ hold the play counts of songs for users, and let $T = \mathbf{1}(R > 0) \in \{0,1\}^{M \times U}$ be the corresponding binary matrix.

The optimisation objective: $ \begin{aligned} J = \sum_{m=1}^M \sum_{u=1}^U q_{m,u} \left( t_{m,u} - \mathbf{s}_m^\top \mathbf{v}_u \right)^2 + C \left( \sum_{m=1}^M \mathbf{s}_m^\top \mathbf{s}_m + \sum_{u=1}^U \mathbf{v}_u^\top \mathbf{v}_u \right) \end{aligned} $ where $\alpha, \epsilon$ are hyper-parameters, and $q_{m,u} = 1 + \alpha \log\left(1 + \epsilon^{-1} r_{m,u}\right)$.

Use the alternating least squares (ALS) optimisation method; one full update sweep is sketched in code after this list:

  1. Fix $S$, then let $ \begin{aligned} \mathbf{0} = \frac{\partial J}{\partial \mathbf{v}_u} = \sum_{m=1}^M 2 q_{m,u} \left( t_{m,u} - \mathbf{s}_m^\top \mathbf{v}_u \right) (-\mathbf{s}_m) + 2 C \mathbf{v}_u \end{aligned} $
    in other words $ \begin{aligned} \sum_{m=1}^M q_{m,u} t_{m,u} \mathbf{s}_m = \sum_{m=1}^M q_{m,u} (\mathbf{s}_m^\top \mathbf{v}_u^*) \mathbf{s}_m + C \mathbf{v}_u^* = \sum_{m=1}^M q_{m,u} \mathbf{s}_m \mathbf{s}_m^\top \mathbf{v}_u^* + C \mathbf{v}_u^* = \left( \sum_{m=1}^M q_{m,u} \mathbf{s}_m \mathbf{s}_m^\top + C \mathbf{I} \right) \mathbf{v}_u^* \end{aligned} $
    where $\mathbf{I} \in \mathbb{R}^{D \times D}$ is the identity matrix.
    So $ \begin{aligned} \mathbf{v}_u^* = \left( \sum_{m=1}^M q_{m,u} \mathbf{s}_m \mathbf{s}_m^\top + C \mathbf{I} \right)^{-1} \sum_{m=1}^M q_{m,u} t_{m,u} \mathbf{s}_m \end{aligned} $
    or equivalently $ \begin{aligned} \mathbf{v}_u^* = \left( (\mathbf{q}_{:,u}[..., \text{np.newaxis}] \times S)^\top S + C \mathbf{I} \right)^{-1} \left( (\mathbf{q}_{:,u} \circ \mathbf{t}_{:,u})^\top S \right)^\top \end{aligned} $
    where np.newaxis is for NumPy broadcasting and $\circ$ is the entrywise product of vectors/matrices.
    Computing all $\mathbf{v}_u^*$ at once would require a 3-dimensional tensor ($U \times M \times D$), which could be impractical due to memory usage, so the code updates one row at a time.
  2. Fix $V$, then let $ \begin{aligned} \mathbf{0} = \frac{\partial J}{\partial \mathbf{s}_m} = \sum_{u=1}^U 2 q_{m,u} \left( t_{m,u} - \mathbf{s}_m^\top \mathbf{v}_u \right) (-\mathbf{v}_u) + 2 C \mathbf{s}_m \end{aligned} $
    By symmetry, we have
    $ \begin{aligned} \mathbf{s}_m^* = \left( (\mathbf{q}_{m,:}[\text{np.newaxis}, ...] \times V)^\top V + C \mathbf{I} \right)^{-1} \left( (\mathbf{q}_{m,:} \circ \mathbf{t}_{m,:})^\top V \right)^\top \end{aligned} $
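
A minimal NumPy sketch of the sweep above, with small made-up dimensions (the PyTorch loop in the cells below performs the same per-row updates on the GPU):

import numpy as np

M, U, D, C, alpha, eps = 50, 20, 8, 1e-5, 40, 1e-8
rng = np.random.default_rng(0)
R = rng.poisson(0.3, size=(M, U)).astype(float)  # toy play-count matrix
T = (R > 0).astype(float)
Q = 1 + alpha * np.log(1 + R / eps)              # confidence weights q_{m,u}
S = rng.random((M, D))
V = rng.random((U, D))
I = np.eye(D)

# fix S, update each v_u: (sum_m q_{m,u} s_m s_m^T + C I) v_u = sum_m q_{m,u} t_{m,u} s_m
for u in range(U):
    A = (Q[:, u, None] * S).T @ S + C * I
    b = S.T @ (Q[:, u] * T[:, u])
    V[u] = np.linalg.solve(A, b)   # solve instead of explicitly inverting

# fix V, update each s_m (by symmetry)
for m in range(M):
    A = (Q[m, :, None] * V).T @ V + C * I
    b = V.T @ (Q[m, :] * T[m, :])
    S[m] = np.linalg.solve(A, b)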

In [15]:
users = sorted({u for _, u in train_playlists})
user2index = {u: uix for uix, u in enumerate(users)}
print(len(users))


8070

In [16]:
M = X.shape[0]
U = len(users)

R = np.zeros((M, U))
for pix, (_, u) in tqdm(enumerate(train_playlists)):
    R[:, user2index[u]] += Y_train[:, pix].A.reshape(-1)
R.shape


15262it [00:12, 1200.27it/s]
Out[16]:
(45468, 8070)

In [17]:
T = (R > 0).astype(np.float32)
T.shape


Out[17]:
(45468, 8070)

In [18]:
assert torch.cuda.is_available()
device = torch.device('cuda:0')

In [19]:
D = 100
C = 1e-5
alpha = 40
eps = 1e-8

torch.manual_seed(0)
S = torch.rand(M, D).to(device)
V = torch.rand(U, D).to(device)

Q = 1 + alpha * (np.log(1 / eps) + np.log(eps + R))  # M by U, dense
QT = np.multiply(Q, T).astype(np.float32)  # M by U
QT_csc = csc_matrix(QT)
QT_csr = csr_matrix(QT)
Q = torch.from_numpy(Q.astype(np.float32)).to(device)
del QT


/home/users/u5708856/apps/miniconda3/lib/python3.6/site-packages/torch/cuda/__init__.py:116: UserWarning: 
    Found GPU1 Quadro K600 which is of cuda capability 3.0.
    PyTorch no longer supports this GPU because it is too old.
    
  warnings.warn(old_gpu_warn % (d, name, major, capability[1]))

In [20]:
n_sweeps = 20

# alternating least squares
for sweep in range(n_sweeps):
    t0 = time.time()
    vdiff2 = 0.
    sdiff2 = 0.
    
    # fix S, optimise V
    S_cpu = S.cpu().numpy()
    for uix in range(U):
        if (uix + 1) % 100 == 0:
            sys.stdout.write('\r%d / %d' % (uix+1, U))
            sys.stdout.flush()
        QSu = torch.mm((Q[:, uix].reshape(M, 1) * S).t(), S)  # D by D
        QSu[range(D), range(D)] = C + QSu.diag()
        QTuS = torch.from_numpy(QT_csc[:, uix].transpose().dot(S_cpu).reshape(D, 1)).to(device)
        v_u = torch.mm(torch.inverse(QSu), QTuS).reshape(-1)
        diff = v_u - V[uix, :]
        vdiff2 += torch.mm(diff.reshape(1, -1), diff.reshape(-1, 1)).cpu().item()
        V[uix, :] = v_u

    print()
    
    # fix V, optimise S
    V_cpu = V.cpu().numpy()
    for m in range(M):
        if (m + 1) % 100 == 0:
            sys.stdout.write('\r%d / %d' % (m+1, M))
            sys.stdout.flush()
        QVm = torch.mm((Q[m, :].reshape(U, 1) * V).t(), V)  # D by D
        QVm[range(D), range(D)] = C + QVm.diag()
        QTmV = torch.from_numpy(QT_csr[m, :].dot(V_cpu).reshape(D, 1)).to(device)
        s_m = torch.mm(torch.inverse(QVm), QTmV).reshape(-1)
        diff = s_m - S[m, :]
        sdiff2 += torch.mm(diff.reshape(1, -1), diff.reshape(-1, 1)).cpu().item()
        S[m, :] = s_m
    print('\nV diff: {:8.6f}, S diff: {:8.6f} in {:.1f} seconds.'
          .format(np.sqrt(vdiff2), np.sqrt(sdiff2), time.time() - t0))


8000 / 8070
45400 / 45468
V diff: 521.322937, S diff: 1006.677539 in 291.3 seconds.
8000 / 8070
45400 / 45468
V diff: 27.668184, S diff: 145.611794 in 240.4 seconds.
8000 / 8070
45400 / 45468
V diff: 15.029595, S diff: 90.212691 in 233.6 seconds.
8000 / 8070
45400 / 45468
V diff: 10.327211, S diff: 67.395220 in 252.1 seconds.
8000 / 8070
45400 / 45468
V diff: 7.904257, S diff: 52.229260 in 242.2 seconds.
8000 / 8070
45400 / 45468
V diff: 6.400798, S diff: 42.176519 in 255.1 seconds.
8000 / 8070
45400 / 45468
V diff: 5.366733, S diff: 35.256144 in 246.3 seconds.
8000 / 8070
45400 / 45468
V diff: 4.610273, S diff: 30.242341 in 224.9 seconds.
8000 / 8070
45400 / 45468
V diff: 4.033070, S diff: 26.439593 in 225.8 seconds.
8000 / 8070
45400 / 45468
V diff: 3.578605, S diff: 23.448129 in 225.9 seconds.
8000 / 8070
45400 / 45468
V diff: 3.211968, S diff: 21.034353 in 224.0 seconds.
8000 / 8070
45400 / 45468
V diff: 2.910349, S diff: 19.047010 in 224.4 seconds.
8000 / 8070
45400 / 45468
V diff: 2.658185, S diff: 17.385883 in 222.6 seconds.
8000 / 8070
45400 / 45468
V diff: 2.444423, S diff: 15.977725 in 224.0 seconds.
8000 / 8070
45400 / 45468
V diff: 2.261050, S diff: 14.770885 in 221.1 seconds.
8000 / 8070
45400 / 45468
V diff: 2.102114, S diff: 13.724324 in 220.2 seconds.
8000 / 8070
45400 / 45468
V diff: 1.963108, S diff: 12.809409 in 224.1 seconds.
8000 / 8070
45400 / 45468
V diff: 1.840540, S diff: 12.001982 in 222.5 seconds.
8000 / 8070
45400 / 45468
V diff: 1.731719, S diff: 11.284453 in 221.1 seconds.
8000 / 8070
45400 / 45468
V diff: 1.634455, S diff: 10.643604 in 219.6 seconds.

Sanity check: compute the optimisation objective.


In [21]:
cost = np.multiply(Q.cpu().numpy(), np.square(T - torch.mm(S, V.t()).cpu().numpy())).sum() \
       + C * (torch.mul(S, S).sum().cpu().item() + torch.mul(V, V).sum().cpu().item())
print('Cost: %g' % cost)


Cost: 4.94277e+06

In [22]:
# del QT
# T = torch.from_numpy(T).to(device)
# cost = torch.mul(Q, (T - torch.mm(S, V.t())) ** 2).sum() + C * (torch.mul(S, S).sum() + torch.mul(V, V).sum())
# print('Cost: %g' % cost.cpu().item())

In [23]:
del T

Performance on test set


In [25]:
rps_mf = []
hitrates_mf = {top: [] for top in TOPs}
aucs_mf = []
spreads_mf = []
novelties_mf = {top: dict() for top in TOPs}
ptops_mf = []
# artist_diversities_mf = {top: [] for top in TOPs}
# genre_diversities_mf = {top: [] for top in TOPs}
np.random.seed(0)

npos = Y_test.sum(axis=0).A.reshape(-1)
for j in range(Y_test.shape[1]):
    if (j+1) % 100 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()

    assert npos[j] > 0
    y_true = Y_test[:, j].A.reshape(-1)
    
    u = test_playlists[j][1]
    uix = user2index[u]
    y_pred = torch.mm(S, V[uix, :].reshape(D, 1)).reshape(-1).cpu().numpy()

    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_mf.append(rp)
    for top in TOPs:
        hitrates_mf[top].append(hr_dict[top])
    # auc = roc_auc_score(y_true, y_pred)
    aucs_mf.append(auc)
    
    # spread
    y_pred_prob = softmax(y_pred)
    spreads_mf.append(-np.dot(y_pred_prob, np.log(y_pred_prob)))

    # novelty
    sortix = np.argsort(-y_pred)
    for top in TOPs:
        nov = np.mean([-np.log2(song2pop[index2song[ix]]) for ix in sortix[:top]])
        try:
            novelties_mf[top][u].append(nov)
        except KeyError:
            novelties_mf[top][u] = [nov]
    
    # PTop: (#pos ranked above the top-ranked negative) / #pos
    assert y_true.dtype == bool
    negIx = ~y_true
    negMax = y_pred[negIx].max()
    pt = (y_pred[y_true] > negMax).sum() / npos[j]
    ptops_mf.append(pt)
    
    # artist/genre diversity
#     for top in TOPs:
#         artist_vec = np.array([song2artist[index2song[ix]] if index2song[ix] in song2artist
#                                else str(np.random.rand()) for ix in sortix[:top]])
#         genre_vec = np.array([song2genre[index2song[ix]] if index2song[ix] in song2genre \
#                               else str(np.random.rand()) for ix in sortix[:top]])
#         artist_diversities_mf[top].append( diversity(artist_vec) )
#         genre_diversities_mf[top].append( diversity(genre_vec) )
    
print('\n%d / %d' % (len(rps_mf), Y_test.shape[1]))


2100 / 2195
2195 / 2195

In [26]:
perf_mf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_mf), 
                                'Hit-Rate': {top: np.mean(hitrates_mf[top]) for top in TOPs},
                                'AUC': np.mean(aucs_mf),
                                'Spread': np.mean(spreads_mf),
                                'Novelty': {t: np.mean([np.mean(novelties_mf[t][u]) for u in novelties_mf[t]]) 
                                            for t in TOPs},
                                'PTop': np.mean(ptops_mf),
                                # 'Artist-Diversity': {top: np.mean(artist_diversities_mf[top]) for top in TOPs},
                                # 'Genre-Diversity': {top: np.mean(genre_diversities_mf[top]) for top in TOPs}},
                                  },
                        'Test_All': {'R-Precision': rps_mf,
                                    'Hit-Rate': {top: hitrates_mf[top] for top in TOPs},
                                    'AUC': aucs_mf,
                                    'Spread': spreads_mf,
                                    'Novelty': novelties_mf,
                                    'PTop': ptops_mf,
                                    # 'Artist-Diversity': artist_diversities_mf,
                                    # 'Genre-Diversity': genre_diversities_mf}}}
                                    }}}
perf_mf[dataset_name]['Test']


Out[26]:
{'R-Precision': 0.049377764803008574,
 'Hit-Rate': {5: 0.02926413021841258,
  10: 0.05145004242622205,
  20: 0.08365600180191513,
  30: 0.10832635049146337,
  50: 0.14684679166554612,
  100: 0.20944700009287984,
  200: 0.2839112515396292,
  300: 0.3305675286049512,
  500: 0.393693238383796,
  700: 0.4332726753350746,
  1000: 0.4786869565897743},
 'AUC': 0.795183426779958,
 'Spread': 10.714413,
 'Novelty': {5: -6.018945667786334,
  10: -5.72709257141705,
  20: -5.536654436754104,
  30: -5.466014811385599,
  50: -5.377907076684237,
  100: -5.239391290556176,
  200: -5.006387650968637,
  300: -4.817936851371754,
  500: -4.521864744646671,
  700: -4.295592416792713,
  1000: -4.03507939188452},
 'PTop': 0.008486400283950701}

In [27]:
fperf_mf = os.path.join(data_dir, 'perf-mf.pkl')
print(fperf_mf)
pkl.dump(perf_mf, open(fperf_mf, 'wb'))
pkl.load(open(fperf_mf, 'rb'))[dataset_name]['Test']


data/30music/coldstart/setting3/perf-mf.pkl
Out[27]:
{'R-Precision': 0.049377764803008574,
 'Hit-Rate': {5: 0.02926413021841258,
  10: 0.05145004242622205,
  20: 0.08365600180191513,
  30: 0.10832635049146337,
  50: 0.14684679166554612,
  100: 0.20944700009287984,
  200: 0.2839112515396292,
  300: 0.3305675286049512,
  500: 0.393693238383796,
  700: 0.4332726753350746,
  1000: 0.4786869565897743},
 'AUC': 0.795183426779958,
 'Spread': 10.714413,
 'Novelty': {5: -6.018945667786334,
  10: -5.72709257141705,
  20: -5.536654436754104,
  30: -5.466014811385599,
  50: -5.377907076684237,
  100: -5.239391290556176,
  200: -5.006387650968637,
  300: -4.817936851371754,
  500: -4.521864744646671,
  700: -4.295592416792713,
  1000: -4.03507939188452},
 'PTop': 0.008486400283950701}