Baselines - playlist generation for known users


In [1]:
%matplotlib inline

import os, sys, time, gzip
import pickle as pkl
import numpy as np
import pandas as pd
from scipy.sparse import lil_matrix, issparse, csc_matrix, csr_matrix
import torch
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# from tools import calc_RPrecision_HitRate
from tools import calc_metrics, diversity, softmax

In [3]:
TOPs = [5, 10, 20, 30, 50, 100, 200, 300, 500, 700, 1000]

In [4]:
datasets = ['aotm2011', '30music']

In [5]:
dix = 1
dataset_name = datasets[dix]
dataset_name


Out[5]:
'30music'

In [6]:
data_dir = 'data/%s/coldstart/setting4' % dataset_name
X = pkl.load(gzip.open(os.path.join(data_dir, 'X.pkl.gz'), 'rb'))
Y_train = pkl.load(gzip.open(os.path.join(data_dir, 'Y_train.pkl.gz'), 'rb'))
Y_test = pkl.load(gzip.open(os.path.join(data_dir, 'Y_test.pkl.gz'), 'rb'))
song2pop_train = pkl.load(gzip.open(os.path.join(data_dir, 'song2pop_train.pkl.gz'), 'rb'))

In [7]:
playlists3 = pkl.load(gzip.open(os.path.join(data_dir, 'playlists_train_test_s4.pkl.gz'), 'rb'))
train_playlists = playlists3['train_playlists']
test_playlists = playlists3['test_playlists']

In [8]:
all_songs = pkl.load(gzip.open(os.path.join(data_dir, 'all_songs.pkl.gz'), 'rb'))
index2song = {ix: sid for ix, (sid, _) in enumerate(all_songs)}

In [9]:
song2index = {sid: ix for ix, (sid, _) in enumerate(all_songs)}

In [10]:
_song2artist = pkl.load(gzip.open('data/msd/song2artist.pkl.gz', 'rb'))
song2artist = {sid: _song2artist[sid] for sid, _ in all_songs if sid in _song2artist}

In [11]:
artist2songs = dict()

for sid in sorted(song2artist):
    artist = song2artist[sid]
    try:
        artist2songs[artist].append(sid)
    except KeyError:
        artist2songs[artist] = [sid]

In [12]:
print('{:,} | {:,}'.format(len(song2artist), len(artist2songs)))


45,468 | 9,981

In [13]:
artist2pop = dict()

for pl, _ in train_playlists:
    for sid in pl:
        if sid in song2artist:
            aid = song2artist[sid]
            try:
                artist2pop[aid] += 1
            except KeyError:
                artist2pop[aid] = 1

In [14]:
print(len(artist2pop))


9981

In [15]:
song2genre = pkl.load(gzip.open('data/msd/song2genre.pkl.gz', 'rb'))

In [16]:
song2pop = pkl.load(gzip.open(os.path.join(data_dir, 'song2pop.pkl.gz'), 'rb'))

Collocated Artists - Greatest Hits (CAGH), Top 10 Artists

Compute the similarity of two artist $a_1$ and $a_2$ given a set of playlist $P$:
$$ \text{sim}(a_1, a_2) = \frac{\sum_{p \in P} \delta(a_1, p) \times \delta(a_2, p)} {\sqrt{\sum_{p \in P} \delta(a_1, p) \times \sum_{p \in P} \delta(a_2, p)}} $$ where $$ \delta(a, p) = \begin{cases} 1, \ \text{at least one song in playlist $p$ is from artist $a$}, \\ 0, \ \text{otherwise}. \end{cases} $$

Recommend according to the popularity of songs, but weighted by similarity of (top 10 artists, artist of song).


In [17]:
all_artist = sorted(set([song2artist[sid] for pl, _ in train_playlists for sid in pl if sid in song2artist]))

In [18]:
artist2index = {aid: ix for ix, aid in enumerate(all_artist)}

In [19]:
Na = len(all_artist)
Np = len(train_playlists)
Delta = lil_matrix((Na, Np), dtype=np.float)
for j in range(Np):
    pl_artist = sorted(set([song2artist[sid] for sid in train_playlists[j][0] if sid in song2artist]))
    ix = [artist2index[aid] for aid in pl_artist]
    Delta[ix, j] = 1

In [20]:
Delta = Delta.tocsr()
Dsum = Delta.sum(axis=1).A.reshape(-1)
ColloMat = Delta.dot(Delta.T).A

assert np.all(np.isclose(ColloMat.diagonal(), Dsum))

In [21]:
print(len(Dsum), len(all_artist))


9981 9981

In [22]:
#type(ColloMat)

In [23]:
T1 = 1. / np.sqrt(Dsum)
NormMat = np.dot(T1.reshape(Na, 1), T1.reshape(1, Na))

WeightMat = np.multiply(ColloMat, NormMat)

In [24]:
rps_cagh = []
hitrates_cagh = {top: [] for top in TOPs}
aucs_cagh = []
novelties_cagh = {top: dict() for top in TOPs}
ptops_cagh = []
# artist_diversities_cagh = {top: [] for top in TOPs}
# genre_diversities_cagh = {top: [] for top in TOPs}
np.random.seed(0)

assert Y_test.shape[1] == len(test_playlists)

sid_legal = [sid for sid, _ in all_songs if sid in song2artist]
aix_legal = [artist2index[song2artist[sid]] for sid in sid_legal]
pop_legal = np.asarray([song2pop_train[sid] for sid in sid_legal])
ix_legal = [song2index[sid] for sid in sid_legal]

top10_artists = sorted(artist2pop, key=lambda aid: artist2pop[aid])[-10:]
top10_artists_ix = [artist2index[aix] for aix in top10_artists]
y_pred = np.zeros(Y_test.shape[0])
y_pred[ix_legal] = np.log(pop_legal) * np.asarray([WeightMat[aix, top10_artists_ix].sum() for aix in aix_legal])

y_pred_prob = softmax(y_pred)
spread_cagh = -np.dot(y_pred_prob, np.log(y_pred_prob))
sortix = np.argsort(-y_pred)

for j in range(Y_test.shape[1]):
    if (j + 1) % 10 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()
    y_true = Y_test[:, j].A.reshape(-1)
    
    # rp, hr_dict = calc_RPrecision_HitRate(y_true, y_pred, tops=TOPs)
    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_cagh.append(rp)
    for top in TOPs:
        hitrates_cagh[top].append(hr_dict[top])
    aucs_cagh.append(auc)
    
    # novelty
    u = test_playlists[j][1]
    for top in TOPs:
        nov = np.mean([-np.log2(song2pop[index2song[ix]]) for ix in sortix[:top]])
        try:
            novelties_cagh[top][u].append(nov)
        except KeyError:
            novelties_cagh[top][u] = [nov]
            
    # PTop: (#pos ranked above the top-ranked negative) / #pos
    assert y_true.dtype == np.bool
    npos = y_true.sum()
    assert npos > 0
    negIx = (1 - y_true).astype(np.bool)
    negMax = y_pred[negIx].max()
    pt = (y_pred[y_true] > negMax).sum() / npos
    ptops_cagh.append(pt)
    
    # artist/genre diversity
#     for top in TOPs:
#         artist_vec = np.array([song2artist[index2song[ix]] if index2song[ix] in song2artist
#                                else str(np.random.rand()) for ix in sortix[:top]])
#         genre_vec = np.array([song2genre[index2song[ix]] if index2song[ix] in song2genre \
#                               else str(np.random.rand()) for ix in sortix[:top]])
#         artist_diversities_cagh[top].append( diversity(artist_vec) )
#         genre_diversities_cagh[top].append( diversity(genre_vec) )

print('\n%d / %d' % (len(rps_cagh), Y_test.shape[1]))


3390 / 3390
3390 / 3390

In [25]:
# fig = plt.figure(figsize=[20, 5])
# ax1 = plt.subplot(131)
# ax1.hist(rps_cagh, bins=100)
# ax1.set_yscale('log')
# ax1.set_title('R-Precision')
# #ax.set_xlim(0, xmax)
# ax2 = plt.subplot(132)
# ax2.hist(aucs_cagh, bins=100)
# ax2.set_yscale('log')
# ax2.set_title('AUC')
# pass

In [26]:
cagh = {dataset_name: {'Test': {'R-Precision': np.mean(rps_cagh), 
                                'Hit-Rate': {top: np.mean(hitrates_cagh[top]) for top in TOPs},
                                'AUC': np.mean(aucs_cagh),
                                'Spread': spread_cagh,
                                'Novelty': {t: np.mean([np.mean(novelties_cagh[t][u]) 
                                                        for u in novelties_cagh[t]]) for t in TOPs},
                                'PTop': np.mean(ptops_cagh),
                                # 'Artist-Diversity': {t: np.mean(artist_diversities_cagh[t]) for t in TOPs},
                                # 'Genre-Diversity': {t: np.mean(genre_diversities_cagh[t]) for t in TOPs}},
                               },
                       'Test_All': {'R-Precision': rps_cagh, 
                                    'Hit-Rate': {top: hitrates_cagh[top] for top in TOPs},
                                    'AUC': aucs_cagh,
                                    'Spread': spread_cagh,
                                    'Novelty': novelties_cagh,
                                    'PTop': ptops_cagh,
                                    # 'Artist-Diversity': artist_diversities_cagh,
                                    # 'Genre-Diversity': genre_diversities_cagh}}}
                                   }}}
cagh[dataset_name]['Test']


Out[26]:
{'R-Precision': 0.01888252139764334,
 'Hit-Rate': {5: 0.00938561519158546,
  10: 0.01572263751842785,
  20: 0.02534186251584444,
  30: 0.03258846833600736,
  50: 0.04900519217497452,
  100: 0.07072537518556539,
  200: 0.09193169553829512,
  300: 0.12787903588292893,
  500: 0.1771391444749642,
  700: 0.22561080673198688,
  1000: 0.28822054570313776},
 'AUC': 0.8630670319003828,
 'Spread': 4.1655933352494365,
 'Novelty': {5: -8.185462101695036,
  10: -7.965696610547907,
  20: -7.6989764025668554,
  30: -7.5403224343729685,
  50: -7.340238954177793,
  100: -6.750895588251327,
  200: -5.962041526922837,
  300: -5.744536110879032,
  500: -5.447525515755583,
  700: -5.326314293460358,
  1000: -5.185188858182191},
 'PTop': 0.003145728156957101}

In [27]:
fperf_cagh = os.path.join(data_dir, 'perf-cagh.pkl')
print(fperf_cagh)
pkl.dump(cagh, open(fperf_cagh, 'wb'))
pkl.load(open(fperf_cagh, 'rb'))[dataset_name]['Test']


data/30music/coldstart/setting4/perf-cagh.pkl
Out[27]:
{'R-Precision': 0.01888252139764334,
 'Hit-Rate': {5: 0.00938561519158546,
  10: 0.01572263751842785,
  20: 0.02534186251584444,
  30: 0.03258846833600736,
  50: 0.04900519217497452,
  100: 0.07072537518556539,
  200: 0.09193169553829512,
  300: 0.12787903588292893,
  500: 0.1771391444749642,
  700: 0.22561080673198688,
  1000: 0.28822054570313776},
 'AUC': 0.8630670319003828,
 'Spread': 4.1655933352494365,
 'Novelty': {5: -8.185462101695036,
  10: -7.965696610547907,
  20: -7.6989764025668554,
  30: -7.5403224343729685,
  50: -7.340238954177793,
  100: -6.750895588251327,
  200: -5.962041526922837,
  300: -5.744536110879032,
  500: -5.447525515755583,
  700: -5.326314293460358,
  1000: -5.185188858182191},
 'PTop': 0.003145728156957101}

Same Artists - Greatest Hits (SAGH), Top 10 Artists

Recommending according to the popularity of songs of the top 10 most popular artists in data.


In [17]:
rps_sagh = []
hitrates_sagh = {top: [] for top in TOPs}
aucs_sagh = []
novelties_sagh = {top: dict() for top in TOPs}
ptops_sagh = []
# artist_diversities_sagh = {top: [] for top in TOPs}
# genre_diversities_sagh = {top: [] for top in TOPs}
np.random.seed(0)

top10_artists = sorted(artist2pop, key=lambda aid: artist2pop[aid])[-10:]
candidates = []
for aix in top10_artists:
    candidates += artist2songs[aix]
candidates = sorted(set(candidates))

assert len(candidates) > 0
y_pred = np.zeros(Y_test.shape[0])
for sid in candidates:
    ix = song2index[sid]
    y_pred[ix] = np.log(song2pop_train[sid])

y_pred_prob = softmax(y_pred)
spread_sagh = -np.dot(y_pred_prob, np.log(y_pred_prob))
sortix = np.argsort(-y_pred)
    
assert Y_test.shape[1] == len(test_playlists)
for j in range(Y_test.shape[1]):
    if (j+1) % 100 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()
    y_true = Y_test[:, j].A.reshape(-1)
    
    # rp, hr_dict = calc_RPrecision_HitRate(y_true, y_pred, tops=TOPs)
    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_sagh.append(rp)
    for top in TOPs:
        hitrates_sagh[top].append(hr_dict[top])
    aucs_sagh.append(auc)
    
    # novelty
    u = test_playlists[j][1]
    for top in TOPs:
        nov = np.mean([-np.log2(song2pop[index2song[ix]]) for ix in sortix[:top]])
        try:
            novelties_sagh[top][u].append(nov)
        except KeyError:
            novelties_sagh[top][u] = [nov]
            
    # PTop: (#pos ranked above the top-ranked negative) / #pos
    assert y_true.dtype == np.bool
    npos = y_true.sum()
    assert npos > 0
    negIx = (1 - y_true).astype(np.bool)
    negMax = y_pred[negIx].max()
    pt = (y_pred[y_true] > negMax).sum() / npos
    ptops_sagh.append(pt)
    
    # artist/genre diversity
#     for top in TOPs:
#         artist_vec = np.array([song2artist[index2song[ix]] if index2song[ix] in song2artist
#                                else str(np.random.rand()) for ix in sortix[:top]])
#         genre_vec = np.array([song2genre[index2song[ix]] if index2song[ix] in song2genre \
#                               else str(np.random.rand()) for ix in sortix[:top]])
#         artist_diversities_sagh[top].append( diversity(artist_vec) )
#         genre_diversities_sagh[top].append( diversity(genre_vec) )
    
print('\n%d / %d' % (len(rps_sagh), Y_test.shape[1]))


3300 / 3390
3390 / 3390

In [18]:
# fig = plt.figure(figsize=[20, 5])
# ax1 = plt.subplot(131)
# ax1.hist(rps_sagh, bins=100)
# ax1.set_yscale('log')
# ax1.set_title('R-Precision')
# #ax.set_xlim(0, xmax)
# ax2 = plt.subplot(132)
# ax2.hist(aucs_sagh, bins=100)
# ax2.set_yscale('log')
# ax2.set_title('AUC')
# pass

In [19]:
sagh = {dataset_name: {'Test': {'R-Precision': np.mean(rps_sagh), 
                                'Hit-Rate': {top: np.mean(hitrates_sagh[top]) for top in TOPs},
                                'AUC': np.mean(aucs_sagh),
                                'Spread': spread_sagh,
                                'Novelty': {t: np.mean([np.mean(novelties_sagh[t][u]) 
                                                        for u in novelties_sagh[t]]) for t in TOPs},
                                'PTop': np.mean(ptops_sagh),
                                # 'Artist-Diversity': {t: np.mean(artist_diversities_sagh[t]) for t in TOPs},
                                # 'Genre-Diversity': {t: np.mean(genre_diversities_sagh[t]) for t in TOPs}},
                               },
                       'Test_All': {'R-Precision': rps_sagh, 
                                    'Hit-Rate': {top: hitrates_sagh[top] for top in TOPs},
                                    'AUC': aucs_sagh,
                                    'Spread': spread_sagh,
                                    'Novelty': novelties_sagh,
                                    'PTop': ptops_sagh,
                                    # 'Artist-Diversity': artist_diversities_sagh,
                                    # 'Genre-Diversity': genre_diversities_sagh}}}
                                   }}}
sagh[dataset_name]['Test']


Out[19]:
{'R-Precision': 0.019539033099329386,
 'Hit-Rate': {5: 0.009864948733442841,
  10: 0.01668802746277457,
  20: 0.02976315121789779,
  30: 0.03921742056223373,
  50: 0.05305102710593436,
  100: 0.07251453629786718,
  200: 0.0884805814287678,
  300: 0.09574546708482436,
  500: 0.09983024902262343,
  700: 0.10270036029225334,
  1000: 0.10759298806789613},
 'AUC': 0.5447781428478882,
 'Spread': 9.989773075426719,
 'Novelty': {5: -8.256290364105753,
  10: -8.114134680205186,
  20: -7.913046125788259,
  30: -7.7371587493803995,
  50: -7.461509911161343,
  100: -6.818909383162228,
  200: -5.9392965490504945,
  300: -5.311220097723328,
  500: -4.117784135346873,
  700: -3.2903991301560995,
  1000: -2.6907111730674433},
 'PTop': 0.0026354035723412904}

In [20]:
fperf_sagh = os.path.join(data_dir, 'perf-sagh.pkl')
print(fperf_sagh)
pkl.dump(sagh, open(fperf_sagh, 'wb'))
pkl.load(open(fperf_sagh, 'rb'))[dataset_name]['Test']


data/30music/coldstart/setting4/perf-sagh.pkl
Out[20]:
{'R-Precision': 0.019539033099329386,
 'Hit-Rate': {5: 0.009864948733442841,
  10: 0.01668802746277457,
  20: 0.02976315121789779,
  30: 0.03921742056223373,
  50: 0.05305102710593436,
  100: 0.07251453629786718,
  200: 0.0884805814287678,
  300: 0.09574546708482436,
  500: 0.09983024902262343,
  700: 0.10270036029225334,
  1000: 0.10759298806789613},
 'AUC': 0.5447781428478882,
 'Spread': 9.989773075426719,
 'Novelty': {5: -8.256290364105753,
  10: -8.114134680205186,
  20: -7.913046125788259,
  30: -7.7371587493803995,
  50: -7.461509911161343,
  100: -6.818909383162228,
  200: -5.9392965490504945,
  300: -5.311220097723328,
  500: -4.117784135346873,
  700: -3.2903991301560995,
  1000: -2.6907111730674433},
 'PTop': 0.0026354035723412904}

Popularity based recommendation


In [17]:
rps_pop = []
hitrates_pop = {top: [] for top in TOPs}
aucs_pop = []
novelties_pop = {top: dict() for top in TOPs}
ptops_pop = []
# artist_diversities_pop = {top: [] for top in TOPs}
# genre_diversities_pop = {top: [] for top in TOPs}
np.random.seed(0)

y_pred = np.array([song2pop_train[index2song[ix]] for ix in range(len(all_songs))])
y_pred_prob = softmax(np.log(y_pred))
spread_pop = -np.dot(y_pred_prob, np.log(y_pred_prob))
sortix = np.argsort(-y_pred)

assert Y_test.shape[1] == len(test_playlists)
for j in range(Y_test.shape[1]):
    if (j+1) % 100 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()
    y_true = Y_test[:, j].A.reshape(-1)
    
    # rp, hr_dict = calc_RPrecision_HitRate(y_true, y_pred, tops=TOPs)
    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_pop.append(rp)
    for top in TOPs:
        hitrates_pop[top].append(hr_dict[top])
    aucs_pop.append(auc)
    
    # novelty
    u = test_playlists[j][1]
    for top in TOPs:
        nov = np.mean([-np.log2(song2pop[index2song[ix]]) for ix in sortix[:top]])
        try:
            novelties_pop[top][u].append(nov)
        except KeyError:
            novelties_pop[top][u] = [nov]
            
    # PTop: (#pos ranked above the top-ranked negative) / #pos
    assert y_true.dtype == np.bool
    npos = y_true.sum()
    assert npos > 0
    negIx = (1 - y_true).astype(np.bool)
    negMax = y_pred[negIx].max()
    pt = (y_pred[y_true] > negMax).sum() / npos
    ptops_pop.append(pt)

    # artist/genre diversity
#     for top in TOPs:
#         artist_vec = np.array([song2artist[index2song[ix]] if index2song[ix] in song2artist
#                                else str(np.random.rand()) for ix in sortix[:top]])
#         genre_vec = np.array([song2genre[index2song[ix]] if index2song[ix] in song2genre \
#                               else str(np.random.rand()) for ix in sortix[:top]])
#         artist_diversities_pop[top].append( diversity(artist_vec) )
#         genre_diversities_pop[top].append( diversity(genre_vec) )
    
print('\n%d / %d' % (len(rps_pop), Y_test.shape[1]))


3300 / 3390
3390 / 3390

In [18]:
# fig = plt.figure(figsize=[20, 5])
# ax1 = plt.subplot(131)
# ax1.hist(rps_pop, bins=100)
# ax1.set_yscale('log')
# ax1.set_title('R-Precision')
# #ax.set_xlim(0, xmax)
# ax2 = plt.subplot(132)
# ax2.hist(aucs_pop, bins=100)
# ax2.set_yscale('log')
# ax2.set_title('AUC')
# pass

In [19]:
pop_perf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_pop), 
                                    'Hit-Rate': {top: np.mean(hitrates_pop[top]) for top in TOPs},
                                    'AUC': np.mean(aucs_pop),
                                    'Spread': spread_pop,
                                    'Novelty': {t: np.mean([np.mean(novelties_pop[t][u]) for u in novelties_pop[t]]) 
                                                for t in TOPs},
                                    'PTop': np.mean(ptops_pop),
                                    #'Artist-Diversity': {top: np.mean(artist_diversities_pop[top]) for top in TOPs},
                                    #'Genre-Diversity': {top: np.mean(genre_diversities_pop[top]) for top in TOPs}},
                                   },
                           'Test_All': {'R-Precision': rps_pop, 
                                        'Hit-Rate': {top: hitrates_pop[top] for top in TOPs},
                                        'AUC': aucs_pop,
                                        'Spread': spread_pop,
                                        'Novelty': novelties_pop,
                                        'PTop': ptops_pop,
                                        # 'Artist-Diversity': artist_diversities_pop,
                                        # 'Genre-Diversity': genre_diversities_pop}}}
                                       }}}
pop_perf[dataset_name]['Test']


Out[19]:
{'R-Precision': 0.021428444265297445,
 'Hit-Rate': {5: 0.009421412623139338,
  10: 0.017693135842819865,
  20: 0.030350363239373485,
  30: 0.04236770309796627,
  50: 0.064210044410304,
  100: 0.1100898554000556,
  200: 0.16657225455059269,
  300: 0.21514997232353228,
  500: 0.2845075060090967,
  700: 0.33773431633772094,
  1000: 0.4002239301097651},
 'AUC': 0.8834105138312494,
 'Spread': 9.817103399959294,
 'Novelty': {5: -8.407434890467627,
  10: -8.259177501291077,
  20: -8.10651878102958,
  30: -8.005456564280452,
  50: -7.859830806890705,
  100: -7.602573864055904,
  200: -7.2744726606667305,
  300: -7.047383583766598,
  500: -6.721057334888038,
  700: -6.492052658348255,
  1000: -6.223838708538693},
 'PTop': 0.0023206785804504356}

In [20]:
fperf_pop = os.path.join(data_dir, 'perf-pop.pkl')
print(fperf_pop)
pkl.dump(pop_perf, open(fperf_pop, 'wb'))
pkl.load(open(fperf_pop, 'rb'))[dataset_name]['Test']


data/30music/coldstart/setting4/perf-pop.pkl
Out[20]:
{'R-Precision': 0.021428444265297445,
 'Hit-Rate': {5: 0.009421412623139338,
  10: 0.017693135842819865,
  20: 0.030350363239373485,
  30: 0.04236770309796627,
  50: 0.064210044410304,
  100: 0.1100898554000556,
  200: 0.16657225455059269,
  300: 0.21514997232353228,
  500: 0.2845075060090967,
  700: 0.33773431633772094,
  1000: 0.4002239301097651},
 'AUC': 0.8834105138312494,
 'Spread': 9.817103399959294,
 'Novelty': {5: -8.407434890467627,
  10: -8.259177501291077,
  20: -8.10651878102958,
  30: -8.005456564280452,
  50: -7.859830806890705,
  100: -7.602573864055904,
  200: -7.2744726606667305,
  300: -7.047383583766598,
  500: -6.721057334888038,
  700: -6.492052658348255,
  1000: -6.223838708538693},
 'PTop': 0.0023206785804504356}

Matrix Factorisation

Let $S \in \mathbb{R}^{M \times D}, V \in \mathbb{R}^{U \times D}$ be the latent factors of songs and users (in training set), respectively. Let $R \in \mathbb{R}^{M \times U}$ be the play-count of songs for users (in training set), and $T = \mathbf{1}(R > 0) \in \{0,1\}^{M \times U}$ be a binary matrix.

The optimisation objective: $ \begin{aligned} J = \sum{m=1}^M \sum{u=1}^U q{m, u} \left( t{m,u} - \mathbf{s}_m^\top \mathbf{v}_u \right)^2

+ C \left( \sum_{m=1}^M \mathbf{s}_m^\top \mathbf{s}_m + \sum_{u=1}^U \mathbf{v}_n^\top \mathbf{v}_n \right)

\end{aligned} $ where $\alpha, \epsilon$ are hyper-parameters, and $q{m, u} = 1 + \alpha \log(1 + \epsilon^{-1} r{m,u})$.

Use alternating least squares optimisation method:

  1. Fix $S$, then let $ \begin{aligned} \mathbf{0} = \frac{\partial J}{\partial \mathbf{v}_u} = \sum_{m=1}^M 2 q_{m,u} \left( t_{m,u} - \mathbf{s}_m^\top \mathbf{v}_u \right) (-\mathbf{s}_m) + 2 C \mathbf{v}_u \end{aligned} $
    in other words $ \begin{aligned} \sum_{m=1}^M q_{m,u} t_{m,u} \mathbf{s}_m = \sum_{m=1}^M q_{m,u} (\mathbf{s}_m^\top \mathbf{v}_u^*) \mathbf{s}_m + C \mathbf{v}_u^* = \sum_{m=1}^M q_{m,u} \mathbf{s}_m \mathbf{s}_m^\top \mathbf{v}_u^* + C \mathbf{v}_u^* = \left( \sum_{m=1}^M q_{m,u} \mathbf{s}_m \mathbf{s}_m^\top + C \mathbf{I} \right) \mathbf{v}_u^* \end{aligned} $
    where $\mathbf{I} \in \mathbb{R}^{D \times D}$ is the identity matrix.
    So $ \begin{aligned} \mathbf{v}_u^* = \left( \sum_{m=1}^M q_{m,u} \mathbf{s}_m \mathbf{s}_m^\top + C \mathbf{I} \right)^{-1} \sum_{m=1}^M q_{m,u} t_{m,u} \mathbf{s}_m \end{aligned} $
    or equivalently $ \begin{aligned} \mathbf{v}_u^* = \left( (\mathbf{q}_{:,u}[..., \text{np.newaxis}] \times S)^\top S + C \mathbf{I} \right)^{-1} \left( (\mathbf{q}_{:u} \circ \mathbf{t}_{:,u})^\top S \right)^\top \end{aligned} $
    where np.newaxis is for numpy broadcasting and $\circ$ is entrywise product of vector/matrix.
    It seems we have to use 3-dimension tensor (U by M by D) if we want to compute all values at once, this could be impractical due to memory usage.
  1. Fix $V$, then let $ \begin{aligned} \mathbf{0} = \frac{\partial J}{\partial \mathbf{s}_m} = \sum_{u=1}^U 2 q_{m,u} \left( t_{m,u} - \mathbf{s}_m^\top \mathbf{v}_u \right) (-\mathbf{v}_u) + 2 C \mathbf{s}_m \end{aligned} $
    by symmetry, we have
    $ \begin{aligned} \mathbf{s}_m^* = \left( (\mathbf{q}_{m:}[\text{np.newaxis}, ...] \times V)^\top V + C \mathbf{I} \right)^{-1} \left( (\mathbf{q}_{m:} \circ \mathbf{t}_{m,:})^\top V \right)^\top \end{aligned} $

In [17]:
assert dataset_name == '30music'

In [18]:
train_users = sorted({u for _, u in train_playlists})
train_user2index = {u: uix for uix, u in enumerate(train_users)}
print(len(train_users))


5649

In [19]:
M = X.shape[0]
U = len(train_users)

R = np.zeros((M, U))
for pix, (_, u) in tqdm(enumerate(train_playlists)):
    R[:, train_user2index[u]] += Y_train[:, pix].A.reshape(-1)
R.shape


14067it [00:10, 1337.61it/s]
Out[19]:
(45468, 5649)

In [20]:
T = (R > 0).astype(np.float32)
T.shape


Out[20]:
(45468, 5649)

In [21]:
assert torch.cuda.is_available()
device = torch.device('cuda:0')

In [22]:
D = 100
C = 1e-5
alpha = 40
eps = 1e-8

torch.manual_seed(0)
S = torch.rand(M, D).to(device)
V = torch.rand(U, D).to(device)

Q = 1 + alpha * (np.log(1 / eps) + np.log(eps + R))  # M by U, dense
QT = np.multiply(Q, T).astype(np.float32)  # M by U
QT_csc = csc_matrix(QT)
QT_csr = csr_matrix(QT)
Q = torch.from_numpy(Q.astype(np.float32)).to(device)
del QT


/home/users/u5708856/apps/miniconda3/lib/python3.6/site-packages/torch/cuda/__init__.py:116: UserWarning: 
    Found GPU1 Quadro K600 which is of cuda capability 3.0.
    PyTorch no longer supports this GPU because it is too old.
    
  warnings.warn(old_gpu_warn % (d, name, major, capability[1]))

In [23]:
n_sweeps = 20

# alternating least squares
for sweep in range(n_sweeps):
    t0 = time.time()
    vdiff2 = 0.
    sdiff2 = 0.
    
    # fix S, optimise V
    S_cpu = S.cpu().numpy()
    for uix in range(U):
        if (uix + 1) % 100 == 0:
            sys.stdout.write('\r%d / %d' % (uix+1, U))
            sys.stdout.flush()
        QSu = torch.mm((Q[:, uix].reshape(M, 1) * S).t(), S)  # D by D
        QSu[range(D), range(D)] = C + QSu.diag()
        QTuS = torch.from_numpy(QT_csc[:, uix].transpose().dot(S_cpu).reshape(D, 1)).to(device)
        v_u = torch.mm(torch.inverse(QSu), QTuS).reshape(-1)
        diff = v_u - V[uix, :]
        vdiff2 += torch.mm(diff.reshape(1, -1), diff.reshape(-1, 1)).cpu().item()
        V[uix, :] = v_u

    print()
    
    # fix V, optimise S
    V_cpu = V.cpu().numpy()
    for m in range(M):
        if (m + 1) % 100 == 0:
            sys.stdout.write('\r%d / %d' % (m+1, M))
            sys.stdout.flush()
        QVm = torch.mm((Q[m, :].reshape(U, 1) * V).t(), V)  # D by D
        QVm[range(D), range(D)] = C + QVm.diag()
        QTmV = torch.from_numpy(QT_csr[m, :].dot(V_cpu).reshape(D, 1)).to(device)
        s_m = torch.mm(torch.inverse(QVm), QTmV).reshape(-1)
        diff = s_m - S[m, :]
        sdiff2 += torch.mm(diff.reshape(1, -1), diff.reshape(-1, 1)).cpu().item()
        S[m, :] = s_m
    print('\nV diff: {:8.6f}, S diff: {:8.6f} in {:.1f} seconds.'
          .format(np.sqrt(vdiff2), np.sqrt(sdiff2), time.time() - t0))


5600 / 5649
45400 / 45468
V diff: 436.278195, S diff: 1024.695076 in 205.5 seconds.
5600 / 5649
45400 / 45468
V diff: 23.487190, S diff: 145.043530 in 203.1 seconds.
5600 / 5649
45400 / 45468
V diff: 13.216787, S diff: 91.205938 in 203.1 seconds.
5600 / 5649
45400 / 45468
V diff: 9.241411, S diff: 68.507168 in 203.7 seconds.
5600 / 5649
45400 / 45468
V diff: 7.128352, S diff: 52.597159 in 204.5 seconds.
5600 / 5649
45400 / 45468
V diff: 5.802107, S diff: 42.214396 in 203.1 seconds.
5600 / 5649
45400 / 45468
V diff: 4.884201, S diff: 35.203325 in 202.9 seconds.
5600 / 5649
45400 / 45468
V diff: 4.209020, S diff: 30.180051 in 206.9 seconds.
5600 / 5649
45400 / 45468
V diff: 3.691421, S diff: 26.396446 in 204.7 seconds.
5600 / 5649
45400 / 45468
V diff: 3.282319, S diff: 23.429591 in 203.5 seconds.
5600 / 5649
45400 / 45468
V diff: 2.951144, S diff: 21.033568 in 204.0 seconds.
5600 / 5649
45400 / 45468
V diff: 2.677741, S diff: 19.054410 in 207.8 seconds.
5600 / 5649
45400 / 45468
V diff: 2.448329, S diff: 17.389146 in 215.6 seconds.
5600 / 5649
45400 / 45468
V diff: 2.253175, S diff: 15.970046 in 229.2 seconds.
5600 / 5649
45400 / 45468
V diff: 2.085247, S diff: 14.746630 in 229.7 seconds.
5600 / 5649
45400 / 45468
V diff: 1.939347, S diff: 13.682306 in 230.5 seconds.
5600 / 5649
45400 / 45468
V diff: 1.811542, S diff: 12.749323 in 229.2 seconds.
5600 / 5649
45400 / 45468
V diff: 1.698779, S diff: 11.925980 in 223.1 seconds.
5600 / 5649
45400 / 45468
V diff: 1.598667, S diff: 11.195439 in 211.4 seconds.
5600 / 5649
45400 / 45468
V diff: 1.509277, S diff: 10.543625 in 289.5 seconds.

Sanity check: compute optimisation objective.


In [24]:
cost = np.multiply(Q.cpu().numpy(), np.square(T - torch.mm(S, V.t()).cpu().numpy())).sum() \
       + C * (torch.mul(S, S).sum().cpu().item() + torch.mul(V, V).sum().cpu().item())
del T
print('Cost: %g' % cost)


Cost: 3.76196e+06

User attributes in training set


In [25]:
all_users = pd.read_csv('data/30music/users.csv', index_col='ID', sep=';')
all_users.head()


Out[25]:
Timestamp #Playlists Age Country Gender Playcount Subscribertype Username
ID
1 1116715959 2 24.0 US F 221012 base 000123
2 1163123792 9 39.0 CZ M 217535 base 000333
3 1184426573 2 NaN NaN F 49733 base 00elen
4 1123157597 2 32.0 DE M 168054 base 00Eraser00
5 1171302116 2 23.0 UK M 45700 base 00fieldsy

In [26]:
train_users_df = all_users.loc[train_users]
train_users_df.shape


Out[26]:
(5649, 8)

Countries


In [27]:
country_set = set(train_users_df['Country'])
# country_set

In [28]:
country_set.remove(np.nan)

In [29]:
countries = sorted(country_set)
country2index = {c: ix for ix, c in enumerate(countries)}

Gender


In [30]:
gender_set = set(train_users_df['Gender'])
# gender_set

In [31]:
gender_set.remove(np.nan)

In [32]:
genders = sorted(gender_set)
gender2index = {g: ix for ix, g in enumerate(genders)}

Subscriber type


In [33]:
subscribe_set = set(train_users_df['Subscribertype'])
# subscribe_set

In [34]:
subscribes = sorted(subscribe_set)
subscribe2index = {s: ix for ix, s in enumerate(subscribes)}

Features: Age, Country, Gender, Subscribertype where we

  • normalise Age
  • use one-hot encoding of Country, Gender and Subscribertype
  • use 0 imputation

In [35]:
# `#Playlists`
#num_playlists = np.log10(train_users_df['#Playlists'])
#num_playlists_mean = np.mean(num_playlists)
#num_playlists_std = np.std(num_playlists)
#num_playlists = (num_playlists - num_playlists_mean) / num_playlists_std

# Age
ages = np.array([0 if np.isnan(x) or x < 1 else x for x in train_users_df['Age']])
ages_mean = np.mean(ages)
ages_std = np.std(ages)
ages = (ages - ages_mean) / ages_std

# Country
country_mat = np.zeros((len(train_users), len(countries)))
for uix, u in enumerate(train_users):
    c = train_users_df.loc[u, 'Country']
    if c in country2index:
        cix = country2index[c]
        country_mat[uix, cix] = 1

# Gender
gender_mat = np.zeros((len(train_users), len(genders)))
for uix, u in enumerate(train_users):
    g = train_users_df.loc[u, 'Gender']
    if g in gender2index:
        gix = gender2index[g]
        gender_mat[uix, gix] = 1
    
# Playcount
#playcounts = np.array([0 if np.isnan(x) or x < 1 else np.log10(x) for x in train_users_df['Playcount']])
#playcounts_mean = np.mean(playcounts)
#playcounts_std = np.std(playcounts)
#playcounts = (playcounts - playcounts_mean) / playcounts_std

# Subscribertype
subscribe_mat = np.zeros((len(train_users), len(subscribes)))
for uix, u in enumerate(train_users):
    s = train_users_df.loc[u, 'Subscribertype']
    six = subscribe2index[s]
    subscribe_mat[uix, six] = 1

In [36]:
user_features = np.hstack([ages.reshape(-1, 1), country_mat, gender_mat, subscribe_mat])
user_features.shape


Out[36]:
(5649, 129)

User attributes in test set


In [37]:
test_users = {u for _, u in test_playlists}
test_users = sorted(test_users - (test_users - set(all_users.index)))
test_user2index = {u: uix for uix, u in enumerate(test_users)}

In [38]:
# set(test_users) - set(all_users.index)

In [39]:
test_users_df = all_users.loc[test_users]
test_users_df.shape


Out[39]:
(2420, 8)

In [40]:
# `#Playlists`
#num_playlists_test = np.log10(test_users_df['#Playlists'])
#num_playlists_test = (num_playlists_test - num_playlists_mean) / num_playlists_std

# Age
ages_test = np.array([0 if np.isnan(x) or x < 1 else x for x in test_users_df['Age']])
ages_test = (ages_test - ages_mean) / ages_std

# Country
country_mat_test = np.zeros((len(test_users), len(countries)))
for uix, u in enumerate(test_users):
    c = test_users_df.loc[u, 'Country']
    if c in country2index:
        cix = country2index[c]
        country_mat_test[uix, cix] = 1

# Gender
gender_mat_test = np.zeros((len(test_users), len(genders)))
for uix, u in enumerate(test_users):
    g = test_users_df.loc[u, 'Gender']
    if g in gender2index:
        gix = gender2index[g]
        gender_mat_test[uix, gix] = 1
    
# Playcount
#playcounts_test = np.array([0 if np.isnan(x) or x < 1 else np.log10(x) for x in test_users_df['Playcount']])
#playcounts_test = (playcounts_test - playcounts_mean) / playcounts_std

# Subscribertype
subscribe_mat_test = np.zeros((len(test_users), len(subscribes)))
for uix, u in enumerate(test_users):
    s = test_users_df.loc[u, 'Subscribertype']
    six = subscribe2index[s]
    subscribe_mat_test[uix, six] = 1

In [41]:
user_features_test = np.hstack([ages_test.reshape(-1, 1), country_mat_test, gender_mat_test, subscribe_mat_test])
user_features_test.shape


Out[41]:
(2420, 129)

Cosine similarity between test user features and train user features


In [42]:
dot_prod = np.dot(user_features_test, user_features.T)
dot_prod.shape


Out[42]:
(2420, 5649)

In [43]:
norm_train = np.multiply(user_features, user_features).sum(axis=1).reshape(-1, 1)
norm_test = np.multiply(user_features_test, user_features_test).sum(axis=1).reshape(-1, 1)
norm_mat = np.dot(norm_test, norm_train.T)
norm_mat.shape


Out[43]:
(2420, 5649)

In [44]:
print(np.sum(norm_mat.ravel() > 0), '=?', np.prod(norm_mat.shape))


13670580 =? 13670580

In [45]:
cosine_similarities = np.divide(dot_prod, norm_mat)

In [47]:
pkl.dump([train_user2index, test_user2index, cosine_similarities], gzip.open('%s/user_sim.pkl.gz' % data_dir, 'wb'))

Performance on test set

Use the average factor of 100 nearest neighbours for a test user


In [67]:
k = 200
rps_mf = []
hitrates_mf = {top: [] for top in TOPs}
aucs_mf = []
spreads_mf = []
novelties_mf = {top: dict() for top in TOPs}
ptops_mf = []
# artist_diversities_mf = {top: [] for top in TOPs}
# genre_diversities_mf = {top: [] for top in TOPs}
np.random.seed(0)
npos = Y_test.sum(axis=0).A.reshape(-1)
for j in range(Y_test.shape[1]):
    if (j+1) % 100 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, Y_test.shape[1]))
        sys.stdout.flush()

    assert npos[j] > 0
    y_true = Y_test[:, j].A.reshape(-1)
    
    u = test_playlists[j][1]
    if u not in test_users:
        continue
    
    # average factor of 10 nearest neighbours
    uix = test_user2index[u]
    factor_ix = np.argpartition(cosine_similarities[uix, :].reshape(-1), -k)[-k:]
    y_pred = torch.mm(S, V[factor_ix, :].mean(dim=0).reshape(D, 1)).reshape(-1).cpu().numpy()

    rp, hr_dict, auc = calc_metrics(y_true, y_pred, tops=TOPs)
    rps_mf.append(rp)
    for top in TOPs:
        hitrates_mf[top].append(hr_dict[top])
    # auc = roc_auc_score(y_true, y_pred)
    aucs_mf.append(auc)
    
    # spread
    y_pred_prob = softmax(y_pred)
    spreads_mf.append(-np.dot(y_pred_prob, np.log(y_pred_prob)))

    # novelty
    sortix = np.argsort(-y_pred)
    for top in TOPs:
        nov = np.mean([-np.log2(song2pop[index2song[ix]]) for ix in sortix[:top]])
        try:
            novelties_mf[top][u].append(nov)
        except KeyError:
            novelties_mf[top][u] = [nov]
    
    # PTop: (#pos ranked above the top-ranked negative) / #pos
    assert y_true.dtype == np.bool
    negIx = (1 - y_true).astype(np.bool)
    negMax = y_pred[negIx].max()
    pt = (y_pred[y_true] > negMax).sum() / npos[j]
    ptops_mf.append(pt)
    
    # artist/genre diversity
#     for top in TOPs:
#         artist_vec = np.array([song2artist[index2song[ix]] if index2song[ix] in song2artist
#                                else str(np.random.rand()) for ix in sortix[:top]])
#         genre_vec = np.array([song2genre[index2song[ix]] if index2song[ix] in song2genre \
#                               else str(np.random.rand()) for ix in sortix[:top]])
#         artist_diversities_mf[top].append( diversity(artist_vec) )
#         genre_diversities_mf[top].append( diversity(genre_vec) )
    
print('\n%d / %d' % (len(rps_mf), Y_test.shape[1]))


3300 / 3390
3389 / 3390

In [68]:
# print('AUC:', np.mean(aucs_mf))
# print({top: np.mean(hitrates_mf[top]) for top in TOPs})

In [69]:
perf_mf = {dataset_name: {'Test': {'R-Precision': np.mean(rps_mf),
                                'Hit-Rate': {top: np.mean(hitrates_mf[top]) for top in TOPs},
                                'AUC': np.mean(aucs_mf),
                                'Spread': np.mean(spreads_mf),
                                'Novelty': {t: np.mean([np.mean(novelties_mf[t][u]) for u in novelties_mf[t]]) 
                                            for t in TOPs},
                                'PTop': np.mean(ptops_mf),
                                # 'Artist-Diversity': {top: np.mean(artist_diversities_mf[top]) for top in TOPs},
                                # 'Genre-Diversity': {top: np.mean(genre_diversities_mf[top]) for top in TOPs}},
                                  },
                        'Test_All': {'R-Precision': rps_mf,
                                    'Hit-Rate': {top: hitrates_mf[top] for top in TOPs},
                                    'AUC': aucs_mf,
                                    'Spread': spreads_mf,
                                    'Novelty': novelties_mf,
                                    'PTop': ptops_mf,
                                    # 'Artist-Diversity': artist_diversities_mf,
                                    # 'Genre-Diversity': genre_diversities_mf}}}
                                    }}}
perf_mf[dataset_name]['Test']


Out[69]:
{'R-Precision': 0.023643532856730952,
 'Hit-Rate': {5: 0.011183730875065292,
  10: 0.020073863889749695,
  20: 0.03467224557213608,
  30: 0.04667466656721038,
  50: 0.06779320626913173,
  100: 0.10818556329847187,
  200: 0.17109838794580584,
  300: 0.21805549618867073,
  500: 0.28731175340501636,
  700: 0.34238602739446583,
  1000: 0.4050517861640513},
 'AUC': 0.8677202254927863,
 'Spread': 10.72455,
 'Novelty': {5: -8.033787006566996,
  10: -7.917739973769313,
  20: -7.753288204961613,
  30: -7.626889284885757,
  50: -7.455813960637479,
  100: -7.187429205948604,
  200: -6.85386398170347,
  300: -6.620386611483843,
  500: -6.286775605109602,
  700: -6.041158351980454,
  1000: -5.75100264121414},
 'PTop': 0.00317562697904186}

In [9]:
fperf_mf = os.path.join(data_dir, 'perf-mf.pkl')
print(fperf_mf)
pkl.dump(perf_mf, open(fperf_mf, 'wb'))
pkl.load(open(fperf_mf, 'rb'))[dataset_name]['Test']


Out[9]:
{'R-Precision': 0.023643532856730952,
 'Hit-Rate': {5: 0.011183730875065292,
  10: 0.020073863889749695,
  20: 0.03467224557213608,
  30: 0.04667466656721038,
  50: 0.06779320626913173,
  100: 0.10818556329847187,
  200: 0.17109838794580584,
  300: 0.21805549618867073,
  500: 0.28731175340501636,
  700: 0.34238602739446583,
  1000: 0.4050517861640513},
 'AUC': 0.8677202254927863,
 'Spread': 10.72455,
 'Novelty': {5: -8.033787006566996,
  10: -7.917739973769313,
  20: -7.753288204961613,
  30: -7.626889284885757,
  50: -7.455813960637479,
  100: -7.187429205948604,
  200: -6.85386398170347,
  300: -6.620386611483843,
  500: -6.286775605109602,
  700: -6.041158351980454,
  1000: -5.75100264121414},
 'PTop': 0.00317562697904186}