In [ ]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os, sys
import gzip
import pickle as pkl
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, average_precision_score
from scipy.optimize import check_grad
from scipy.sparse import lil_matrix, issparse, hstack, vstack
from collections import Counter
import itertools as itt
import matplotlib.pyplot as plt
import seaborn as sns
In [ ]:
np_settings0 = np.seterr(all='raise')
RAND_SEED = 1234567890
In [ ]:
datasets = ['aotm2011', '30music']
ffeature = 'data/msd/song2feature.pkl.gz'
fgenre = 'data/msd/song2genre.pkl.gz'
N_SEEDS = [10000, 5000]
In [ ]:
dix = 1
dataset_name = datasets[dix]
data_dir = 'data/%s' % dataset_name
fplaylist = os.path.join(data_dir, '%s-playlist.pkl.gz' % dataset_name)
print(dataset_name)
Load playlists.
In [ ]:
_all_playlists = pkl.load(gzip.open(fplaylist, 'rb'))
In [ ]:
# _all_playlists[0]
In [ ]:
all_playlists = []
if type(_all_playlists[0][1]) == tuple:
    for pl, u in _all_playlists:
        user = '%s_%s' % (u[0], u[1])  # user string
        all_playlists.append((pl, user))
else:
    all_playlists = _all_playlists
In [ ]:
# user_playlists = dict()
# for pl, u in all_playlists:
#     try:
#         user_playlists[u].append(pl)
#     except KeyError:
#         user_playlists[u] = [pl]
In [ ]:
# all_playlists = []
# for u in user_playlists:
#     if len(user_playlists[u]) > 4:
#         all_playlists += [(pl, u) for pl in user_playlists[u]]
In [ ]:
all_users = sorted({user for _, user in all_playlists})
In [ ]:
print('#user : {:,}'.format(len(all_users)))
print('#playlist: {:,}'.format(len(all_playlists)))
In [ ]:
pl_lengths = [len(pl) for pl, _ in all_playlists]
plt.hist(pl_lengths, bins=100)
print('Average playlist length: %.1f' % np.mean(pl_lengths))
Check for duplicate songs within the same playlist: the two totals below differ if any playlist contains repeated songs.
In [ ]:
print('{:,} | {:,}'.format(np.sum(pl_lengths), np.sum([len(set(pl)) for pl, _ in all_playlists])))
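As a follow-up, a minimal sketch (using only objects defined above) that counts how many playlists contain at least one duplicated song:
In [ ]:
n_dup = sum(1 for pl, _ in all_playlists if len(pl) != len(set(pl)))
print('{:,} of {:,} playlists contain duplicated songs'.format(n_dup, len(all_playlists)))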
Load the song_id --> feature array mapping: it maps each song to the audio features of one of its corresponding tracks in MSD.
In [ ]:
song2feature = pkl.load(gzip.open(ffeature, 'rb'))
Song genres from the MSD Allmusic Genre Dataset (Top MAGD) and tagtraum.
In [ ]:
song2genre = pkl.load(gzip.open(fgenre, 'rb'))
In [ ]:
_all_songs = sorted([(sid, int(song2feature[sid][-1])) for sid in {s for pl, _ in all_playlists for s in pl}],
key=lambda x: (x[1], x[0]))
print('{:,}'.format(len(_all_songs)))
In [ ]:
# estimated sizes (in MiB) of dense data matrices, at 8 bytes per entry
print('%.1f\n%.1f' % (len(_all_songs) * len(all_playlists) * 8 / (1024**2),
                      len(_all_songs) * (218 + len(all_users) - 1) * 8 / (1024**2)))
Randomise the order of songs with the same age.
In [ ]:
song_age_dict = dict()
for sid, age in _all_songs:
    age = int(age)
    try:
        song_age_dict[age].append(sid)
    except KeyError:
        song_age_dict[age] = [sid]
In [ ]:
all_songs = []
np.random.seed(RAND_SEED)
for age in sorted(song_age_dict.keys()):
    all_songs += [(sid, age) for sid in np.random.permutation(song_age_dict[age])]
In [ ]:
pkl.dump(all_songs, gzip.open(os.path.join(data_dir, 'setting2/all_songs.pkl.gz'), 'wb'))
Check if all songs have genre info.
In [ ]:
print('#songs missing genre: {:,}'.format(len(all_songs) - np.sum([sid in song2genre for (sid, _) in all_songs])))
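For reference, a sketch of the genre distribution over all songs, using the Counter imported above (songs without genre info appear under None):
In [ ]:
genre_counts = Counter(song2genre.get(sid) for sid, _ in all_songs)
print(genre_counts.most_common(10))  # the ten most common genres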
Song popularity.
In [ ]:
song2index = {sid: ix for ix, (sid, _) in enumerate(all_songs)}
song_pl_mat = lil_matrix((len(all_songs), len(all_playlists)), dtype=np.int8)
for j in range(len(all_playlists)):
    pl = all_playlists[j][0]
    ind = [song2index[sid] for sid in pl]
    song_pl_mat[ind, j] = 1
In [ ]:
song_pop = song_pl_mat.tocsc().sum(axis=1)
In [ ]:
max_pop = np.max(song_pop)
max_pop
In [ ]:
song2pop = {sid: song_pop[song2index[sid], 0] for (sid, _) in all_songs}
In [ ]:
pkl.dump(song2pop, gzip.open(os.path.join(data_dir, 'setting2/song2pop.pkl.gz'), 'wb'))
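A consistency check (a sketch): since song_pl_mat is binary, the total popularity should equal the number of unique song occurrences across playlists.
In [ ]:
assert np.sum(list(song2pop.values())) == np.sum([len(set(pl)) for pl, _ in all_playlists])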
Deal with one outlier (disabled below).
In [ ]:
# song_pop1 = song_pop.copy()
# maxix = np.argmax(song_pop)
# song_pop1[maxix] = 0
# clipped_max_pop = np.max(song_pop1) + 10  # second_max_pop + 10
# if max_pop - clipped_max_pop > 500:
#     song_pop1[maxix] = clipped_max_pop
Split songs so that the most recently released songs (by year) form the dev and test sets, and the rest form the training set.
In [ ]:
N_NEW_SONGS = N_SEEDS[dix]
print(N_NEW_SONGS)
In [ ]:
# dev_ratio = 0.1
# test_ratio = 0.1
# nsong_dev_test = int(len(all_songs) * (dev_ratio + test_ratio))
# train_song_set = all_songs[nsong_dev_test:]
# # shuffle songs in dev and test set
# np.random.seed(60)
# dev_test_ix = np.random.permutation(np.arange(nsong_dev_test))
# nsong_dev = int(len(all_songs) * dev_ratio)
# dev_song_set = [all_songs[ix] for ix in dev_test_ix[:nsong_dev]]
# test_song_set = [all_songs[ix] for ix in dev_test_ix[nsong_dev:]]
In [ ]:
nsong_test = N_NEW_SONGS
nsong_dev = N_NEW_SONGS
test_song_set = all_songs[:nsong_test]
dev_song_set = all_songs[nsong_test:nsong_test + nsong_dev]
train_song_set = all_songs[nsong_test + nsong_dev:]
In [ ]:
test_songs = set([sid for sid, _ in test_song_set])
cnt = 0
for pl, _ in all_playlists:
    plset = set(pl)
    ninter = len(plset & test_songs)
    if 0 < ninter < len(plset):
        cnt += 1
print('%d of %d playlists contain both test songs and other songs' % (cnt, len(all_playlists)))
In [ ]:
print('#songs in training set: {:,}, average song age: {:.2f} yrs'
      .format(len(train_song_set), np.mean([t[1] for t in train_song_set])))
print('#songs in dev set     : {:,}, average song age: {:.2f} yrs'
      .format(len(dev_song_set), np.mean([t[1] for t in dev_song_set])))
print('#songs in test set    : {:,}, average song age: {:.2f} yrs'
      .format(len(test_song_set), np.mean([t[1] for t in test_song_set])))
In [ ]:
print('#songs: {:,} | {:,}'.format(len(all_songs), len(set(train_song_set + dev_song_set + test_song_set))))
In [ ]:
ax = plt.subplot(111)
ax.hist(song_pop, bins=100)
ax.set_yscale('log')
ax.set_xlim(0, song_pop.max()+10)
ax.set_title('Histogram of song popularity')
pass
In [ ]:
train_song_pop = [song2pop[sid] for (sid, _) in train_song_set]
#if np.max(train_song_pop) > clipped_max_pop:
#    train_song_pop[np.argmax(train_song_pop)] = clipped_max_pop
ax = plt.subplot(111)
ax.hist(train_song_pop, bins=100)
ax.set_yscale('log')
ax.set_xlim(0, song_pop.max()+10)
ax.set_title('Histogram of song popularity in TRAINING set')
pass
In [ ]:
dev_song_pop = [song2pop[sid] for (sid, _) in dev_song_set]
#if np.max(dev_song_pop) > clipped_max_pop:
#    dev_song_pop[np.argmax(dev_song_pop)] = clipped_max_pop
ax = plt.subplot(111)
ax.hist(dev_song_pop, bins=100)
ax.set_yscale('log')
ax.set_xlim(0, song_pop.max()+10)
ax.set_title('Histogram of song popularity in DEV set')
pass
In [ ]:
test_song_pop = [song2pop[sid] for (sid, _) in test_song_set]
#if np.max(test_song_pop) > clipped_max_pop:
#    test_song_pop[np.argmax(test_song_pop)] = clipped_max_pop
ax = plt.subplot(111)
ax.hist(test_song_pop, bins=100)
ax.set_yscale('log')
ax.set_xlim(0, song_pop.max()+10)
ax.set_title('Histogram of song popularity in TEST set')
pass
Songs as rows, playlists as columns.
In [ ]:
def gen_dataset(playlists, song2feature, song2genre, train_song_set,
                dev_song_set=[], test_song_set=[], song2pop_train=None):
    """
    Create a labelled dataset: rows are songs, columns are playlists.
    Input:
        - playlists: a list of playlists
        - train_song_set: a list of songIDs in the training set
        - dev_song_set: a list of songIDs in the dev set
        - test_song_set: a list of songIDs in the test set
        - song2feature: dictionary that maps a songID to its audio features from MSD
        - song2genre: dictionary that maps a songID to its genre
        - song2pop_train: dictionary that maps a songID to its popularity
    Output:
        - (Feature, Label) pair (X, Y)
          X: #songs by #features
          Y: #songs by #playlists
    """
    song_set = train_song_set + dev_song_set + test_song_set
    N = len(song_set)
    K = len(playlists)

    genre_set = sorted({v for v in song2genre.values()})
    genre2index = {genre: ix for ix, genre in enumerate(genre_set)}

    def onehot_genre(songID):
        """
        One-hot encoding of genres.
        Possible imputation strategies for songs without genre info:
            - one extra entry for songs without genre info
            - mean imputation
            - sampling from the distribution of genre popularity
        """
        num = len(genre_set)  # + 1
        vec = np.zeros(num, dtype=float)
        if songID in song2genre:
            genre_ix = genre2index[song2genre[songID]]
            vec[genre_ix] = 1
        else:
            vec[:] = np.nan
            # vec[-1] = 1
        return vec

    # X = np.array([features_MSD[sid] for sid in song_set])  # without using genre
    # Y = np.zeros((N, K), dtype=bool)
    X = np.array([np.concatenate([song2feature[sid], onehot_genre(sid)], axis=-1) for sid in song_set])
    Y = lil_matrix((N, K), dtype=bool)

    song2index = {sid: ix for ix, sid in enumerate(song_set)}
    for k in range(K):
        pl = playlists[k]
        indices = [song2index[sid] for sid in pl if sid in song2index]
        Y[indices, k] = True

    # genre imputation: replace NaNs with the per-genre mean over songs that have genre info
    genre_ix_start = -len(genre_set)
    genre_nan = np.isnan(X[:, genre_ix_start:])
    genre_mean = np.nansum(X[:, genre_ix_start:], axis=0) / (X.shape[0] - np.sum(genre_nan, axis=0))
    # print(np.nansum(X[:, genre_ix_start:], axis=0))
    # print(genre_set)
    # print(genre_mean)
    for j in range(len(genre_set)):
        X[genre_nan[:, j], j + genre_ix_start] = genre_mean[j]

    # normalise the sum of all genres per song to 1
    # X[:, -len(genre_set):] /= X[:, -len(genre_set):].sum(axis=1).reshape(-1, 1)
    # NOTE: this is not necessary, as the imputed values are guaranteed to be normalised (sum to 1)
    # due to the above method of computing the mean genres.

    # the log of song popularity
    if song2pop_train is not None:
        # for sid in song_set:
        #     assert sid in song2pop_train  # trust the input
        logsongpop = np.log([song2pop_train[sid] + 1 for sid in song_set])  # deal with 0 popularity
        X = np.hstack([X, logsongpop.reshape(-1, 1)])

    # return X, Y
    Y = Y.tocsr()
    train_ix = [song2index[sid] for sid in train_song_set]
    X_train = X[train_ix, :]
    Y_train = Y[train_ix, :]

    dev_ix = [song2index[sid] for sid in dev_song_set]
    X_dev = X[dev_ix, :]
    Y_dev = Y[dev_ix, :]

    test_ix = [song2index[sid] for sid in test_song_set]
    X_test = X[test_ix, :]
    Y_test = Y[test_ix, :]

    if len(dev_song_set) > 0:
        if len(test_song_set) > 0:
            return X_train, Y_train.tocsc(), X_dev, Y_dev.tocsc(), X_test, Y_test.tocsc()
        else:
            return X_train, Y_train.tocsc(), X_dev, Y_dev.tocsc()
    else:
        if len(test_song_set) > 0:
            return X_train, Y_train.tocsc(), X_test, Y_test.tocsc()
        else:
            return X_train, Y_train.tocsc()
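To illustrate the NOTE on imputation above, a self-contained toy example (made-up one-hot rows, not the real data) showing that mean-imputed genre vectors sum to 1 by construction:
In [ ]:
G = np.array([[1., 0., 0.],
              [0., 1., 0.],
              [np.nan, np.nan, np.nan]])  # third song has no genre info
nan_mask = np.isnan(G)
col_mean = np.nansum(G, axis=0) / (G.shape[0] - np.sum(nan_mask, axis=0))
for j in range(G.shape[1]):
    G[nan_mask[:, j], j] = col_mean[j]
print(G.sum(axis=1))  # every row sums to 1, including the imputed one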
In [ ]:
pkl_dir = os.path.join(data_dir, 'setting1')
fsongs = os.path.join(pkl_dir, 'songs_train_dev_test_s1.pkl.gz')
fpl = os.path.join(pkl_dir, 'playlists_s1.pkl.gz')
fxtrain = os.path.join(pkl_dir, 'X_train.pkl.gz')
fytrain = os.path.join(pkl_dir, 'Y_train.pkl.gz')
fxdev = os.path.join(pkl_dir, 'X_dev.pkl.gz')
fydev = os.path.join(pkl_dir, 'Y_dev.pkl.gz')
fxtest = os.path.join(pkl_dir, 'X_test.pkl.gz')
fytest = os.path.join(pkl_dir, 'Y_test.pkl.gz')
fxtrndev = os.path.join(pkl_dir, 'X_train_dev.pkl.gz')
fytrndev = os.path.join(pkl_dir, 'Y_train_dev.pkl.gz')
fclique_train = os.path.join(pkl_dir, 'cliques_train.pkl.gz')
fclique_trndev = os.path.join(pkl_dir, 'cliques_trndev.pkl.gz')
In [ ]:
X_train, Y_train, X_dev, Y_dev, X_test, Y_test = gen_dataset(playlists=[t[0] for t in all_playlists],
                                                             song2feature=song2feature, song2genre=song2genre,
                                                             train_song_set=[t[0] for t in train_song_set],
                                                             dev_song_set=[t[0] for t in dev_song_set],
                                                             test_song_set=[t[0] for t in test_song_set])
Feature normalisation.
In [ ]:
X_train_mean = np.mean(X_train, axis=0).reshape((1, -1))
X_train_std = np.std(X_train, axis=0).reshape((1, -1)) + 10 ** (-6)
X_train -= X_train_mean
X_train /= X_train_std
X_dev -= X_train_mean
X_dev /= X_train_std
X_test -= X_train_mean
X_test /= X_train_std
In [ ]:
X_train_dev = np.vstack([X_train, X_dev])
Y_train_dev = vstack([Y_train.tolil(), Y_dev.tolil()]).tocsc().astype(bool)
# NOTE: explicitly setting the dtype of Y is necessary; see scikit-learn issue #9777
Remove playlists that have no songs in the training set.
In [ ]:
ix_ytrain = np.where(Y_train.sum(axis=0).A.reshape(-1) > 0)[0]
In [ ]:
Y_train = Y_train[:, ix_ytrain]
Y_dev = Y_dev[:, ix_ytrain]
In [ ]:
print('%d playlists with 0 seed song (training set)' % \
np.where(Y_train.sum(axis=0).A.reshape(-1) == 0)[0].shape[0])
In [ ]:
print('%d playlists in dev set, among %d' % \
(Y_dev.shape[1] - np.where(Y_dev.sum(axis=0).A.reshape(-1) == 0)[0].shape[0], Y_dev.shape[1]))
Remove playlists that have no songs in the training + dev set.
In [ ]:
ix_ytrndev = np.where(Y_train_dev.sum(axis=0).A.reshape(-1) > 0)[0]
In [ ]:
Y_train_dev = Y_train_dev[:, ix_ytrndev]
Y_test = Y_test[:, ix_ytrndev]
In [ ]:
print('%d playlists with 0 seed song (training + dev set)' % \
np.where(Y_train_dev.sum(axis=0).A.reshape(-1) == 0)[0].shape[0])
In [ ]:
print('%d playlists in test set, among %d' % \
(Y_test.shape[1] - np.where(Y_test.sum(axis=0).A.reshape(-1) == 0)[0].shape[0], Y_test.shape[1]))
Save data.
In [ ]:
print('Train : %15s %15s' % (X_train.shape, Y_train.shape))
print('Dev : %15s %15s' % (X_dev.shape, Y_dev.shape))
print('Test : %15s %15s' % (X_test.shape, Y_test.shape))
print('Trndev: %15s %15s' % (X_train_dev.shape, Y_train_dev.shape))
In [ ]:
# sanity check: features should have ~zero mean and ~unit std after normalisation
print(np.mean(np.mean(X_train, axis=0)))
print(np.mean(np.std(X_train, axis=0)) - 1)
print(np.mean(np.mean(X_dev, axis=0)))
print(np.mean(np.std(X_dev, axis=0)) - 1)
print(np.mean(np.mean(X_train_dev, axis=0)))
print(np.mean(np.std(X_train_dev, axis=0)) - 1)
print(np.mean(np.mean(X_test, axis=0)))
print(np.mean(np.std(X_test, axis=0)) - 1)
In [ ]:
pkl.dump(X_train, gzip.open(fxtrain, 'wb'))
pkl.dump(Y_train, gzip.open(fytrain, 'wb'))
pkl.dump(X_dev, gzip.open(fxdev, 'wb'))
pkl.dump(Y_dev, gzip.open(fydev, 'wb'))
pkl.dump(X_test, gzip.open(fxtest, 'wb'))
pkl.dump(Y_test, gzip.open(fytest, 'wb'))
pkl.dump(X_train_dev, gzip.open(fxtrndev, 'wb'))
pkl.dump(Y_train_dev, gzip.open(fytrndev, 'wb'))
In [ ]:
pkl.dump({'train_song_set': train_song_set, 'dev_song_set': dev_song_set, 'test_song_set': test_song_set},
gzip.open(fsongs, 'wb'))
pkl.dump(all_playlists, gzip.open(fpl, 'wb'))
Indices of playlists from the same user (training set): playlists of the same user form a clique in the graph whose nodes are playlists.
In [ ]:
user_of_playlists_train = [u for (_, u) in [all_playlists[ix] for ix in ix_ytrain]]
clique_list_train = []
for u in sorted(set(user_of_playlists_train)):
    clique = np.where(u == np.array(user_of_playlists_train, dtype=object))[0]
    clique_list_train.append(clique)
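The same cliques can be built by grouping indices with a dictionary, which avoids scanning the full list once per user; a minimal sketch (assuming the same objects are in scope):
In [ ]:
from collections import defaultdict
user2indices = defaultdict(list)
for ix, u in enumerate(user_of_playlists_train):
    user2indices[u].append(ix)
clique_list_train_alt = [np.asarray(user2indices[u]) for u in sorted(user2indices)]
assert all(np.array_equal(a, b) for a, b in zip(clique_list_train, clique_list_train_alt))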
In [ ]:
pkl.dump(clique_list_train, gzip.open(fclique_train, 'wb'))
In [ ]:
clqsize = [len(clique) for clique in clique_list_train]
print(np.min(clqsize), np.max(clqsize), len(clqsize), np.sum(clqsize))
In [ ]:
# np.where('b' == np.array(['abcdefghi', 'b']))
In [ ]:
# comparing a tuple against an object array gives unexpected results,
# so use a single string instead of a tuple to represent a user
# np.where((968763600.0, 'Aguilar') == np.array([(968763600.0, 'Aguilar'), (1042808400.0, 'Aguilar')],
#          dtype=object))
In [ ]:
assert np.all(np.arange(Y_train.shape[1]) == np.asarray(sorted([k for clq in clique_list_train for k in clq])))
Indices of playlists from the same user (training + dev set): playlists of the same user form a clique in the graph whose nodes are playlists.
In [ ]:
user_of_playlists_trndev = [u for (_, u) in [all_playlists[ix] for ix in ix_ytrndev]]
clique_list_trndev = []
for u in sorted(set(user_of_playlists_trndev)):
    clique = np.where(u == np.array(user_of_playlists_trndev, dtype=object))[0]
    clique_list_trndev.append(clique)
In [ ]:
pkl.dump(clique_list_trndev, gzip.open(fclique_trndev, 'wb'))
In [ ]:
clqsize = [len(clique) for clique in clique_list_trndev]
print(np.min(clqsize), np.max(clqsize), len(clqsize), np.sum(clqsize))
In [ ]:
assert np.all(np.arange(Y_train_dev.shape[1]) == \
np.asarray(sorted([k for clq in clique_list_trndev for k in clq])))
Split playlists (60/10/30 train/dev/test split) uniformly at random.
An alternative, commented out below: split each user's playlists (60/20/20 train/dev/test split) uniformly at random if the user has $5$ or more playlists.
In [ ]:
train_playlists = []
dev_playlists = []
test_playlists = []
In [ ]:
dev_ratio = 0.1
test_ratio = 0.3
npl_dev = int(dev_ratio * len(all_playlists))
npl_test = int(test_ratio * len(all_playlists))
np.random.seed(RAND_SEED)
pl_indices = np.random.permutation(len(all_playlists))  # shuffle, then slice the permuted indices
test_playlists = [all_playlists[ix] for ix in pl_indices[:npl_test]]
dev_playlists = [all_playlists[ix] for ix in pl_indices[npl_test:npl_test + npl_dev]]
train_playlists = [all_playlists[ix] for ix in pl_indices[npl_test + npl_dev:]]
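A quick sanity check (a sketch) that the random split partitions all playlists:
In [ ]:
assert len(train_playlists) + len(dev_playlists) + len(test_playlists) == len(all_playlists)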
In [ ]:
# user_playlists = dict()
# for pl, u in all_playlists:
#     try:
#         user_playlists[u].append(pl)
#     except KeyError:
#         user_playlists[u] = [pl]
In [ ]:
# sanity check
# npl_all = np.sum([len(user_playlists[u]) for u in user_playlists])
# print('{:30s} {:,}'.format('#users:', len(user_playlists)))
# print('{:30s} {:,}'.format('#playlists:', npl_all))
# print('{:30s} {:.2f}'.format('Average #playlists per user:', npl_all / len(user_playlists)))
In [ ]:
# np.random.seed(RAND_SEED)
# for u in user_playlists:
#     playlists_u = [(pl, u) for pl in user_playlists[u]]
#     if len(user_playlists[u]) < 5:
#         train_playlists += playlists_u
#     else:
#         npl_test = int(test_ratio * len(user_playlists[u]))
#         npl_dev = int(dev_ratio * len(user_playlists[u]))
#         pl_indices = np.random.permutation(len(user_playlists[u]))
#         test_playlists += [playlists_u[ix] for ix in pl_indices[:npl_test]]
#         dev_playlists += [playlists_u[ix] for ix in pl_indices[npl_test:npl_test + npl_dev]]
#         train_playlists += [playlists_u[ix] for ix in pl_indices[npl_test + npl_dev:]]
In [ ]:
print('{:30s} {:,}'.format('#playlist in training set:', len(train_playlists)))
print('{:30s} {:,}'.format('#playlist in dev set:', len(dev_playlists)))
print('{:30s} {:,}'.format('#playlist in test set:', len(test_playlists)))
In [ ]:
len(train_playlists) + len(dev_playlists)
In [ ]:
xmax = np.max([len(pl) for (pl, _) in all_playlists]) + 1
In [ ]:
ax = plt.subplot(111)
ax.hist([len(pl) for (pl, _) in train_playlists], bins=100)
ax.set_yscale('log')
ax.set_xlim(0, xmax)
ax.set_title('Histogram of playlist length in TRAINING set')
pass
In [ ]:
ax = plt.subplot(111)
ax.hist([len(pl) for (pl, _) in dev_playlists], bins=100)
ax.set_yscale('log')
ax.set_xlim(0, xmax)
ax.set_title('Histogram of playlist length in DEV set')
pass
In [ ]:
ax = plt.subplot(111)
ax.hist([len(pl) for (pl, _) in test_playlists], bins=100)
ax.set_yscale('log')
ax.set_xlim(0, xmax)
ax.set_title('Histogram of playlist length in TEST set')
pass
Hold out the last half of songs for each playlist in the dev and test sets (disabled below).
In [ ]:
#dev_playlists_obs = [pl[:-int(len(pl)/2)] for (pl, _) in dev_playlists]
#dev_playlists_held = [pl[-int(len(pl)/2):] for (pl, _) in dev_playlists]
#test_playlists_obs = [pl[:-int(len(pl)/2)] for (pl, _) in test_playlists]
#test_playlists_held = [pl[-int(len(pl)/2):] for (pl, _) in test_playlists]
Keep the first $K = 1, 2, 3, 4$ songs of each playlist in the dev and test sets.
In [ ]:
N_SEED_K = 1
In [ ]:
dev_playlists_obs = []
dev_playlists_held = []
test_playlists_obs = []
test_playlists_held = []
In [ ]:
# np.random.seed(135792468)
# for pl, _ in dev_playlists:
#     npl = len(pl)
#     k = np.random.choice(np.arange(1, npl))
#     dev_playlists_obs.append(pl[:k])
#     dev_playlists_held.append(pl[k:])
# for pl, _ in test_playlists:
#     npl = len(pl)
#     k = np.random.choice(np.arange(1, npl))
#     test_playlists_obs.append(pl[:k])
#     test_playlists_held.append(pl[k:])
In [ ]:
for pl, _ in dev_playlists:
    k = N_SEED_K
    dev_playlists_obs.append(pl[:k])
    dev_playlists_held.append(pl[k:])
for pl, _ in test_playlists:
    k = N_SEED_K
    test_playlists_obs.append(pl[:k])
    test_playlists_held.append(pl[k:])
In [ ]:
for ix in range(len(dev_playlists)):
    assert np.all(dev_playlists[ix][0] == dev_playlists_obs[ix] + dev_playlists_held[ix])
for ix in range(len(test_playlists)):
    assert np.all(test_playlists[ix][0] == test_playlists_obs[ix] + test_playlists_held[ix])
In [ ]:
print('DEV obs: {:,} | DEV held: {:,} \nTEST obs: {:,} | TEST held: {:,}'.format(
np.sum([len(ppl) for ppl in dev_playlists_obs]), np.sum([len(ppl) for ppl in dev_playlists_held]),
np.sum([len(ppl) for ppl in test_playlists_obs]), np.sum([len(ppl) for ppl in test_playlists_held])))
In [ ]:
song2pop_train = song2pop.copy()
song2pop_train_dev = song2pop.copy()
for ppl in dev_playlists_held:
    for sid in ppl:
        song2pop_train[sid] -= 1
for ppl in test_playlists_held:
    for sid in ppl:
        song2pop_train[sid] -= 1
        song2pop_train_dev[sid] -= 1
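A light check (a sketch) that the discounted popularity counts stay sensible:
In [ ]:
print(min(song2pop_train.values()), min(song2pop_train_dev.values()))  # expected to be non-negative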
In [ ]:
pkl_dir2 = os.path.join(data_dir, 'setting2')
fpl2 = os.path.join(pkl_dir2, 'playlists_train_dev_test_s2_%d.pkl.gz' % N_SEED_K)
fy2 = os.path.join(pkl_dir2, 'Y_%d.pkl.gz' % N_SEED_K)
fxtrain2 = os.path.join(pkl_dir2, 'X_train_%d.pkl.gz' % N_SEED_K)
fytrain2 = os.path.join(pkl_dir2, 'Y_train_%d.pkl.gz' % N_SEED_K)
fxtrndev2 = os.path.join(pkl_dir2, 'X_trndev_%d.pkl.gz' % N_SEED_K)
fytrndev2 = os.path.join(pkl_dir2, 'Y_trndev_%d.pkl.gz' % N_SEED_K)
fydev2 = os.path.join(pkl_dir2, 'PU_dev_%d.pkl.gz' % N_SEED_K)
fytest2 = os.path.join(pkl_dir2, 'PU_test_%d.pkl.gz' % N_SEED_K)
fclique21 = os.path.join(pkl_dir2, 'cliques_trndev_%d.pkl.gz' % N_SEED_K)
fclique22 = os.path.join(pkl_dir2, 'cliques_all_%d.pkl.gz' % N_SEED_K)
In [ ]:
X, Y = gen_dataset(playlists=[t[0] for t in train_playlists + dev_playlists + test_playlists],
                   song2feature=song2feature, song2genre=song2genre,
                   train_song_set=[t[0] for t in all_songs], song2pop_train=song2pop_train)
In [ ]:
X_train = X
assert X.shape[0] == len(song2pop_train_dev)
X_train_dev = X_train.copy()
X_train_dev[:, -1] = np.log([song2pop_train_dev[sid]+1 for sid, _ in all_songs])
In [ ]:
dev_cols = np.arange(len(train_playlists), len(train_playlists) + len(dev_playlists))
test_cols = np.arange(len(train_playlists) + len(dev_playlists), Y.shape[1])
assert len(dev_cols) == len(dev_playlists) == len(dev_playlists_obs)
assert len(test_cols) == len(test_playlists) == len(test_playlists_obs)
In [ ]:
pkl.dump({'train_playlists': train_playlists, 'dev_playlists': dev_playlists, 'test_playlists': test_playlists,
'dev_playlists_obs': dev_playlists_obs, 'dev_playlists_held': dev_playlists_held,
'test_playlists_obs': test_playlists_obs, 'test_playlists_held': test_playlists_held},
gzip.open(fpl2, 'wb'))
In [ ]:
song2index = {sid: ix for ix, sid in enumerate([t[0] for t in all_songs])}
Use dedicated sparse matrices to represent which entries are observed in the dev and test sets.
In [ ]:
Y_train = Y[:, :len(train_playlists)].tocsc()
Y_train_dev = Y[:, :len(train_playlists) + len(dev_playlists)].tocsc()
In [ ]:
PU_dev = lil_matrix((len(all_songs), len(dev_playlists)), dtype=bool)
PU_test = lil_matrix((len(all_songs), len(test_playlists)), dtype=bool)
num_known_dev = 0
for j in range(len(dev_playlists)):
    if (j+1) % 1000 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, len(dev_playlists))); sys.stdout.flush()
    rows = [song2index[sid] for sid in dev_playlists_obs[j]]
    PU_dev[rows, j] = True
    num_known_dev += len(rows)
num_known_test = 0
for j in range(len(test_playlists)):
    if (j+1) % 1000 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, len(test_playlists))); sys.stdout.flush()
    rows = [song2index[sid] for sid in test_playlists_obs[j]]
    PU_test[rows, j] = True
    num_known_test += len(rows)
PU_dev = PU_dev.tocsr()
PU_test = PU_test.tocsr()
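A light sanity check (a sketch) on the observation matrices just built:
In [ ]:
print(PU_dev.sum(), num_known_dev)    # expected to match (no duplicate songs within an observed prefix)
print(PU_test.sum(), num_known_test)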
In [ ]:
print('#unknown entries in DEV set: {:15,d} | {:15,d} \n#unknown entries in TEST set: {:15,d} | {:15,d}'.format(
np.prod(PU_dev.shape) - PU_dev.sum(), len(dev_playlists) * len(all_songs) - num_known_dev,
np.prod(PU_test.shape) - PU_test.sum(), len(test_playlists) * len(all_songs) - num_known_test))
In [ ]:
# print('#unknown entries in Setting I: {:,}'.format((len(dev_song_set) + len(test_song_set)) * Y.shape[1]))
Feature normalisation.
In [ ]:
X_train_mean = np.mean(X_train, axis=0).reshape((1, -1))
X_train_std = np.std(X_train, axis=0).reshape((1, -1)) + 10 ** (-6)
X_train -= X_train_mean
X_train /= X_train_std
In [ ]:
X_trndev_mean = np.mean(X_train_dev, axis=0).reshape((1, -1))
X_trndev_std = np.std(X_train_dev, axis=0).reshape((1, -1)) + 10 ** (-6)
X_train_dev -= X_trndev_mean
X_train_dev /= X_trndev_std
In [ ]:
# sanity check: mean ~ 0 and std ~ 1 after normalisation
print(np.mean(np.mean(X_train, axis=0)))
print(np.mean(np.std(X_train, axis=0)) - 1)
print(np.mean(np.mean(X_train_dev, axis=0)))
print(np.mean(np.std(X_train_dev, axis=0)) - 1)
In [ ]:
print('All : %s' % str(Y.shape))
print('Train : %s, %s' % (X_train.shape, Y_train.shape))
print('Dev : %s' % str(PU_dev.shape))
print('Trndev: %s, %s' % (X_train_dev.shape, Y_train_dev.shape))
print('Test : %s' % str(PU_test.shape))
In [ ]:
pkl.dump(X_train, gzip.open(fxtrain2, 'wb'))
pkl.dump(Y_train, gzip.open(fytrain2, 'wb'))
pkl.dump(Y, gzip.open(fy2, 'wb'))
pkl.dump(X_train_dev, gzip.open(fxtrndev2, 'wb'))
pkl.dump(Y_train_dev, gzip.open(fytrndev2, 'wb'))
pkl.dump(PU_dev, gzip.open(fydev2, 'wb'))
pkl.dump(PU_test, gzip.open(fytest2, 'wb'))
Build the adjacency matrix of playlists (nodes) for setting II: playlists of the same user form a clique.
Cliques in train + dev set.
In [ ]:
pl_users = [u for (_, u) in train_playlists + dev_playlists]
cliques_train_dev = []
for u in sorted(set(pl_users)):
    clique = np.where(u == np.array(pl_users, dtype=object))[0]
    # if len(clique) > 1:
    cliques_train_dev.append(clique)
In [ ]:
pkl.dump(cliques_train_dev, gzip.open(fclique21, 'wb'))
In [ ]:
clqsize = [len(clq) for clq in cliques_train_dev]
print(np.min(clqsize), np.max(clqsize), len(clqsize), np.sum(clqsize))
In [ ]:
assert np.all(np.arange(Y_train_dev.shape[1]) == np.asarray(sorted([k for clq in cliques_train_dev for k in clq])))
Cliques in train + dev + test set.
In [ ]:
pl_users = [u for (_, u) in train_playlists + dev_playlists + test_playlists]
clique_list2 = []
for u in sorted(set(pl_users)):
    clique = np.where(u == np.array(pl_users, dtype=object))[0]
    # if len(clique) > 1:
    clique_list2.append(clique)
In [ ]:
pkl.dump(clique_list2, gzip.open(fclique22, 'wb'))
In [ ]:
clqsize = [len(clq) for clq in clique_list2]
print(np.min(clqsize), np.max(clqsize), len(clqsize), np.sum(clqsize))
In [ ]:
assert np.all(np.arange(Y.shape[1]) == np.asarray(sorted([k for clq in clique_list2 for k in clq])))