Use a nice subset of AotM-2011 Playlists with MSD Audio Features


In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import os, sys
import gzip
import pickle as pkl
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from scipy.sparse import lil_matrix, issparse
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
sys.path.append('src')
from BinaryRelevance import BinaryRelevance
from PClassificationMLC import PClassificationMLC
from evaluate import calc_F1, calc_precisionK, f1_score_nowarn
from PCMLC import PCMLC

In [3]:
data_dir = 'data/aotm-2011'
#faotm = os.path.join(data_dir, 'aotm2011-subset.pkl')
faotm = os.path.join(data_dir, 'aotm2011-user-playlist.pkl')
ffeature = 'data/msd/songID2Features.pkl.gz'

Load playlists

Load playlists.


In [4]:
user_playlists = pkl.load(open(faotm, 'rb'))

In [5]:
print('#user    :', len(user_playlists))
print('#playlist:', np.sum([len(user_playlists[u]) for u in user_playlists]))


#user    : 14182
#playlist: 84710

In [ ]:
pl_lengths = [len(pl) for u in user_playlists for pl in user_playlists[u]]
#plt.hist(pl_lengths, bins=100)
print('Average playlist length: %.1f' % np.mean(pl_lengths))

In [ ]:
users = sorted(user_playlists.keys())

In [ ]:
songs_user = {u: {sid for pl in user_playlists[u] for sid in pl} for u in users}  # user: a set of songs

Compute the number of playlists per user, and the number of songs covered by the user's playlists.


In [ ]:
udf = pd.DataFrame(index=users, columns=['#playlist', '#song'])

In [ ]:
udf['#playlist'] = [len(user_playlists[u]) for u in users]

In [ ]:
udf['#song'] = [len(songs_user[u]) for u in users]

In [ ]:
udf_subset = udf[udf['#playlist'] == 50]
udf_subset

In [ ]:
#udf.sort_values(by=['#playlist'], ascending=False).iloc[100:200]

In [ ]:
uid_subset = udf_subset.index[0]
uid_subset

In [ ]:
#udf[uid_subset]  # tuple are used as multiindex in pandas
#udf[[uid_subset]]

Subset of data

The user whose playlists cover a proper number of playlists, e.g. 50.


In [ ]:
playlists_subset = user_playlists[uid_subset]

In [ ]:
song_set = sorted(songs_user[uid_subset])

In [ ]:
len(song_set)

In [ ]:
#song_set

Load song features

Load song_id --> feature array mapping: map a song to the audio features of one of its corresponding tracks in MSD.


In [ ]:
song2Features = pkl.load(gzip.open(ffeature, 'rb'))

The set of songs, which is the set of labels in this formulation.

Split song-playlist matrix

Songs as rows, playlists as columns, split rows.


In [ ]:
def gen_dataset_subset(playlists, song_set, features_MSD):
    """
    Create labelled dataset: rows are songs, columns are users.
    
    Input:
        - playlists: a set of playlists
        - song_set: a set of songIDs
        - features_MSD: dictionary that maps songIDs to features from MSD
    Output:
        - (Feature, Label) pair (X, Y)
          X: #songs by #features
          Y: #songs by #users
    """
    song_indices = {sid: ix for ix, sid in enumerate(song_set)}
    N = len(song_set)
    K = len(playlists)
    
    X = np.array([features_MSD[sid] for sid in song_set])
    Y = np.zeros((N, K), dtype=np.bool)
    
    for k in range(K):
        pl = playlists[k]
        indices = [song_indices[sid] for sid in pl]
        Y[indices, k] = True

    return X, Y

In [ ]:
X, Y = gen_dataset_subset(playlists=playlists_subset, song_set=song_set, features_MSD=song2Features)

# data split: approximately 80/20 for training/dev
X_train, X_dev, Y_train, Y_dev = train_test_split(X, Y, test_size=0.2, random_state=8976321)

# feature normalisation
X_train_mean = np.mean(X_train, axis=0).reshape((1, -1))
X_train_std = 10 * np.std(X_train, axis=0).reshape((1, -1)) + 10 ** (-6)
X_train -= X_train_mean
X_train /= X_train_std
X_dev   -= X_train_mean
X_dev   /= X_train_std

In [ ]:
print('Train: %15s %15s' % (X_train.shape, Y_train.shape))
print('Dev  : %15s %15s' % (X_dev.shape,   Y_dev.shape))

In [ ]:
print(np.mean(np.mean(X_train, axis=0)))
print(np.mean( np.std(X_train, axis=0)) - 0.1)
print(np.mean(np.mean(X_dev, axis=0)))
print(np.mean( np.std(X_dev, axis=0)) - 0.1)

BR - Independent logistic regression

Independent logistic regression.


In [ ]:
br = BinaryRelevance(n_jobs=4)
br.fit(X_train, Y_train)

In [ ]:
Y_br = br.predict(X_dev)

In [ ]:
np.mean(calc_precisionK(Y_dev.T, Y_br.T))

In [ ]:
f1_score_nowarn(Y_dev.ravel(), (Y_br>=0).ravel(), average='binary')

In [ ]:
precision_recall_fscore_support(Y_dev.ravel(), (Y_br>=0).ravel(), average='binary', warn_for=None)

In [ ]:
#np.mean(calc_F1(Y_train, br.predict(X_train) >= 0))

In [ ]:
%%script false
min_pos_score = []
for col in range(Y_dev.shape[1]):
    val = Y_br[:,col][Y_dev[:,col]]
    if len(val) > 0:
        min_pos_score.append(np.min(val))
    else:
        min_pos_score.append(np.nan)
print(np.array(min_pos_score))

In [ ]:
%%script false
max_neg_score = []
for col in range(Y_dev.shape[1]):
    val = Y_br[:,col][np.logical_not(Y_dev[:,col])]
    if len(val) > 0:
        max_neg_score.append(np.max(val))
print(np.array(max_neg_score))

In [ ]:
#print(np.array(min_pos_score)-np.array(max_neg_score))

PC - Multilabel p-classification

P-Classification ~ P-norm push ranking.


In [ ]:
#pc1 = PClassificationMLC(C=1, weighting=True, verticalWeighting=True)
pc1 = PCMLC(C=1, weighting=True, verticalWeighting=True)
pc1.fit(X_train, Y_train)

In [ ]:
X_test = X_dev
Y_test = Y_dev

In [ ]:
np.sum(Y_dev, axis=0)

In [ ]:
Y_pc = pc1.predict(X_test)

In [ ]:
np.mean(calc_precisionK(Y_test.T, Y_pc.T))

In [ ]:
precision_recall_fscore_support(Y_dev.ravel(), (Y_pc>=0).ravel(), average='binary', warn_for=None)

In [ ]:
#np.mean(calc_F1(Y_dev, Y_pc >= 0))

In [ ]:
#np.mean(calc_F1(Y_train, pc.predict(X_train) >= 0))

In [ ]:
min_pos_score = []
for col in range(Y_test.shape[1]):
    val = Y_pc[:,col][Y_test[:,col]]
    if len(val) > 0:
        min_pos_score.append(np.min(val))
    else:
        min_pos_score.append(np.nan)
#plt.hist((np.array(min_pos_score)))
#plt.hist((np.nan_to_num(min_pos_score)), bins=30)
#print(np.array(min_pos_score))
#print()

In [ ]:
max_neg_score = []
for col in range(Y_test.shape[1]):
    val = Y_pc[:,col][np.logical_not(Y_test[:,col])]
    if len(val) > 0:
        max_neg_score.append(np.max(val))
#plt.hist(np.array(max_neg_score), bins=30)
#print()

In [ ]:
#plt.hist(np.nan_to_num(min_pos_score)-np.array(max_neg_score), bins=30)
#print()

Multilabel p-classification with unknows in test set


In [ ]:
N, K = Y.shape

In [ ]:
#type(np.nan)

In [ ]:
Y_nan = Y.copy().astype(np.float)
np.random.seed(8967321)
rand_num = int(0.2 * N)
ones = 0
for k in range(K):
    randix = np.random.permutation(np.arange(N))[:rand_num]
    Y_nan[randix, k] = np.nan
    ones += Y[randix, k].sum()

In [ ]:
#np.sum(Y, axis=0)

In [ ]:
#np.nansum(Y_nan, axis=0)

In [ ]:
#np.sum(Y, axis=0) - np.nansum(Y_nan, axis=0)

In [ ]:
#ones  # number of positive entries selected to be masked as NaN

In [ ]:
#Y.shape

In [ ]:
#Y_nan.shape

The number of NaN entries.


In [ ]:
np.sum(np.isnan(Y_nan))

In [ ]:
#np.sum(Y)

Train: keep running util no overflow warning occurred.


In [ ]:
pc2 = PClassificationMLC(weighting=True, verticalWeighting=True)
pc2.fit(X, Y_nan)

Prediction: use the minimum of positive entry score of the same example as threshold.
Evaluation: use F1 on all unknown entries (as a 1D array).


In [ ]:
Y_pred2 = pc2.predict(X)

In [ ]:
pos_index = np.nan_to_num(Y_nan).astype(np.bool)
nan_index = np.isnan(Y_nan)

In [ ]:
ground_truths = Y[nan_index]

In [ ]:
thresholds = []
preds = []
for k in range(K):
    val = Y_pred2[:, k][pos_index[:, k]]
    th = np.min(val)
    thresholds.append(th)
    preds += (Y_pred2[nan_index[:,k], k] >= th).tolist()

In [ ]:
f1_score_nowarn(ground_truths, preds, average='binary')

In [ ]:
precision_recall_fscore_support(ground_truths, preds, average='binary', warn_for=None)

Multilabel p-classification with some playlist fully observed


In [ ]:
N, K = Y.shape

In [ ]:
Y_nan_part = Y.copy().astype(np.float)
np.random.seed(8967321)
nan_num = int(0.4 * N)
indices = np.arange(N)[-nan_num:]
ones = 0
for k in range(int(K/2), K):
    Y_nan_part[indices, k] = np.nan
    ones += Y[indices, k].sum()

In [ ]:
#np.sum(Y, axis=0)

In [ ]:
#np.nansum(Y_nan_part, axis=0)

In [ ]:
#np.sum(Y, axis=0) - np.nansum(Y_nan_part, axis=0)

In [ ]:
#np.sum(np.isnan(Y_nan_part),axis=0)

In [ ]:
#ones  # number of positive entries selected to be masked as NaN

In [ ]:
#Y.shape

In [ ]:
#Y_nan.shape

The number of NaN entries.


In [ ]:
np.sum(np.isnan(Y_nan_part))

In [ ]:
#np.sum(Y)

Train: keep running util no overflow warning occurred.


In [ ]:
pc3 = PClassificationMLC(weighting=True, verticalWeighting=True)
pc3.fit(X, Y_nan_part)

Prediction: use the minimum of positive entry score of the same example as threshold.
Evaluation: use F1 on all unknown entries (as a 1D array).


In [ ]:
Y_pred3 = pc3.predict(X)

In [ ]:
pos_index = np.nan_to_num(Y_nan_part).astype(np.bool)
nan_index = np.isnan(Y_nan_part)

In [ ]:
ground_truths = Y[nan_index]

In [ ]:
thresholds = []
preds = []
for k in range(int(K/2), K):
    val = Y_pred3[:, k][pos_index[:, k]]
    th = np.min(val)
    #th = np.mean(val)
    thresholds.append(th)
    preds += (Y_pred3[nan_index[:,k], k] >= th).tolist()

In [ ]:
#np.sum(ground_truths)

In [ ]:
#np.sum(preds)

In [ ]:
f1_score_nowarn(ground_truths, preds, average='binary')

In [ ]:
len(preds)

In [ ]:
precision_recall_fscore_support(ground_truths, preds, average='binary', warn_for=None)

Multilabel p-classification with (some playlist fully observed) and (unknowns in test set)


In [ ]:
N, K = Y.shape

In [ ]:
Y_nan_part2 = Y.copy().astype(np.float)
np.random.seed(8967321)
rand_num = int(0.4 * N)
ones = 0
for k in range(int(K/2), K):
    randix = np.random.permutation(np.arange(N))[:rand_num]
    Y_nan_part2[randix, k] = np.nan
    ones += Y[randix, k].sum()

In [ ]:
#np.sum(Y, axis=0)

In [ ]:
#np.nansum(Y_nan_part, axis=0)

In [ ]:
#np.sum(Y, axis=0) - np.nansum(Y_nan_part, axis=0)

In [ ]:
#ones  # number of positive entries selected to be masked as NaN

In [ ]:
#Y.shape

In [ ]:
#Y_nan.shape

The number of NaN entries.


In [ ]:
np.sum(np.isnan(Y_nan_part2))

In [ ]:
#np.sum(Y)

In [ ]:
pc4 = PClassificationMLC(weighting=True, verticalWeighting=True)
pc4.fit(X, Y_nan_part2)

Prediction: use the minimum of positive entry score of the same example as threshold.
Evaluation: use F1 on all unknown entries (as a 1D array).


In [ ]:
Y_pred4 = pc4.predict(X)

In [ ]:
pos_index = np.nan_to_num(Y_nan_part2).astype(np.bool)
nan_index = np.isnan(Y_nan_part2)

In [ ]:
ground_truths = Y[nan_index]

In [ ]:
thresholds = []
preds = []
for k in range(int(K/2), K):
    val = Y_pred4[:, k][pos_index[:, k]]
    th = np.min(val)
    #th = np.mean(val)
    thresholds.append(th)
    preds += (Y_pred4[nan_index[:,k], k] >= th).tolist()

In [ ]:
#np.sum(ground_truths)

In [ ]:
#np.sum(preds)

In [ ]:
f1_score_nowarn(ground_truths, preds, average='binary')

In [ ]:
precision_recall_fscore_support(ground_truths, preds, average='binary', warn_for=None)

In [ ]:
np.sum(ground_truths)

In [ ]:
np.sum(preds)

In [ ]:
np.sum(np.logical_and(ground_truths, preds))

In [ ]: