In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os, sys
import gzip
import pickle as pkl
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from scipy.sparse import lil_matrix, issparse
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
sys.path.append('src')
from BinaryRelevance import BinaryRelevance
from PClassificationMLC import PClassificationMLC
from evaluate import calc_F1, calc_precisionK, f1_score_nowarn
from PCMLC import PCMLC
In [3]:
data_dir = 'data/aotm-2011'
#faotm = os.path.join(data_dir, 'aotm2011-subset.pkl')
faotm = os.path.join(data_dir, 'aotm2011-user-playlist.pkl')
ffeature = 'data/msd/songID2Features.pkl.gz'
Load playlists.
In [4]:
user_playlists = pkl.load(open(faotm, 'rb'))
In [5]:
print('#user :', len(user_playlists))
print('#playlist:', np.sum([len(user_playlists[u]) for u in user_playlists]))
In [ ]:
pl_lengths = [len(pl) for u in user_playlists for pl in user_playlists[u]]
#plt.hist(pl_lengths, bins=100)
print('Average playlist length: %.1f' % np.mean(pl_lengths))
In [ ]:
users = sorted(user_playlists.keys())
In [ ]:
songs_user = {u: {sid for pl in user_playlists[u] for sid in pl} for u in users} # user: a set of songs
Compute the number of playlists per user, and the number of songs covered by the user's playlists.
In [ ]:
udf = pd.DataFrame(index=users, columns=['#playlist', '#song'])
In [ ]:
udf['#playlist'] = [len(user_playlists[u]) for u in users]
In [ ]:
udf['#song'] = [len(songs_user[u]) for u in users]
In [ ]:
udf_subset = udf[udf['#playlist'] == 50]
udf_subset
In [ ]:
#udf.sort_values(by=['#playlist'], ascending=False).iloc[100:200]
In [ ]:
uid_subset = udf_subset.index[0]
uid_subset
In [ ]:
#udf[uid_subset] # tuple are used as multiindex in pandas
#udf[[uid_subset]]
The user whose playlists cover a proper number of playlists, e.g. 50.
In [ ]:
playlists_subset = user_playlists[uid_subset]
In [ ]:
song_set = sorted(songs_user[uid_subset])
In [ ]:
len(song_set)
In [ ]:
#song_set
Load song_id
--> feature array
mapping: map a song to the audio features of one of its corresponding tracks in MSD.
In [ ]:
song2Features = pkl.load(gzip.open(ffeature, 'rb'))
The set of songs, which is the set of labels in this formulation.
Songs as rows, playlists as columns, split rows.
In [ ]:
def gen_dataset_subset(playlists, song_set, features_MSD):
"""
Create labelled dataset: rows are songs, columns are users.
Input:
- playlists: a set of playlists
- song_set: a set of songIDs
- features_MSD: dictionary that maps songIDs to features from MSD
Output:
- (Feature, Label) pair (X, Y)
X: #songs by #features
Y: #songs by #users
"""
song_indices = {sid: ix for ix, sid in enumerate(song_set)}
N = len(song_set)
K = len(playlists)
X = np.array([features_MSD[sid] for sid in song_set])
Y = np.zeros((N, K), dtype=np.bool)
for k in range(K):
pl = playlists[k]
indices = [song_indices[sid] for sid in pl]
Y[indices, k] = True
return X, Y
In [ ]:
X, Y = gen_dataset_subset(playlists=playlists_subset, song_set=song_set, features_MSD=song2Features)
# data split: approximately 80/20 for training/dev
X_train, X_dev, Y_train, Y_dev = train_test_split(X, Y, test_size=0.2, random_state=8976321)
# feature normalisation
X_train_mean = np.mean(X_train, axis=0).reshape((1, -1))
X_train_std = 10 * np.std(X_train, axis=0).reshape((1, -1)) + 10 ** (-6)
X_train -= X_train_mean
X_train /= X_train_std
X_dev -= X_train_mean
X_dev /= X_train_std
In [ ]:
print('Train: %15s %15s' % (X_train.shape, Y_train.shape))
print('Dev : %15s %15s' % (X_dev.shape, Y_dev.shape))
In [ ]:
print(np.mean(np.mean(X_train, axis=0)))
print(np.mean( np.std(X_train, axis=0)) - 0.1)
print(np.mean(np.mean(X_dev, axis=0)))
print(np.mean( np.std(X_dev, axis=0)) - 0.1)
Independent logistic regression.
In [ ]:
br = BinaryRelevance(n_jobs=4)
br.fit(X_train, Y_train)
In [ ]:
Y_br = br.predict(X_dev)
In [ ]:
np.mean(calc_precisionK(Y_dev.T, Y_br.T))
In [ ]:
f1_score_nowarn(Y_dev.ravel(), (Y_br>=0).ravel(), average='binary')
In [ ]:
precision_recall_fscore_support(Y_dev.ravel(), (Y_br>=0).ravel(), average='binary', warn_for=None)
In [ ]:
#np.mean(calc_F1(Y_train, br.predict(X_train) >= 0))
In [ ]:
%%script false
min_pos_score = []
for col in range(Y_dev.shape[1]):
val = Y_br[:,col][Y_dev[:,col]]
if len(val) > 0:
min_pos_score.append(np.min(val))
else:
min_pos_score.append(np.nan)
print(np.array(min_pos_score))
In [ ]:
%%script false
max_neg_score = []
for col in range(Y_dev.shape[1]):
val = Y_br[:,col][np.logical_not(Y_dev[:,col])]
if len(val) > 0:
max_neg_score.append(np.max(val))
print(np.array(max_neg_score))
In [ ]:
#print(np.array(min_pos_score)-np.array(max_neg_score))
P-Classification ~ P-norm push ranking.
In [ ]:
#pc1 = PClassificationMLC(C=1, weighting=True, verticalWeighting=True)
pc1 = PCMLC(C=1, weighting=True, verticalWeighting=True)
pc1.fit(X_train, Y_train)
In [ ]:
X_test = X_dev
Y_test = Y_dev
In [ ]:
np.sum(Y_dev, axis=0)
In [ ]:
Y_pc = pc1.predict(X_test)
In [ ]:
np.mean(calc_precisionK(Y_test.T, Y_pc.T))
In [ ]:
precision_recall_fscore_support(Y_dev.ravel(), (Y_pc>=0).ravel(), average='binary', warn_for=None)
In [ ]:
#np.mean(calc_F1(Y_dev, Y_pc >= 0))
In [ ]:
#np.mean(calc_F1(Y_train, pc.predict(X_train) >= 0))
In [ ]:
min_pos_score = []
for col in range(Y_test.shape[1]):
val = Y_pc[:,col][Y_test[:,col]]
if len(val) > 0:
min_pos_score.append(np.min(val))
else:
min_pos_score.append(np.nan)
#plt.hist((np.array(min_pos_score)))
#plt.hist((np.nan_to_num(min_pos_score)), bins=30)
#print(np.array(min_pos_score))
#print()
In [ ]:
max_neg_score = []
for col in range(Y_test.shape[1]):
val = Y_pc[:,col][np.logical_not(Y_test[:,col])]
if len(val) > 0:
max_neg_score.append(np.max(val))
#plt.hist(np.array(max_neg_score), bins=30)
#print()
In [ ]:
#plt.hist(np.nan_to_num(min_pos_score)-np.array(max_neg_score), bins=30)
#print()
In [ ]:
N, K = Y.shape
In [ ]:
#type(np.nan)
In [ ]:
Y_nan = Y.copy().astype(np.float)
np.random.seed(8967321)
rand_num = int(0.2 * N)
ones = 0
for k in range(K):
randix = np.random.permutation(np.arange(N))[:rand_num]
Y_nan[randix, k] = np.nan
ones += Y[randix, k].sum()
In [ ]:
#np.sum(Y, axis=0)
In [ ]:
#np.nansum(Y_nan, axis=0)
In [ ]:
#np.sum(Y, axis=0) - np.nansum(Y_nan, axis=0)
In [ ]:
#ones # number of positive entries selected to be masked as NaN
In [ ]:
#Y.shape
In [ ]:
#Y_nan.shape
The number of NaN entries.
In [ ]:
np.sum(np.isnan(Y_nan))
In [ ]:
#np.sum(Y)
Train: keep running util no overflow warning occurred.
In [ ]:
pc2 = PClassificationMLC(weighting=True, verticalWeighting=True)
pc2.fit(X, Y_nan)
Prediction: use the minimum of positive entry score of the same example as threshold.
Evaluation: use F1 on all unknown entries (as a 1D array).
In [ ]:
Y_pred2 = pc2.predict(X)
In [ ]:
pos_index = np.nan_to_num(Y_nan).astype(np.bool)
nan_index = np.isnan(Y_nan)
In [ ]:
ground_truths = Y[nan_index]
In [ ]:
thresholds = []
preds = []
for k in range(K):
val = Y_pred2[:, k][pos_index[:, k]]
th = np.min(val)
thresholds.append(th)
preds += (Y_pred2[nan_index[:,k], k] >= th).tolist()
In [ ]:
f1_score_nowarn(ground_truths, preds, average='binary')
In [ ]:
precision_recall_fscore_support(ground_truths, preds, average='binary', warn_for=None)
In [ ]:
N, K = Y.shape
In [ ]:
Y_nan_part = Y.copy().astype(np.float)
np.random.seed(8967321)
nan_num = int(0.4 * N)
indices = np.arange(N)[-nan_num:]
ones = 0
for k in range(int(K/2), K):
Y_nan_part[indices, k] = np.nan
ones += Y[indices, k].sum()
In [ ]:
#np.sum(Y, axis=0)
In [ ]:
#np.nansum(Y_nan_part, axis=0)
In [ ]:
#np.sum(Y, axis=0) - np.nansum(Y_nan_part, axis=0)
In [ ]:
#np.sum(np.isnan(Y_nan_part),axis=0)
In [ ]:
#ones # number of positive entries selected to be masked as NaN
In [ ]:
#Y.shape
In [ ]:
#Y_nan.shape
The number of NaN entries.
In [ ]:
np.sum(np.isnan(Y_nan_part))
In [ ]:
#np.sum(Y)
Train: keep running util no overflow warning occurred.
In [ ]:
pc3 = PClassificationMLC(weighting=True, verticalWeighting=True)
pc3.fit(X, Y_nan_part)
Prediction: use the minimum of positive entry score of the same example as threshold.
Evaluation: use F1 on all unknown entries (as a 1D array).
In [ ]:
Y_pred3 = pc3.predict(X)
In [ ]:
pos_index = np.nan_to_num(Y_nan_part).astype(np.bool)
nan_index = np.isnan(Y_nan_part)
In [ ]:
ground_truths = Y[nan_index]
In [ ]:
thresholds = []
preds = []
for k in range(int(K/2), K):
val = Y_pred3[:, k][pos_index[:, k]]
th = np.min(val)
#th = np.mean(val)
thresholds.append(th)
preds += (Y_pred3[nan_index[:,k], k] >= th).tolist()
In [ ]:
#np.sum(ground_truths)
In [ ]:
#np.sum(preds)
In [ ]:
f1_score_nowarn(ground_truths, preds, average='binary')
In [ ]:
len(preds)
In [ ]:
precision_recall_fscore_support(ground_truths, preds, average='binary', warn_for=None)
In [ ]:
N, K = Y.shape
In [ ]:
Y_nan_part2 = Y.copy().astype(np.float)
np.random.seed(8967321)
rand_num = int(0.4 * N)
ones = 0
for k in range(int(K/2), K):
randix = np.random.permutation(np.arange(N))[:rand_num]
Y_nan_part2[randix, k] = np.nan
ones += Y[randix, k].sum()
In [ ]:
#np.sum(Y, axis=0)
In [ ]:
#np.nansum(Y_nan_part, axis=0)
In [ ]:
#np.sum(Y, axis=0) - np.nansum(Y_nan_part, axis=0)
In [ ]:
#ones # number of positive entries selected to be masked as NaN
In [ ]:
#Y.shape
In [ ]:
#Y_nan.shape
The number of NaN entries.
In [ ]:
np.sum(np.isnan(Y_nan_part2))
In [ ]:
#np.sum(Y)
In [ ]:
pc4 = PClassificationMLC(weighting=True, verticalWeighting=True)
pc4.fit(X, Y_nan_part2)
Prediction: use the minimum of positive entry score of the same example as threshold.
Evaluation: use F1 on all unknown entries (as a 1D array).
In [ ]:
Y_pred4 = pc4.predict(X)
In [ ]:
pos_index = np.nan_to_num(Y_nan_part2).astype(np.bool)
nan_index = np.isnan(Y_nan_part2)
In [ ]:
ground_truths = Y[nan_index]
In [ ]:
thresholds = []
preds = []
for k in range(int(K/2), K):
val = Y_pred4[:, k][pos_index[:, k]]
th = np.min(val)
#th = np.mean(val)
thresholds.append(th)
preds += (Y_pred4[nan_index[:,k], k] >= th).tolist()
In [ ]:
#np.sum(ground_truths)
In [ ]:
#np.sum(preds)
In [ ]:
f1_score_nowarn(ground_truths, preds, average='binary')
In [ ]:
precision_recall_fscore_support(ground_truths, preds, average='binary', warn_for=None)
In [ ]:
np.sum(ground_truths)
In [ ]:
np.sum(preds)
In [ ]:
np.sum(np.logical_and(ground_truths, preds))
In [ ]: