Playlist augmentation


In [ ]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import os, sys, time, gzip
import pickle as pkl
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, average_precision_score
from scipy.sparse import lil_matrix, issparse

import matplotlib.pyplot as plt
import seaborn as sns

In [ ]:
from models import PCMLC, obj_pclassification

In [ ]:
# Load the pickled, gzip-compressed objects stored under the "setting2" directory.
# NOTE(review): pickle.load can execute arbitrary code; only point this at trusted files.
data_dir = 'data/aotm-2011/setting2'
setting2_files = (
    'X_train.pkl.gz',
    'Y_train.pkl.gz',
    'Y_train_dev.pkl.gz',
    'PU_dev.pkl.gz',
    'PU_test.pkl.gz',
    'cliques_train_dev.pkl.gz',
)
setting2_objs = []
for fname in setting2_files:
    # Use a context manager so each gzip handle is closed as soon as it has been read.
    with gzip.open(os.path.join(data_dir, fname), 'rb') as fd:
        setting2_objs.append(pkl.load(fd))
# Unpacking order mirrors the order of `setting2_files` above.
X_train, Y_train, Y_train_dev, PU_dev, PU_test, cliques = setting2_objs

In [ ]:
# Print the dimensions of the loaded matrices, one line per split.
# The 'Train' line formats two whole shape tuples; 'Dev' and 'Test' format the
# two entries of a single shape tuple.
shape_rows = [
    ('Train: %15s %15s', (X_train.shape, Y_train.shape)),
    ('Dev  : %15s %15s', PU_dev.shape),
    ('Test : %15s %15s', PU_test.shape),
]
for template, args in shape_rows:
    print(template % args)

PLA


In [ ]:
# Build a PCMLC model and train it with its `fit_minibatch_pla` routine.
# PU_dev is passed as `PUMat` and the loaded cliques as `user_playlist_indices`.
# NOTE(review): meaning of C1/C2/C3/p is defined in `models.PCMLC` — not visible here.
pla_hyperparams = dict(C1=1, C2=1, C3=0.001, p=1, loss_type='label')
clf = PCMLC(**pla_hyperparams)
clf.fit_minibatch_pla(
    X_train,
    Y_train,
    PUMat=PU_dev,
    user_playlist_indices=cliques,
    batch_size=512,
    n_epochs=2,
    verbose=1,
)

In [ ]:
# Draw the sequence stored in `clf.cost` on a wide, short figure.
# NOTE(review): presumably one cost value per training step — confirm in `models.PCMLC`.
fig, ax = plt.subplots(figsize=(20, 3))
ax.plot(clf.cost)

In [ ]:
# Per-column ROC AUC of the PLA model on the dev columns.
#
# The dev columns are the last PU_dev.shape[1] columns of Y_train_dev. For each
# dev column j, rows where PU_dev[:, j] is non-zero are excluded and the AUC is
# computed on the remaining rows only.
# NOTE(review): presumably PU_dev holds the entries revealed during training — confirm.
offset = Y_train_dev.shape[1] - PU_dev.shape[1]
# Slice with the explicit offset: `[:, -n:]` would select *every* column when n == 0.
Y_dev = Y_train_dev[:, offset:]
W = clf.W
b = clf.b
print(clf)
aucs = []
n_dev_cols = Y_dev.shape[1]
for j in range(n_dev_cols):
    # Lightweight progress indicator, rewritten in place every 10 columns.
    if (j+1) % 10 == 0:
        sys.stdout.write('\r%d / %d' % (j+1, n_dev_cols))
        sys.stdout.flush()
    y1 = Y_dev[:, j].toarray().reshape(-1)
    y2 = PU_dev[:, j].toarray().reshape(-1)
    # Rows that are zero in PU_dev for this column form the evaluation set.
    indices = np.where(0 == y2)[0]
    y_true = y1[indices]
    # Sanity check: positives in the evaluation set plus those in PU_dev
    # must account for all positives of the column.
    assert y_true.sum() + PU_dev[:, j].sum() == Y_dev[:, j].sum()
    # ROC AUC is undefined when only one class is present; skip such columns
    # (the final print reports how many columns were actually scored).
    if np.unique(y_true).size < 2:
        continue
    # Row j + offset of W holds the weights for dev column j.
    wj = W[j + offset, :].reshape(-1)
    y_pred = (np.dot(X_train, wj) + b)[indices]
    aucs.append(roc_auc_score(y_true, y_pred))
print('\n%.4f, %d / %d' % (np.mean(aucs), len(aucs), n_dev_cols))

MLR


In [ ]:
# Load the pickled, gzip-compressed objects stored under the "setting1" directory.
# NOTE(review): pickle.load can execute arbitrary code; only point this at trusted files.
data_dir1 = 'data/aotm-2011/setting1'
setting1_files = (
    'X_train.pkl.gz',
    'Y_train.pkl.gz',
    'X_dev.pkl.gz',
    'Y_dev.pkl.gz',
    'cliques_all.pkl.gz',
)
setting1_objs = []
for fname1 in setting1_files:
    # Use a context manager so each gzip handle is closed as soon as it has been read.
    with gzip.open(os.path.join(data_dir1, fname1), 'rb') as fd1:
        setting1_objs.append(pkl.load(fd1))
# Unpacking order mirrors the order of `setting1_files` above.
X_train1, Y_train1, X_dev1, Y_dev1, cliques1 = setting1_objs

In [ ]:
# Print the (features, labels) shape pair for the train and dev splits.
shape_rows1 = [
    ('Train: %15s %15s', (X_train1.shape, Y_train1.shape)),
    ('Dev  : %15s %15s', (X_dev1.shape, Y_dev1.shape)),
]
for template1, shapes1 in shape_rows1:
    print(template1 % shapes1)

In [ ]:
# Build a second PCMLC model (constructor defaults except loss_type) and train
# it with its `fit_minibatch_mlr` routine on the setting1 training data.
# `user_playlist_indices` is explicitly None here, so `cliques1` is not used by this fit.
clf1 = PCMLC(loss_type='label')
clf1.fit_minibatch_mlr(
    X_train1,
    Y_train1,
    user_playlist_indices=None,
    batch_size=512,
    n_epochs=2,
    verbose=1,
)

In [ ]:
# Draw `clf1.cost` on a wide, short figure, leaving out its first 50 entries.
# NOTE(review): presumably the early values are dropped to keep the y-scale readable — confirm.
fig1, ax1 = plt.subplots(figsize=(20, 3))
ax1.plot(clf1.cost[50:])

In [ ]:
# Per-column ROC AUC of the MLR model on the setting1 dev split.
# Columns of Y_dev1 without any positive entry are skipped; the final line
# prints the mean AUC and how many columns were scored out of the total.
W = clf1.W
b = clf1.b
print(clf1)
n_cols1 = Y_dev1.shape[1]
# Number of positives per column, flattened to a 1-D array.
npos = np.asarray(Y_dev1.sum(axis=0)).ravel()
assert len(npos) == n_cols1
aucs1 = []
for j in range(n_cols1):
    n_done = j + 1
    # Progress indicator, rewritten in place every 100 columns.
    if n_done % 100 == 0:
        sys.stdout.write('\r%d / %d' % (n_done, n_cols1))
        sys.stdout.flush()
    if npos[j] >= 1:
        y_true = Y_dev1[:, j].toarray().reshape(-1)
        # Row j of W holds the weights for column j.
        wj = W[j, :].reshape(-1)
        y_pred = np.dot(X_dev1, wj) + b
        aucs1.append(roc_auc_score(y_true, y_pred))
print('\n%.4f, %d / %d' % (np.mean(aucs1), len(aucs1), n_cols1))