In [ ]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os, sys, time, gzip
import pickle as pkl
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, average_precision_score
from scipy.sparse import lil_matrix, issparse
import matplotlib.pyplot as plt
import seaborn as sns
In [ ]:
from models import PCMLC, obj_pclassification
In [ ]:
# Load the pre-computed "setting2" split of the AotM-2011 data.
# Each gzip handle is opened in a `with` block so it is closed as soon as the
# object has been unpickled (the handles were previously left open).
# NOTE(review): pickle can execute arbitrary code while loading -- only point
# data_dir at files you trust.
data_dir = 'data/aotm-2011/setting2'

with gzip.open(os.path.join(data_dir, 'X_train.pkl.gz'), 'rb') as fd:
    X_train = pkl.load(fd)
with gzip.open(os.path.join(data_dir, 'Y_train.pkl.gz'), 'rb') as fd:
    Y_train = pkl.load(fd)
with gzip.open(os.path.join(data_dir, 'Y_train_dev.pkl.gz'), 'rb') as fd:
    Y_train_dev = pkl.load(fd)
with gzip.open(os.path.join(data_dir, 'PU_dev.pkl.gz'), 'rb') as fd:
    PU_dev = pkl.load(fd)
with gzip.open(os.path.join(data_dir, 'PU_test.pkl.gz'), 'rb') as fd:
    PU_test = pkl.load(fd)
with gzip.open(os.path.join(data_dir, 'cliques_train_dev.pkl.gz'), 'rb') as fd:
    cliques = pkl.load(fd)
In [ ]:
# Print one line per split with the matrix shapes.
# For "Train" the two fields are the shapes of X_train and Y_train; for
# "Dev"/"Test" the shape tuple itself supplies the two fields.
for line_fmt, fields in (
    ('Train: %15s %15s', (X_train.shape, Y_train.shape)),
    ('Dev : %15s %15s', PU_dev.shape),
    ('Test : %15s %15s', PU_test.shape),
):
    print(line_fmt % fields)
In [ ]:
# Build the model and fit it with the mini-batch "pla" routine on the
# setting2 training data; PU_dev is handed to the fit as PUMat and the loaded
# cliques as user_playlist_indices.
pla_model_options = dict(C1=1, C2=1, C3=0.001, p=1, loss_type='label')
pla_fit_options = dict(
    PUMat=PU_dev,
    user_playlist_indices=cliques,
    batch_size=512,
    n_epochs=2,
    verbose=1,
)
clf = PCMLC(**pla_model_options)
clf.fit_minibatch_pla(X_train, Y_train, **pla_fit_options)
In [ ]:
# Draw clf.cost on a wide, short figure.
# NOTE(review): clf.cost is presumably the cost trace recorded during fitting
# -- confirm against the PCMLC implementation.
wide_figsize = [20, 3]
plt.figure(figsize=wide_figsize)
plt.plot(clf.cost)
In [ ]:
# Evaluate the fitted model on the dev columns: one ROC-AUC per column of
# Y_dev, computed only over the rows whose PU_dev entry is zero.
# NOTE(review): PU_dev presumably marks the entries that were revealed to the
# model at training time (it is passed to the fit as PUMat) -- confirm.

# The dev columns are the trailing PU_dev.shape[1] columns of Y_train_dev.
# Slice from the explicit start column: the equivalent negative slice
# degenerates to "all columns" (`-0:`) when PU_dev has no columns.
offset = Y_train_dev.shape[1] - PU_dev.shape[1]
Y_dev = Y_train_dev[:, offset:]

W = clf.W
b = clf.b
print(clf)

aucs = []
for j in range(Y_dev.shape[1]):
    # Lightweight in-place progress indicator, refreshed every 10 columns.
    if (j + 1) % 10 == 0:
        sys.stdout.write('\r%d / %d' % (j + 1, Y_dev.shape[1]))
        sys.stdout.flush()

    y1 = Y_dev[:, j].toarray().reshape(-1)
    y2 = PU_dev[:, j].toarray().reshape(-1)

    # Keep only the rows that are zero in PU_dev for this column.
    indices = np.where(0 == y2)[0]
    y_true = y1[indices]

    # Sanity check: the kept positives plus the PU_dev entries must account
    # for every positive of the full dev column.
    assert y_true.sum() + PU_dev[:, j].sum() == Y_dev[:, j].sum()

    # roc_auc_score raises ValueError when y_true contains a single class
    # (e.g. every positive of this column is already in PU_dev), so such
    # columns are skipped; the summary line reports how many were scored.
    if np.unique(y_true).size < 2:
        continue

    # Row (j + offset) of W holds the weights of the j-th dev column.
    wj = W[j + offset, :].reshape(-1)
    y_pred = (np.dot(X_train, wj) + b)[indices]
    aucs.append(roc_auc_score(y_true, y_pred))

# Mean AUC over the scored columns (nan, without a NumPy warning, if none).
mean_auc = np.mean(aucs) if aucs else float('nan')
print('\n%.4f, %d / %d' % (mean_auc, len(aucs), Y_dev.shape[1]))
In [ ]:
# Load the pre-computed "setting1" split of the AotM-2011 data.
# Each gzip handle is opened in a `with` block so it is closed as soon as the
# object has been unpickled (the handles were previously left open).
# NOTE(review): pickle can execute arbitrary code while loading -- only point
# data_dir1 at files you trust.
data_dir1 = 'data/aotm-2011/setting1'

with gzip.open(os.path.join(data_dir1, 'X_train.pkl.gz'), 'rb') as fd:
    X_train1 = pkl.load(fd)
with gzip.open(os.path.join(data_dir1, 'Y_train.pkl.gz'), 'rb') as fd:
    Y_train1 = pkl.load(fd)
with gzip.open(os.path.join(data_dir1, 'X_dev.pkl.gz'), 'rb') as fd:
    X_dev1 = pkl.load(fd)
with gzip.open(os.path.join(data_dir1, 'Y_dev.pkl.gz'), 'rb') as fd:
    Y_dev1 = pkl.load(fd)
with gzip.open(os.path.join(data_dir1, 'cliques_all.pkl.gz'), 'rb') as fd:
    cliques1 = pkl.load(fd)
In [ ]:
# Print the (features, labels) shape pair of the setting1 train and dev splits.
for line_fmt, shape_pair in (
    ('Train: %15s %15s', (X_train1.shape, Y_train1.shape)),
    ('Dev : %15s %15s', (X_dev1.shape, Y_dev1.shape)),
):
    print(line_fmt % shape_pair)
In [ ]:
# Build a model with default hyper-parameters (only loss_type is set) and fit
# it with the mini-batch "mlr" routine on the setting1 training data.
# user_playlist_indices is passed as None here; the loaded cliques1 object is
# not used by this call.
mlr_fit_options = dict(
    user_playlist_indices=None,
    batch_size=512,
    n_epochs=2,
    verbose=1,
)
clf1 = PCMLC(loss_type='label')
clf1.fit_minibatch_mlr(X_train1, Y_train1, **mlr_fit_options)
In [ ]:
# Draw clf1.cost on a wide, short figure, leaving out its first 50 entries.
# NOTE(review): the first entries are presumably dropped so that large initial
# values do not dominate the y-axis -- confirm.
wide_figsize1 = [20, 3]
plt.figure(figsize=wide_figsize1)
plt.plot(clf1.cost[50:])
In [ ]:
# Evaluate the setting1 model on the dev set: one ROC-AUC per column of
# Y_dev1, scoring the rows of X_dev1 with the matching row of W plus b.
W = clf1.W
b = clf1.b
print(clf1)

aucs1 = []

# Per-column sum of Y_dev1, flattened to 1-D; used to cheaply skip columns
# that have no positive entry without materialising them.
npos = np.asarray(Y_dev1.sum(axis=0)).reshape(-1)
assert len(npos) == Y_dev1.shape[1]

for j in range(Y_dev1.shape[1]):
    # Lightweight in-place progress indicator, refreshed every 100 columns.
    if (j + 1) % 100 == 0:
        sys.stdout.write('\r%d / %d' % (j + 1, Y_dev1.shape[1]))
        sys.stdout.flush()

    # No positives in this column: AUC is undefined, skip it.
    if npos[j] < 1:
        continue

    y_true = Y_dev1[:, j].toarray().reshape(-1)

    # roc_auc_score also raises ValueError when the column holds positives
    # only, so require both classes before scoring.
    if np.unique(y_true).size < 2:
        continue

    wj = W[j, :].reshape(-1)
    y_pred = np.dot(X_dev1, wj) + b
    aucs1.append(roc_auc_score(y_true, y_pred))

# Mean AUC over the scored columns (nan, without a NumPy warning, if none).
mean_auc1 = np.mean(aucs1) if aucs1 else float('nan')
print('\n%.4f, %d / %d' % (mean_auc1, len(aucs1), Y_dev1.shape[1]))