In [1]:
%matplotlib inline
%load_ext line_profiler
%load_ext memory_profiler
%load_ext autoreload
%autoreload 2
import os, sys, time
import pickle as pkl
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
import torchfile
In [2]:
sys.path.append('src')
from tools import f1_score_nowarn, calc_F1, calc_precisionK
from tools import create_dataset, dataset_names, nLabels_dict
#from models import BinaryRelevance
sys.path.append('src/models')
from BinaryRelevance import BinaryRelevance
from PC import MLC_pclassification, obj_pclassification, avgF1
from tools import calc_RPrecision_HitRate
In [3]:
dataset_names
Out[3]:
In [4]:
data_ix = 3
In [5]:
dataset_name = dataset_names[data_ix]
nLabels = nLabels_dict[dataset_name]
print(dataset_name, nLabels)
In [6]:
data_dir = 'data'
In [7]:
X_train, Y_train = create_dataset(dataset_name, train_data=True)
X_test, Y_test = create_dataset(dataset_name, train_data=False)
Feature normalisation: standardise each feature with the training-set mean and standard deviation (the test set reuses the training statistics).
In [8]:
X_train_mean = np.mean(X_train, axis=0).reshape((1, -1))
X_train_std = np.std(X_train, axis=0).reshape((1, -1)) + 10 ** (-6)
X_train -= X_train_mean
X_train /= X_train_std
X_test -= X_train_mean
X_test /= X_train_std
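The same standardisation could also be done with scikit-learn's StandardScaler; a minimal sketch, not used below (note the manual version above additionally adds 1e-6 to the standard deviation to guard against constant features).
In [ ]:
# Sketch only: equivalent standardisation with scikit-learn (fit on training data only).
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # learn per-feature mean/std from the training set
X_test_scaled = scaler.transform(X_test)        # apply the same statistics to the test set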
Helpers: save performance results (merged per dataset) and compute label-wise R-precision.
In [9]:
def dump_perf(fname, perf_dict):
    """Merge perf_dict into the per-dataset results stored in the pickle fname."""
    if os.path.exists(fname):
        _dict = pkl.load(open(fname, 'rb'))
        if dataset_name not in _dict:
            _dict[dataset_name] = perf_dict
        else:
            _dict[dataset_name].update(perf_dict)
    else:
        _dict = {dataset_name: perf_dict}
    pkl.dump(_dict, open(fname, 'wb'))
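A minimal usage sketch of dump_perf: repeated calls with the same file merge new splits into the per-dataset dict (the file name below is hypothetical).
In [ ]:
# Hypothetical example only: two calls accumulate results for dataset_name in one pickle.
_demo_fname = os.path.join(data_dir, 'result_mlc/perf-demo.pkl')  # hypothetical path
dump_perf(_demo_fname, {'Train': {'F1_example': 0.5}})
dump_perf(_demo_fname, {'Test': {'F1_example': 0.4}})
print(pkl.load(open(_demo_fname, 'rb')))  # {dataset_name: {'Train': {...}, 'Test': {...}}}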
In [10]:
def calc_RP(Y_true, Y_pred):
    """Compute R-precision for each label (column) from the prediction scores."""
    assert Y_true.shape == Y_pred.shape
    rps = []
    for j in range(Y_true.shape[1]):
        y_true = Y_true[:, j]
        y_pred = Y_pred[:, j]
        rp, _ = calc_RPrecision_HitRate(y_true, y_pred)
        rps.append(rp)
    return rps
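calc_RPrecision_HitRate comes from src/tools; for reference, a sketch of label-wise R-precision under the assumption that it scores the top-R ranked items, where R is the number of positives in the column.
In [ ]:
# Sketch (assumption, not the src/tools implementation): R-precision of scores y_pred w.r.t. binary y_true.
def r_precision_sketch(y_true, y_pred):
    R = int(np.sum(y_true))               # number of relevant items
    if R == 0:
        return np.nan                     # undefined when there are no positives
    top_R = np.argsort(-y_pred)[:R]       # indices of the R highest-scoring items
    return np.sum(y_true[top_R] > 0) / R  # fraction of the top R that are relevant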
In [11]:
preds_dvn = np.load(os.path.join(data_dir, 'result_mlc/%s/preds_test_dvn.npy' % dataset_name))
In [12]:
preds_dvn.shape
Out[12]:
In [13]:
Y_test.shape
Out[13]:
In [14]:
f1mean = f1_score_nowarn(Y_test.astype(bool), preds_dvn >= 0.5, average='samples')
print(f1mean)
In [15]:
F1_example = np.mean(calc_F1(Y_test.astype(bool), preds_dvn >= 0.5))
print(F1_example)
In [16]:
F1_label = f1_score_nowarn(Y_test.astype(bool), preds_dvn >= 0.5, average='macro')
print(F1_label)
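For reference, assuming f1_score_nowarn follows scikit-learn's f1_score semantics: average='samples' computes one F1 per test example and averages over examples, while average='macro' computes one F1 per label and averages over labels. A toy illustration:
In [ ]:
# Toy illustration only (not part of the evaluation): 'samples' vs 'macro' averaging.
Y_toy_true = np.array([[1, 0, 1],
                       [1, 0, 0]])
Y_toy_pred = np.array([[1, 0, 0],
                       [1, 1, 0]])
print(f1_score_nowarn(Y_toy_true, Y_toy_pred, average='samples'))  # mean of per-example F1
print(f1_score_nowarn(Y_toy_true, Y_toy_pred, average='macro'))    # mean of per-label F1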
In [17]:
pak, ind = calc_precisionK(Y_test.astype(bool), preds_dvn)
avgPak = np.mean(pak)
print(avgPak)
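calc_precisionK is also from src/tools; its average is stored below as RP_example, which suggests K is the number of true labels of each example, i.e. example-wise R-precision over the rows of Y. A sketch under that assumption.
In [ ]:
# Sketch (assumption, not the src/tools implementation): precision@K per example,
# with K = number of true labels of that example.
def precision_at_K_sketch(Y_true, Y_pred):
    paks = []
    for i in range(Y_true.shape[0]):
        K = int(np.sum(Y_true[i]))
        if K == 0:
            continue                           # skip examples without positive labels
        topK = np.argsort(-Y_pred[i])[:K]      # K highest-scoring labels for example i
        paks.append(np.sum(Y_true[i, topK] > 0) / K)
    return np.array(paks)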
In [18]:
#auc_dvn = roc_auc_score(Y_test, preds_dvn, average='samples')
#print(auc_dvn)
In [19]:
rps = calc_RP(Y_test, preds_dvn)
avgRP = np.mean(rps)
print(avgRP)
In [20]:
perf_dict_test = {'F1_example': F1_example,
                  'F1_label': F1_label,
                  'RP_example': avgPak,
                  'RP_label': avgRP,
                  }
In [21]:
fname = os.path.join(data_dir, 'result_mlc/perf-dvn.pkl')
dump_perf(fname, {'Test': perf_dict_test})
print(pkl.load(open(fname, 'rb')))
SPEN predictions: evaluate over a grid of thresholds and pick the one with the best example (sample) F1.
In [22]:
preds_spen = torchfile.load(os.path.join(data_dir, 'result_mlc/%s/preds_test_spen.torch' % dataset_name))
In [23]:
preds_spen.shape
Out[23]:
In [24]:
Y_test.shape
Out[24]:
In [25]:
thresholds = [0, 0.05, 0.10, 0.15, 0.2, 0.25, 0.30, 0.35, 0.4, 0.45, 0.5, 0.55, 0.60, 0.65, 0.70, 0.75] # SPEN
In [26]:
F1_all_example = []
F1_all_label = []
for th in thresholds:
    F1_all_example.append(f1_score_nowarn(Y_test, preds_spen >= th, average='samples'))
    F1_all_label.append(f1_score_nowarn(Y_test, preds_spen >= th, average='macro'))
In [27]:
bestix = np.argmax(F1_all_example)
print(F1_all_example[bestix], F1_all_label[bestix], thresholds[bestix])
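The same sweep can be written more compactly; an equivalent formulation for reference (same thresholds, same metrics).
In [ ]:
# Equivalent to the loop above (sketch): collect (example-F1, label-F1) pairs, pick the best by example-F1.
F1_pairs = [(f1_score_nowarn(Y_test, preds_spen >= th, average='samples'),
             f1_score_nowarn(Y_test, preds_spen >= th, average='macro')) for th in thresholds]
best = int(np.argmax([p[0] for p in F1_pairs]))
print(F1_pairs[best], thresholds[best])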
In [28]:
pak, ind = calc_precisionK(Y_test.astype(bool), preds_spen)
avgPak = np.mean(pak)
print(avgPak)
In [29]:
#auc_spen = roc_auc_score(Y_test, preds_spen, average='samples')
#print(auc_spen)
In [30]:
rps = calc_RP(Y_test, preds_spen)
avgRP = np.mean(rps)
print(avgRP)
In [31]:
perf_dict_test = {'F1_example': F1_all_example[bestix],
                  'F1_label': F1_all_label[bestix],
                  'RP_example': avgPak,
                  'RP_label': avgRP,
                  }
In [32]:
fname = os.path.join(data_dir, 'result_mlc/perf-spen.pkl')
dump_perf(fname, {'Test': perf_dict_test})
print(pkl.load(open(fname, 'rb')))
Binary Relevance: independent logistic regression, one binary classifier per label.
In [33]:
fname = os.path.join(data_dir, 'result_mlc/%s/br-%s-base.pkl' % (dataset_name, dataset_name))
br = pkl.load(open(fname, 'rb'))
In [34]:
preds_br = br.decision_function(X_test)
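The pickled BinaryRelevance model comes from src/models; conceptually, binary relevance fits one independent binary classifier per label and stacks their scores. A sketch of that idea with scikit-learn logistic regression (not the pickled model's actual implementation).
In [ ]:
# Conceptual sketch only: binary relevance as one logistic regression per label.
from sklearn.linear_model import LogisticRegression

def fit_binary_relevance_sketch(X, Y, C=1.0):
    # Y is an (n_examples, n_labels) 0/1 indicator matrix; assumes every label occurs at least once.
    return [LogisticRegression(C=C, max_iter=1000).fit(X, Y[:, j]) for j in range(Y.shape[1])]

def br_decision_function_sketch(clfs, X):
    # Stack per-label decision values into an (n_examples, n_labels) score matrix.
    return np.column_stack([clf.decision_function(X) for clf in clfs])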
In [35]:
F1_example = np.mean(f1_score_nowarn(Y_test, preds_br >= 0, average='samples'))
print(F1_example)
In [36]:
F1_label = np.mean(f1_score_nowarn(Y_test, preds_br >= 0, average='macro'))
print(F1_label)
In [37]:
pak, ind = calc_precisionK(Y_test.astype(bool), preds_br)
avgPak = np.mean(pak)
print(avgPak)
In [38]:
#auc_br = roc_auc_score(Y_test, preds_br, average='samples')
#print(auc_br)
In [39]:
rps = calc_RP(Y_test, preds_br)
avgRP = np.mean(rps)
print(avgRP)
In [40]:
perf_dict_test = {'F1_example': F1_example,
                  'F1_label': F1_label,
                  'RP_example': avgPak,
                  'RP_label': avgRP,
                  }
In [41]:
fname = os.path.join(data_dir, 'result_mlc/perf-br.pkl')
dump_perf(fname, {'Test': perf_dict_test})
print(pkl.load(open(fname, 'rb')))
In [42]:
fname = os.path.join(data_dir, 'result_mlc/%s/pc-%s-f1.pkl' % (dataset_name, dataset_name))
pc = pkl.load(open(fname, 'rb'))
In [43]:
preds_pc = pc.decision_function(X_test)
In [44]:
F1_example = np.mean(f1_score_nowarn(Y_test, preds_pc >= pc.best_threshold, average='samples'))
print(F1_example)
In [45]:
F1_label = np.mean(f1_score_nowarn(Y_test, preds_pc >= pc.best_threshold, average='macro'))
print(F1_label)
In [46]:
pak, ind = calc_precisionK(Y_test.astype(bool), preds_pc)
avgPak = np.mean(pak)
print(avgPak)
In [47]:
#auc_pc = roc_auc_score(Y_test, preds_pc, average='samples')
#print(auc_pc)
In [48]:
rps = calc_RP(Y_test, preds_pc)
avgRP = np.mean(rps)
print(avgRP)
In [49]:
perf_dict_test = {'F1_example': F1_example,
                  'F1_label': F1_label,
                  'RP_example': avgPak,
                  'RP_label': avgRP,
                  }
In [50]:
fname = os.path.join(data_dir, 'result_mlc/perf-pc.pkl')
dump_perf(fname, {'Test': perf_dict_test})
print(pkl.load(open(fname, 'rb')))
Results of the PRLR algorithm, taken from the paper "Multi-label Learning with Posterior Regularization" (only the F1 scores are entered below; the R-precision entries are left as NaN).
In [51]:
fname = os.path.join(data_dir, 'result_mlc/perf-prlr.pkl')
In [52]:
perf_dict = {
    'bibtex': {'Test': {'F1_example': 0.442, 'F1_label': 0.372, 'RP_example': np.nan, 'RP_label': np.nan}},
    'bookmarks': {'Test': {'F1_example': 0.349, 'F1_label': 0.230, 'RP_example': np.nan, 'RP_label': np.nan}},
}
In [53]:
pkl.dump(perf_dict, open(fname, 'wb'))
print(pkl.load(open(fname, 'rb')))
In [61]:
algos = [('br', r'BR~\cite{tsoumakas2006multi}'),
         ('prlr', r'PRLR~\cite{lin2014multi}'),
         ('spen', r'SPEN~\cite{belanger2016structured}'),
         ('dvn', r'DVN~\cite{gygli2017deep}'),
         ('pc', 'PC (Ours)')]
dataset = [('bibtex', r'\textbf{bibtex}'), ('bookmarks', r'\textbf{bookmarks}')]
metrics = [('F1_example', r'F$_{1\,\text{example}}$'), ('F1_label', r'F$_{1\,\text{label}}$'),
           ('RP_example', r'R-Precision$_{\,\text{example}}$'),
           ('RP_label', r'R-Precision$_{\,\text{label}}$')]
In [62]:
fperf = [os.path.join(data_dir, 'result_mlc/perf-%s.pkl' % algo) for algo in [t[0] for t in algos]]
perfs = [pkl.load(open(f, 'rb')) for f in fperf]
In [63]:
rows = [t[1] for t in algos]
cols = pd.MultiIndex.from_product([[t[1] for t in dataset], [t[1] for t in metrics]])
In [64]:
df_test = pd.DataFrame(index=rows, columns=cols)
In [65]:
for ix in range(len(perfs)):
    perf = perfs[ix]
    row = rows[ix]
    for jx in range(len(dataset)):
        dat = dataset[jx][0]
        dat_jx = dataset[jx][1]
        for kx in range(len(metrics)):
            metric = metrics[kx][0]
            metric_kx = metrics[kx][1]
            df_test.loc[row, (dat_jx, metric_kx)] = 100 * perf[dat]['Test'][metric]
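An equivalent one-pass construction of the table, for reference (same values, same column order as cols).
In [ ]:
# Equivalent to the loop above (sketch): build the whole table in one comprehension.
df_test_alt = pd.DataFrame(
    [[100 * perf[dat]['Test'][metric] for dat, _ in dataset for metric, _ in metrics]
     for perf in perfs],
    index=rows, columns=cols)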
In [66]:
df_test
Out[66]:
In [79]:
tab_test = df_test.to_latex(float_format=lambda x: '$%.1f$' % x, na_rep='N/A',
                            column_format='l|*{%d}{r}|*{%d}{r}' % (len(metrics), len(metrics)),
                            multicolumn=True, multicolumn_format='c', escape=False)
In [80]:
print('\\begin{table}[!h]')
print('\\centering')
print('\\caption{Performance on multi-label datasets}')
print('\\label{tab:perf_mlc}')
print(tab_test)
print('\\end{table}')