Evaluate Multi-label Classification


In [1]:
%matplotlib inline
%load_ext line_profiler
%load_ext memory_profiler
%load_ext autoreload
%autoreload 2

import os, sys, time
import pickle as pkl
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
import torchfile

In [2]:
sys.path.append('src')
from tools import f1_score_nowarn, calc_F1, calc_precisionK
from tools import create_dataset, dataset_names, nLabels_dict
#from models import BinaryRelevance
sys.path.append('src/models')
from BinaryRelevance import BinaryRelevance
from PC import MLC_pclassification, obj_pclassification, avgF1
from tools import calc_RPrecision_HitRate

In [3]:
dataset_names


Out[3]:
['yeast', 'scene', 'bibtex', 'bookmarks']

In [4]:
data_ix = 3  # index into dataset_names, i.e. 'bookmarks'

In [5]:
dataset_name = dataset_names[data_ix]
nLabels = nLabels_dict[dataset_name]
print(dataset_name, nLabels)


bookmarks 208

In [6]:
data_dir = 'data'

Load dataset


In [7]:
X_train, Y_train = create_dataset(dataset_name, train_data=True)
X_test,  Y_test  = create_dataset(dataset_name, train_data=False)

Feature normalisation: standardise both splits using the training set mean and standard deviation; the 10^(-6) term guards against division by zero for constant features.


In [8]:
X_train_mean = np.mean(X_train, axis=0).reshape((1, -1))
X_train_std = np.std(X_train, axis=0).reshape((1, -1)) + 10 ** (-6)
X_train -= X_train_mean
X_train /= X_train_std
X_test  -= X_train_mean
X_test  /= X_train_std

Helper to save performance data: results for the current dataset are merged into a pickle shared across datasets.


In [9]:
def dump_perf(fname, perf_dict):
    """Merge perf_dict for the current dataset into the pickle file fname."""
    if os.path.exists(fname):
        _dict = pkl.load(open(fname, 'rb'))
        if dataset_name not in _dict:
            _dict[dataset_name] = perf_dict
        else:
            # merge with existing results for this dataset
            _dict[dataset_name].update(perf_dict)
    else:
        _dict = {dataset_name: perf_dict}
    pkl.dump(_dict, open(fname, 'wb'))

In [10]:
def calc_RP(Y_true, Y_pred):
    """Compute R-Precision for each label (i.e. per column)."""
    assert Y_true.shape == Y_pred.shape
    rps = []
    for j in range(Y_true.shape[1]):
        y_true = Y_true[:, j]
        y_pred = Y_pred[:, j]
        rp, _ = calc_RPrecision_HitRate(y_true, y_pred)
        rps.append(rp)
    return rps
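
For reference, R-Precision ranks items by score, keeps the top R (where R is the number of relevant items), and measures the fraction of those that are truly relevant. A minimal sketch of that quantity for a single label column, assuming it is what calc_RPrecision_HitRate returns as its first value:

def rprecision(y_true, y_score):
    # R = number of relevant (positive) examples for this label
    R = int(np.sum(y_true))
    if R == 0:
        return np.nan  # undefined when the label has no positives
    top_R = np.argsort(y_score)[::-1][:R]  # indices of the R highest scores
    # fraction of the top-R examples that are truly relevant
    return np.sum(y_true[top_R]) / R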

Evaluate DVN (Deep Value Networks, Gygli et al. 2017)


In [11]:
preds_dvn = np.load(os.path.join(data_dir, 'result_mlc/%s/preds_test_dvn.npy' % dataset_name))

In [12]:
preds_dvn.shape


Out[12]:
(27856, 208)

In [13]:
Y_test.shape


Out[13]:
(27856, 208)

In [14]:
f1mean = f1_score_nowarn(Y_test.astype(bool), preds_dvn >= 0.5, average='samples')
print(f1mean)


0.37155596860541956

In [15]:
F1_example = np.mean(calc_F1(Y_test.astype(bool), preds_dvn >= 0.5))
print(F1_example)


0.37155596860541956
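
The two values agree: both are the example-based (sample-averaged) F1, i.e. an F1 score computed per test example and then averaged over examples. A minimal NumPy sketch of the metric (illustrative only, not the repository's implementation):

def f1_per_example(Y_true, Y_pred):
    # per row: F1 = 2*TP / (#true labels + #predicted labels)
    tp = np.logical_and(Y_true, Y_pred).sum(axis=1)
    denom = Y_true.sum(axis=1) + Y_pred.sum(axis=1)
    # rows with no true and no predicted labels score 0 (sklearn's convention;
    # f1_score_nowarn presumably just silences the warning about such rows)
    return np.where(denom > 0, 2.0 * tp / np.maximum(denom, 1), 0.0)

np.mean(f1_per_example(Y_test.astype(bool), preds_dvn >= 0.5)) should then reproduce the value above.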

In [16]:
F1_label = f1_score_nowarn(Y_test.astype(bool), preds_dvn >= 0.5, average='macro')
print(F1_label)


0.23674811498350729

In [17]:
pak, ind = calc_precisionK(Y_test.astype(bool), preds_dvn)
avgPak = np.mean(pak)
print(avgPak)


0.42245008358248076
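
This mean precision@K is stored as 'RP_example' below: with K set to each example's number of true labels, precision@K coincides with the example-wise R-Precision. A sketch under that assumption about what calc_precisionK computes (the function name and the skipping rule here are illustrative):

def precision_at_R_per_example(Y_true, scores):
    paks = []
    for i in range(Y_true.shape[0]):
        K = int(Y_true[i].sum())
        if K == 0:
            continue  # examples with no positive labels are skipped
        top_K = np.argsort(scores[i])[::-1][:K]  # K highest-scoring labels
        paks.append(Y_true[i, top_K].sum() / K)
    return np.array(paks)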

In [18]:
#auc_dvn = roc_auc_score(Y_test, preds_dvn, average='samples')
#print(auc_dvn)

In [19]:
rps = calc_RP(Y_test, preds_dvn)
avgRP = np.mean(rps)
print(avgRP)


0.26304615692760236

In [20]:
perf_dict_test = {'F1_example': F1_example,
                  'F1_label':  F1_label,
                  'RP_example': avgPak,
                  'RP_label': avgRP,
                 }

In [21]:
fname = os.path.join(data_dir, 'result_mlc/perf-dvn.pkl')
dump_perf(fname, {'Test': perf_dict_test})
print(pkl.load(open(fname, 'rb')))


{'bibtex': {'Test': {'F1_example': 0.44700475542993195, 'F1_label': 0.32421320747927107, 'RP_example': 0.5032464367387184, 'RP_label': 0.37660642302161584}}, 'bookmarks': {'Test': {'F1_example': 0.37155596860541956, 'F1_label': 0.23674811498350729, 'RP_example': 0.42245008358248076, 'RP_label': 0.26304615692760236}}}

Evaluate SPEN (Structured Prediction Energy Networks, Belanger & McCallum 2016)

Sweep a grid of thresholds over the SPEN predictions and pick the one with the best example-based (sample) F1.


In [22]:
preds_spen = torchfile.load(os.path.join(data_dir, 'result_mlc/%s/preds_test_spen.torch' % dataset_name))

In [23]:
preds_spen.shape


Out[23]:
(27856, 208)

In [24]:
Y_test.shape


Out[24]:
(27856, 208)

In [25]:
thresholds = [0, 0.05, 0.10, 0.15, 0.2, 0.25, 0.30, 0.35, 0.4, 0.45, 0.5, 0.55, 0.60, 0.65, 0.70, 0.75]  # SPEN

In [26]:
F1_all_example = []
F1_all_label  = []
for th in thresholds:
    F1_all_example.append(f1_score_nowarn(Y_test, preds_spen >= th, average='samples'))
    F1_all_label.append(f1_score_nowarn(Y_test, preds_spen >= th, average='macro'))

In [27]:
bestix = np.argmax(F1_all_example)
print(F1_all_example[bestix], F1_all_label[bestix], thresholds[bestix])


0.3553938453788133 0.24094370084927494 0.1

In [28]:
pak, ind = calc_precisionK(Y_test.astype(bool), preds_spen)
avgPak = np.mean(pak)
print(avgPak)


0.3958859200493974

In [29]:
#auc_spen = roc_auc_score(Y_test, preds_spen, average='samples')
#print(auc_spen)

In [30]:
rps = calc_RP(Y_test, preds_spen)
avgRP = np.mean(rps)
print(avgRP)


0.24929105401044138

In [31]:
perf_dict_test = {'F1_example': F1_all_example[bestix],
                  'F1_label': F1_all_label[bestix],
                  'RP_example': avgPak,
                  'RP_label': avgRP,
                 }

In [32]:
fname = os.path.join(data_dir, 'result_mlc/perf-spen.pkl')
dump_perf(fname, {'Test': perf_dict_test})
print(pkl.load(open(fname, 'rb')))


{'bibtex': {'Test': {'F1_example': 0.41324037236538086, 'F1_label': 0.3365843859990467, 'RP_example': 0.45575463430655805, 'RP_label': 0.34392442428479153}}, 'bookmarks': {'Test': {'F1_example': 0.3553938453788133, 'F1_label': 0.24094370084927494, 'RP_example': 0.3958859200493974, 'RP_label': 0.24929105401044138}}}

Evaluate Binary Relevance

Binary relevance trains an independent logistic regression classifier per label; predictions threshold the decision function at 0 (the sign of the margin).


In [33]:
fname = os.path.join(data_dir, 'result_mlc/%s/br-%s-base.pkl' % (dataset_name, dataset_name))
br = pkl.load(open(fname, 'rb'))

In [34]:
preds_br = br.decision_function(X_test)

In [35]:
F1_example = f1_score_nowarn(Y_test, preds_br >= 0, average='samples')  # average='samples' already returns a scalar
print(F1_example)


0.29520247742393496

In [36]:
F1_label = f1_score_nowarn(Y_test, preds_br >= 0, average='macro')
print(F1_label)


0.2100303975414152

In [37]:
pak, ind = calc_precisionK(Y_test.astype(bool), preds_br)
avgPak = np.mean(pak)
print(avgPak)


0.35600940591394675

In [38]:
#auc_br = roc_auc_score(Y_test, preds_br, average='samples')
#print(auc_br)

In [39]:
rps = calc_RP(Y_test, preds_br)
avgRP = np.mean(rps)
print(avgRP)


0.21179771743268688

In [40]:
perf_dict_test = {'F1_example': F1_example,
                  'F1_label': F1_label,
                  'RP_example': avgPak,
                  'RP_label': avgRP,
                 }

In [41]:
fname = os.path.join(data_dir, 'result_mlc/perf-br.pkl')
dump_perf(fname, {'Test': perf_dict_test})
print(pkl.load(open(fname, 'rb')))


{'bibtex': {'Test': {'F1_example': 0.378698648305404, 'F1_label': 0.3009792869576707, 'RP_example': 0.4314404356931547, 'RP_label': 0.32090306990116685}}, 'bookmarks': {'Test': {'F1_example': 0.29520247742393496, 'F1_label': 0.2100303975414152, 'RP_example': 0.35600940591394675, 'RP_label': 0.21179771743268688}}}

Evaluate P-Classification (PC, ours); the saved model carries a tuned decision threshold (pc.best_threshold).


In [42]:
fname = os.path.join(data_dir, 'result_mlc/%s/pc-%s-f1.pkl' % (dataset_name, dataset_name))
pc = pkl.load(open(fname, 'rb'))

In [43]:
preds_pc = pc.decision_function(X_test)

In [44]:
F1_example = f1_score_nowarn(Y_test, preds_pc >= pc.best_threshold, average='samples')
print(F1_example)


0.37660396924850337

In [45]:
F1_label = f1_score_nowarn(Y_test, preds_pc >= pc.best_threshold, average='macro')
print(F1_label)


0.28356730616129366

In [46]:
pak, ind = calc_precisionK(Y_test.astype(bool), preds_pc)
avgPak = np.mean(pak)
print(avgPak)


0.4225622340282832

In [47]:
#auc_pc = roc_auc_score(Y_test, preds_pc, average='samples')
#print(auc_pc)

In [48]:
rps = calc_RP(Y_test, preds_pc)
avgRP = np.mean(rps)
print(avgRP)


0.29531203991602994

In [49]:
perf_dict_test = {'F1_example': F1_example,
                  'F1_label': F1_label,
                  'RP_example': avgPak,
                  'RP_label': avgRP,
                 }

In [50]:
fname = os.path.join(data_dir, 'result_mlc/perf-pc.pkl')
dump_perf(fname, {'Test': perf_dict_test})
print(pkl.load(open(fname, 'rb')))


{'bibtex': {'Test': {'F1_example': 0.47015144430032674, 'F1_label': 0.3877979664536378, 'RP_example': 0.5133180719214516, 'RP_label': 0.4054764234814962}}, 'bookmarks': {'Test': {'F1_example': 0.37660396924850337, 'F1_label': 0.28356730616129366, 'RP_example': 0.4225622340282832, 'RP_label': 0.29531203991602994}}}

Results for PRLR

Results of the PRLR algorithm, as reported in "Multi-label Learning with Posterior Regularization" (Lin et al., 2014). R-Precision was not reported there, hence the NaN entries below.


In [51]:
fname = os.path.join(data_dir, 'result_mlc/perf-prlr.pkl')

In [52]:
perf_dict = {
    'bibtex': {'Test': {'F1_example': 0.442, 'F1_label': 0.372, 'RP_example': np.nan, 'RP_label': np.nan}},
    'bookmarks': {'Test': {'F1_example': 0.349, 'F1_label': 0.230, 'RP_example': np.nan, 'RP_label': np.nan}},
}

In [53]:
pkl.dump(perf_dict, open(fname, 'wb'))
print(pkl.load(open(fname, 'rb')))


{'bibtex': {'Test': {'F1_example': 0.442, 'F1_label': 0.372, 'RP_example': nan, 'RP_label': nan}}, 'bookmarks': {'Test': {'F1_example': 0.349, 'F1_label': 0.23, 'RP_example': nan, 'RP_label': nan}}}

Generate results table


In [61]:
# raw strings: sequences like \t and \b must stay literal LaTeX, not escapes
algos = [('br', r'BR~\cite{tsoumakas2006multi}'),
         ('prlr', r'PRLR~\cite{lin2014multi}'),
         ('spen', r'SPEN~\cite{belanger2016structured}'),
         ('dvn', r'DVN~\cite{gygli2017deep}'),
         ('pc', 'PC (Ours)')]
dataset = [('bibtex', r'\textbf{bibtex}'), ('bookmarks', r'\textbf{bookmarks}')]
metrics = [('F1_example', r'F$_{1\,\text{example}}$'), ('F1_label', r'F$_{1\,\text{label}}$'),
           ('RP_example', r'R-Precision$_{\,\text{example}}$'),
           ('RP_label', r'R-Precision$_{\,\text{label}}$')]

In [62]:
fperf = [os.path.join(data_dir, 'result_mlc/perf-%s.pkl' % algo) for algo in [t[0] for t in algos]]
perfs = [pkl.load(open(f, 'rb')) for f in fperf]

In [63]:
rows = [t[1] for t in algos]
cols = pd.MultiIndex.from_product([[t[1] for t in dataset], [t[1] for t in metrics]])

In [64]:
df_test = pd.DataFrame(index=rows, columns=cols)

In [65]:
for perf, row in zip(perfs, rows):
    for dat, dat_label in dataset:
        for metric, metric_label in metrics:
            # store scores as percentages
            df_test.loc[row, (dat_label, metric_label)] = 100 * perf[dat]['Test'][metric]

In [66]:
df_test


Out[66]:
                                      \textbf{bibtex}                                 |  \textbf{bookmarks}
                                      F1_example  F1_label    RP_example  RP_label    |  F1_example  F1_label    RP_example  RP_label
BR~\cite{tsoumakas2006multi}          37.8699     30.0979     43.144      32.0903     |  29.5202     21.003      35.6009     21.1798
PRLR~\cite{lin2014multi}              44.2        37.2        NaN         NaN         |  34.9        23          NaN         NaN
SPEN~\cite{belanger2016structured}    41.324      33.6584     45.5755     34.3924     |  35.5394     24.0944     39.5886     24.9291
DVN~\cite{gygli2017deep}              44.7005     32.4213     50.3246     37.6606     |  37.1556     23.6748     42.245      26.3046
PC (Ours)                             47.0151     38.7798     51.3318     40.5476     |  37.6604     28.3567     42.2562     29.5312

In [79]:
tab_test = df_test.to_latex(float_format=lambda x: '$%.1f$' % x, na_rep='N/A',
                            column_format='l|*{%d}{r}|*{%d}{r}' % (len(metrics), len(metrics)),
                            multicolumn=True, multicolumn_format='c',
                            escape=False)  # escape=False: headers already contain LaTeX markup

In [80]:
print('\\begin{table}[!h]')
print('\\centering')
print('\\caption{Performance on multi-label dataset}')
print('\\label{tab:perf_mlc}')
print(tab_test)
print('\\end{table}')


\begin{table}[!h]
\centering
\caption{Performance on multi-label dataset}
\label{tab:perf_mlc}
\begin{tabular}{l|*{4}{r}|*{4}{r}}
\toprule
{} & \multicolumn{4}{c}{\textbf{bibtex}} & \multicolumn{4}{c}{\textbf{bookmarks}} \\
{} & F$_{1\,\text{example}}$ & F$_{1\,\text{label}}$ & R-Precision$_{\,\text{example}}$ & R-Precision$_{\,\text{label}}$ & F$_{1\,\text{example}}$ & F$_{1\,\text{label}}$ & R-Precision$_{\,\text{example}}$ & R-Precision$_{\,\text{label}}$ \\
\midrule
BR~\cite{tsoumakas2006multi}       &                  $37.9$ &                $30.1$ &                           $43.1$ &                         $32.1$ &                  $29.5$ &                $21.0$ &                           $35.6$ &                         $21.2$ \\
PRLR~\cite{lin2014multi}           &                  $44.2$ &                $37.2$ &                              N/A &                            N/A &                  $34.9$ &                $23.0$ &                              N/A &                            N/A \\
SPEN~\cite{belanger2016structured} &                  $41.3$ &                $33.7$ &                           $45.6$ &                         $34.4$ &                  $35.5$ &                $24.1$ &                           $39.6$ &                         $24.9$ \\
DVN~\cite{gygli2017deep}           &                  $44.7$ &                $32.4$ &                           $50.3$ &                         $37.7$ &                  $37.2$ &                $23.7$ &                           $42.2$ &                         $26.3$ \\
PC (Ours)                          &                  $47.0$ &                $38.8$ &                           $51.3$ &                         $40.5$ &                  $37.7$ &                $28.4$ &                           $42.3$ &                         $29.5$ \\
\bottomrule
\end{tabular}

\end{table}