Multi-label classification -- Mulan dataset


In [ ]:
%matplotlib inline
%load_ext line_profiler
%load_ext autoreload
%autoreload 2

import os, sys, time
import pickle as pkl
import numpy as np
import pandas as pd

from scipy.optimize import check_grad
from scipy.special import expit as sigmoid
from scipy.sparse import issparse, csr_matrix

from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
import seaborn as sns
from joblib import Parallel, delayed

In [ ]:
sys.path.append('src/')
sys.path.append('src/models')
from MLC import MLC, objective, risk_pclassification, DataHelper
from tools import create_dataset, dataset_names, nLabels_dict, f1_score_nowarn

In [ ]:
dataset_names

In [ ]:
data_ix = 3

In [ ]:
dataset_name = dataset_names[data_ix]
nLabels = nLabels_dict[dataset_name]
print(dataset_name, nLabels)

In [ ]:
data_dir = 'data'
SEED = 918273645
fmodel_base = os.path.join(data_dir, 'pc-' + dataset_name + '-base.pkl')
fmodel_prec = os.path.join(data_dir, 'pc-' + dataset_name + '-prec.pkl')
fmodel_f1 = os.path.join(data_dir, 'pc-' + dataset_name + '-f1.pkl')

Load the training and test data.


In [ ]:
X_train, Y_train = create_dataset(dataset_name, train_data=True, shuffle=True, random_state=SEED)
X_test,  Y_test  = create_dataset(dataset_name, train_data=False)

In [ ]:
# store the (typically sparse) label matrix in CSR format
Y_train = csr_matrix(Y_train)

Feature normalisation: standardise each feature using the training-set mean and standard deviation (applied to both training and test data).


In [ ]:
X_train_mean = np.mean(X_train, axis=0).reshape((1, -1))
X_train_std = np.std(X_train, axis=0).reshape((1, -1)) + 10 ** (-6)  # small constant avoids division by zero for constant features
X_train -= X_train_mean
X_train /= X_train_std
X_test  -= X_train_mean
X_test  /= X_train_std
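
As a quick sanity check (not part of the original analysis), the training features should now be approximately centred, with unit variance except for (near-)constant features:


In [ ]:
# Illustrative check: per-feature means should be ~0; standard deviations are ~1 except for
# (near-)constant features, whose variance stays (near) zero after scaling.
print(np.abs(X_train.mean(axis=0)).max())
print(X_train.std(axis=0).min(), X_train.std(axis=0).max())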

In [ ]:
def print_dataset_info(X_train, Y_train, X_test, Y_test):
    N_train, D = X_train.shape
    K = Y_train.shape[1]
    N_test = X_test.shape[0]
    print('%-45s %s' % ('Number of training examples:', '{:,}'.format(N_train)))
    print('%-45s %s' % ('Number of test examples:', '{:,}'.format(N_test)))
    print('%-45s %s' % ('Number of features:', '{:,}'.format(D)))
    print('%-45s %s' % ('Number of labels:', '{:,}'.format(K)))
    avgK_train = np.mean(np.sum(Y_train, axis=1))
    avgK_test  = np.mean(np.sum(Y_test, axis=1))
    print('%-45s %.3f (%.2f%%)' % ('Average number of positive labels (train):', avgK_train, 100*avgK_train / K))
    print('%-45s %.3f (%.2f%%)' % ('Average number of positive labels (test):', avgK_test, 100*avgK_test / K))
    #print('%-45s %.4f%%' % ('Average label occurrence (train):', np.mean(np.sum(Y_train, axis=0)) / N_train))
    #print('%-45s %.4f%%' % ('Average label occurrence (test):', np.mean(np.sum(Y_test, axis=0)) / N_test))
    print('%-45s %.3f%%' % ('Label density (percent) (train):', 100 * np.sum(Y_train) / np.prod(Y_train.shape)))
    print('%-45s %.3f%%' % ('Label density (percent) (test):', 100 * np.sum(Y_test) / np.prod(Y_test.shape)))

In [ ]:
print('%-45s %s' % ('Dataset:', dataset_name))
print_dataset_info(X_train, Y_train, X_test, Y_test)

In [ ]:
def avgF1(Y_true, Y_pred):
    # thresholds
    #THs = [0, 0.05, 0.10, 0.15, 0.2, 0.25, 0.30, 0.35, 0.4, 0.45, 0.5, 0.55, 0.60, 0.65, 0.70, 0.75]  # SPEN
    THs = [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85]
    F1 = Parallel(n_jobs=-1)(delayed(f1_score_nowarn)(Y_true, Y_pred >= th, average='samples') for th in THs)
    bestix = np.argmax(F1)
    print('best threshold: %g, best F1: %g, #examples: %g' % (THs[bestix], F1[bestix], Y_true.shape[0]))
    return F1[bestix]
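
A quick sanity check of avgF1 on random data (purely illustrative; the shapes and seed below are arbitrary):


In [ ]:
# Illustrative only: avgF1 on random labels/scores should run and return a value in [0, 1].
rng = np.random.RandomState(0)
Y_rnd = (rng.rand(50, nLabels) > 0.9).astype(int)
S_rnd = rng.rand(50, nLabels)
avgF1(Y_rnd, S_rnd)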

In [ ]:
C_set = [0.01, 0.1, 1, 10, 100, 1000]  # bibtex, bookmarks level 1
p_set = [1, 2, 3, 4, 5, 6]
parameters = [{'C1': C_set, 'p': p_set}]
scorer = {'F1': make_scorer(avgF1)}
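
To gauge the cost of the search (not part of the original notebook), count the hyper-parameter combinations; each one is fitted five times under 5-fold cross-validation.


In [ ]:
# Number of hyper-parameter combinations in the grid (each is fitted 5 times under 5-fold CV).
from sklearn.model_selection import ParameterGrid
len(ParameterGrid(parameters))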

In [ ]:
clf = GridSearchCV(MLC(), parameters, scoring=scorer, cv=5, n_jobs=1, refit='F1')
clf.fit(X_train, Y_train, verbose=1)
#pkl.dump(clf, open(fmodel_f1, 'wb'))

In [ ]:
clf.cv_results_['mean_test_F1'].reshape(len(C_set), len(p_set))
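
The selected setting can also be read off directly from the standard GridSearchCV attributes:


In [ ]:
# Hyper-parameters and mean CV F1 of the refitted (best) model.
print(clf.best_params_)
print(clf.best_score_)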

In [ ]:
# best hyper-parameters from the CV results above (bibtex)
best_C1 = 100
best_p = 3
best_threshold = 0.5
clf = MLC(C1=best_C1, p=best_p)
clf.fit(X_train, Y_train, verbose=0, njobs=1)
clf.best_threshold = best_threshold
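
As a rough overfitting check (not part of the original analysis), the same sample-averaged F1 can be computed on the training set at the chosen threshold:


In [ ]:
# Illustrative: sample-averaged F1 on the training set at the chosen threshold.
f1_score_nowarn(Y_train, clf.decision_function(X_train) >= clf.best_threshold, average='samples')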

In [ ]:
pkl.dump(clf, open('data/mlc_%s.pkl' % dataset_name, 'wb'))

In [ ]:
clf = pkl.load(open('data/mlc_%s.pkl' % dataset_name, 'rb'))

In [ ]:
f1_score_nowarn(Y_test, clf.decision_function(X_test) >= clf.best_threshold, average='samples')

In [ ]:
f1_score_nowarn(Y_test, clf.decision_function(X_test) >= clf.best_threshold, average='macro')

In [ ]:
sys.path.append('src/tools')
from evaluate import calc_RPrecision, calc_RPrecision_HitRate

In [ ]:
# R-precision per test example
rps, ind = calc_RPrecision(Y_test, clf.decision_function(X_test), axis=0)
print(np.mean(rps), len(ind), Y_test.shape[0])

In [ ]:
def calc_RP(Y_true, Y_pred):
    """Compute R-precision for each column of (Y_true, Y_pred)."""
    assert Y_true.shape == Y_pred.shape
    rps = []
    for j in range(Y_true.shape[1]):
        y_true = Y_true[:, j]
        y_pred = Y_pred[:, j]
        rp, _ = calc_RPrecision_HitRate(y_true, y_pred)
        rps.append(rp)
    return rps

In [ ]:
# per-example R-precision recomputed via calc_RP (consistency check with the cell above)
rps = calc_RP(Y_test.T, clf.decision_function(X_test).T)
print(np.mean(rps), len(rps), Y_test.shape[0])

In [ ]:
# R-precision per label
rps, ind = calc_RPrecision(Y_test, clf.decision_function(X_test), axis=1)
print(np.mean(rps), len(ind), Y_test.shape[1])

In [ ]:
# per-label R-precision recomputed via calc_RP (consistency check with the cell above)
rps = calc_RP(Y_test, clf.decision_function(X_test))
print(np.mean(rps), len(rps), Y_test.shape[1])