Multi-label classification -- instance weighting with logistic loss

%matplotlib inline
%load_ext line_profiler
%load_ext autoreload
%autoreload 2

import os, sys, time
import pickle as pkl
import numpy as np
import pandas as pd

from scipy.optimize import minimize
from scipy.optimize import check_grad

from sklearn.base import BaseEstimator
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt
import seaborn as sns

from evaluate import avgPrecision, avgPrecisionK, printEvaluation
from datasets import create_dataset_yeast_train, create_dataset_yeast_test, yeast_nLabels
from datasets import create_dataset_scene_train, create_dataset_scene_test, scene_nLabels
from datasets import create_dataset_mediamill_subset_train, create_dataset_mediamill_subset_test, mm_nLabels

# Supported benchmark datasets. The four lists below are index-aligned:
# entry i of each list belongs to datasets[i].
datasets = ['yeast', 'scene', 'mediamill']
num_labels = [yeast_nLabels, scene_nLabels, mm_nLabels]
create_dataset_train_funcs = [create_dataset_yeast_train,
                              create_dataset_scene_train,
                              create_dataset_mediamill_subset_train]
create_dataset_test_funcs  = [create_dataset_yeast_test,
                              create_dataset_scene_test,
                              create_dataset_mediamill_subset_test]

# Index of the dataset used in the rest of the notebook (2 -> 'mediamill').
data_ix = 2

dataset_name = datasets[data_ix]
nLabels = num_labels[data_ix]
create_dataset_train = create_dataset_train_funcs[data_ix]
create_dataset_test  = create_dataset_test_funcs [data_ix]

SEED = 123456789

The sigmoid function.

def sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + exp(-x)), element-wise.

    The value is evaluated as exp(-log(1 + exp(-x))) via np.logaddexp, which
    never forms exp(-x) directly.  The naive formula overflows in exp(-x) for
    large negative x (RuntimeWarning, or FloatingPointError when overflow
    errors are enabled); this form returns the same limits (0.0 and 1.0)
    without overflowing.

    - x: scalar or NumPy array of scores
    """
    return np.exp(-np.logaddexp(0.0, -x))

Instance weighting with logistic loss

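Multi-label learning with an instance-weighted logistic loss: for each example, the logistic losses of its positive labels are divided by its number of positive labels, and those of its negative labels by its number of negative labels.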

def obj_instance_weighting(w, X, Y, C):
    """
        Objective with L2 regularisation and instance weighting with logistic loss
            - w: current weight vector, flattened L x D
            - X: feature matrix, N x D
            - Y: label matrix,   N x L (entries equal to 1 are positives, entries equal to 0 are negatives)
            - C: regularisation constant, must be positive

        The value computed is
            J = dot(w, w) / (2 * C) + (1 / N) * sum_n sum_k loss(n, k)
        where loss(n, k) is log(1 + exp(-s)) / (#positive labels of example n) for a
        positive label and log(1 + exp(s)) / (#negative labels of example n) for a
        negative label, with s = dot(X[n], W[k]).

        NOTE(review): the original note says C follows the scikit-learn convention
        C = 1 / (N * lambda); as implemented, 1 / C is the L2 weight applied next to
        the *mean* loss -- confirm which convention is intended.

        Returns a tuple (J, gradient), the gradient being flattened to length L * D
        in the same layout as w.
    """
    N, D = X.shape
    L = Y.shape[1]
    assert(w.shape[0] == L * D)
    assert(C > 0)
    W = w.reshape(L, D)  # reshape weight matrix
    J = 0.0  # cost
    G = np.zeros_like(W)  # gradient matrix
    nPosAll = np.sum(Y, axis=1)  # number of positive labels for each example, N by 1
    nNegAll = L - nPosAll        # number of negative labels for each example, N by 1
    for k in range(L):
        wk = W[k, :]
        Yk = Y[:, k]
        posMask = (Yk == 1)
        negMask = (Yk == 0)
        XPos = X[posMask, :]                     # Nk+ by D
        XNeg = X[negMask, :]                     # Nk- by D
        sPosVec = np.dot(XPos, wk)               # Nk+ by 1
        sNegVec = np.dot(XNeg, wk)               # Nk- by 1
        # An example selected as positive (negative) for label k has at least one
        # positive (negative) label, so these divisors are never zero.
        nPosVec = nPosAll[posMask]               # Nk+ by 1
        nNegVec = nNegAll[negMask]               # Nk- by 1
        # log(1 + exp(z)) computed as logaddexp(0, z): no overflow for large |z|.
        lossPos = np.logaddexp(0.0, -sPosVec) / nPosVec  # Nk+ by 1
        lossNeg = np.logaddexp(0.0,  sNegVec) / nNegVec  # Nk- by 1
        J += np.sum(lossPos) + np.sum(lossNeg)
        # d/ds log(1 + exp(-s)) = -1 / (1 + exp(s))  and
        # d/ds log(1 + exp(s))  =  1 / (1 + exp(-s)),
        # with 1 / (1 + exp(z)) evaluated as exp(-logaddexp(0, z)).
        coefPos = -np.exp(-np.logaddexp(0.0,  sPosVec)) / nPosVec
        coefNeg =  np.exp(-np.logaddexp(0.0, -sNegVec)) / nNegVec
        G[k, :] = np.dot(coefPos, XPos) + np.dot(coefNeg, XNeg)
    J = np.dot(w, w) / (2.0 * C) + J / N
    G = W / C + G / N
    return (J, G.ravel())

Check gradient

# Load the train / test splits of the selected dataset.
X_train, Y_train = create_dataset_train()
X_test,  Y_test  = create_dataset_test()

#%%script false
C = 1  # if C is lambda
#C = 1 / X_train.shape[0]
w0 = np.random.rand(X_train.shape[1] * nLabels)
# Compare the analytic gradient with a finite-difference estimate at a random
# point; the data and C are forwarded to both callables as extra arguments.
check_grad(lambda w, *data: obj_instance_weighting(w, *data)[0],
           lambda w, *data: obj_instance_weighting(w, *data)[1],
           w0, X_train, Y_train, C)

class MLC_instanceweight(BaseEstimator):
    """Multi-label classifier trained by minimising obj_instance_weighting.

    All methods are necessary for a scikit-learn estimator; get_params and
    set_params are inherited from BaseEstimator instead of re-implemented.
    """

    def __init__(self, C=1):
        """Store the regularisation constant C (must be positive)."""
        assert C > 0
        self.C = C
        self.trained = False

    def fit(self, X_train, Y_train):
        """Model fitting by optimising the objective.

        - X_train: feature matrix, N x D
        - Y_train: label matrix,   N x L

        On success the flattened weights are stored in self.w and self.trained
        is set to True; on failure a message is written to stderr and
        self.trained is set to False.  Returns self.
        """
        opt_method = 'BFGS' #'Newton-CG'
        options = {'disp': True}
        if options['disp']:
            print('\nC: %g' % self.C)
        D = X_train.shape[1]
        L = Y_train.shape[1]
        w0 = np.random.rand(L * D)  # initial guess
        opt = minimize(obj_instance_weighting, w0, args=(X_train, Y_train, self.C),
                       method=opt_method, jac=True, options=options)
        # Truthiness instead of `is True`: an identity test would reject a
        # NumPy boolean success flag.
        if opt.success:
            self.w = opt.x
            self.trained = True
        else:
            sys.stderr.write('Optimisation failed')
            self.trained = False
        return self

    def decision_function(self, X_test):
        """Make predictions (score is real number), one column per label."""
        assert self.trained is True, "Can't make prediction before training"
        D = X_test.shape[1]
        # self.w is the flattened L x D weight matrix; the result is N x L.
        return np.dot(X_test, self.w.reshape(-1, D).T)

    def predict(self, X_test):
        """Make predictions (score is boolean): True where the score is positive."""
        preds = self.decision_function(X_test)
        return (preds > 0)

    def score(self, X, Y):
        """Compute scoring metric: avgPrecisionK of the real-valued scores."""
        allPreds = self.decision_function(X)
        return avgPrecisionK(Y, allPreds)

# Candidate regularisation constants: 1e-6, 1e-5, ..., 1.
parameters = [{'C': [10**(e) for e in range(-6,1)]}]

# 5-fold cross-validated grid search over C, scored by the estimator's score().
clf = GridSearchCV(MLC_instanceweight(), parameters, cv=5)
clf.fit(X_train, Y_train)

print("\nBest parameters set found on development set:")
print(clf.best_params_)

# Mean cross-validation score (+/- two standard deviations) for every candidate.
for mean, std, params in zip(clf.cv_results_['mean_test_score'], clf.cv_results_['std_test_score'],
                             clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

# Real-valued label scores from the grid search's decision_function.
preds_train = clf.decision_function(X_train)
preds_test  = clf.decision_function(X_test)

print('Training set:')
printEvaluation(Y_train, preds_train)
print('Test set:')
printEvaluation(Y_test, preds_test)

Result analysis

# avgPrecision evaluated for every k from 1 to nLabels.
precisions_train = [avgPrecision(Y_train, preds_train, k) for k in range(1, nLabels+1)]
precisions_test  = [avgPrecision(Y_test,  preds_test,  k) for k in range(1, nLabels+1)]

precisionK_train = avgPrecisionK(Y_train, preds_train)
precisionK_test  = avgPrecisionK(Y_test,  preds_test)

plt.plot(precisions_train, ls='--', c='r', label='Train')
plt.plot(precisions_test,  ls='-',  c='g', label='Test')
# avgPrecisionK is a single number; draw it as a horizontal reference line.
plt.plot([precisionK_train] * nLabels, ls='-', c='r', label='Train, Precision@K')
plt.plot([precisionK_test]  * nLabels, ls='-', c='g', label='Test, Precision@K')
# Curves are plotted at positions 0..nLabels-1; label the ticks 1..nLabels.
plt.xticks(np.arange(nLabels), np.arange(1,nLabels+1))
plt.title('Instance Weighting w. Logistic Loss on ' + dataset_name + ' dataset')
# The label= arguments above are only displayed once a legend is drawn.
plt.legend()
plt.savefig(dataset_name + '_iw.svg')