Multi-label classification -- p-classification loss



In [1]:

    
%matplotlib inline
%load_ext line_profiler
%load_ext autoreload
%autoreload 2

import os, sys, time
import pickle as pkl
import numpy as np
import pandas as pd

from scipy.optimize import minimize
from scipy.optimize import check_grad
from scipy.special import expit as sigmoid

from sklearn.base import BaseEstimator
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, make_scorer, label_ranking_loss

import matplotlib.pyplot as plt
import seaborn as sns
from joblib import Parallel, delayed



In [14]:

    
sys.path.append('src')
from evaluate import avgPrecisionK, evaluatePrecision, evaluateF1, evaluateRankingLoss, f1_score_nowarn, calcLoss
from datasets import create_dataset, dataset_names, nLabels_dict
from PC import MLC_pclassification, obj_pclassification



In [3]:

    
data_ix = 2



In [4]:

    
dataset_name = dataset_names[data_ix]
nLabels = nLabels_dict[dataset_name]
print(dataset_name, nLabels)









    



bibtex 159



In [5]:

    
data_dir = 'data'
SEED = 918273645
fmodel_base = os.path.join(data_dir, 'pc-' + dataset_name + '-base.pkl')
fmodel_prec = os.path.join(data_dir, 'pc-' + dataset_name + '-prec.pkl')
fmodel_f1 = os.path.join(data_dir, 'pc-' + dataset_name + '-f1.pkl')

Load data.



In [6]:

    
X_train, Y_train = create_dataset(dataset_name, train_data=True, shuffle=True, random_state=SEED)
X_test,  Y_test  = create_dataset(dataset_name, train_data=False)

Feature normalisation.



In [7]:

    
X_train_mean = np.mean(X_train, axis=0).reshape((1, -1))
X_train_std = np.std(X_train, axis=0).reshape((1, -1)) + 10 ** (-6)
X_train -= X_train_mean
X_train /= X_train_std
X_test  -= X_train_mean
X_test  /= X_train_std



In [8]:

    
def print_dataset_info(X_train, Y_train, X_test, Y_test):
    N_train, D = X_train.shape
    K = Y_train.shape[1]
    N_test = X_test.shape[0]
    print('%-45s %s' % ('Number of training examples:', '{:,}'.format(N_train)))
    print('%-45s %s' % ('Number of test examples:', '{:,}'.format(N_test)))
    print('%-45s %s' % ('Number of features:', '{:,}'.format(D)))
    print('%-45s %s' % ('Number of labels:', '{:,}'.format(K)))
    avgK_train = np.mean(np.sum(Y_train, axis=1))
    avgK_test  = np.mean(np.sum(Y_test, axis=1))
    print('%-45s %.3f (%.2f%%)' % ('Average number of positive labels (train):', avgK_train, 100*avgK_train / K))
    print('%-45s %.3f (%.2f%%)' % ('Average number of positive labels (test):', avgK_test, 100*avgK_test / K))
    #print('%-45s %.4f%%' % ('Average label occurrence (train):', np.mean(np.sum(Y_train, axis=0)) / N_train))
    #print('%-45s %.4f%%' % ('Average label occurrence (test):', np.mean(np.sum(Y_test, axis=0)) / N_test))
    print('%-45s %.3f%%' % ('Sparsity (percent) (train):', 100 * np.sum(Y_train) / np.prod(Y_train.shape)))
    print('%-45s %.3f%%' % ('Sparsity (percent) (test):', 100 * np.sum(Y_test) / np.prod(Y_test.shape)))



In [9]:

    
print('%-45s %s' % ('Dataset:', dataset_name))
print_dataset_info(X_train, Y_train, X_test, Y_test)









    



Dataset:                                      bibtex
Number of training examples:                  4,880
Number of test examples:                      2,515
Number of features:                           1,836
Number of labels:                             159
Average number of positive labels (train):    2.380 (1.50%)
Average number of positive labels (test):     2.444 (1.54%)
Sparsity (percent) (train):                   1.497%
Sparsity (percent) (test):                    1.537%

p-classification loss

Multi-label learning with p-classification loss.



In [10]:

    
def obj_pc_latent(mu, Phi, Y, C, p, weighting=True):
    """
        Objective with L2 regularisation and p-classification loss
        
        Input:
            - mu: current latent features, flattened K x D
            - Phi: label embeddings, K x D
            - Y: label matrix, N x K
            - C: regularisation constant, C = 1 / \lambda
            - p: constant for p-classification loss
    """
    K, D = Phi.shape
    N = Y.shape[0]
    assert(mu.shape[0] == N * D)
    assert(p >= 1)
    assert(C > 0)
    
    Mu = mu.reshape(N, D)
    
    if weighting is True:
        KPosAll = np.sum(Y, axis=1)  # number of positive labels for each example, N by 1
        KNegAll = K - KPosAll        # number of negative labels for each example, N by 1
    else:
        KPosAll = np.ones(N)
        KNegAll = np.ones(N)
        
    T1 = np.dot(Mu, Phi.T)  # N by K
    OneN = np.ones(N)
    OneK = np.ones(K)
    P_diag = np.divide(1, KPosAll)  # N by 1
    Q_diag = np.divide(1, KNegAll)  # N by 1
    
    T1p = np.multiply(Y, T1)
    T2 = np.multiply(Y, np.exp(-T1p))
    T3 = T2 * P_diag[:, None]  # N by K
    
    T1n = np.multiply(1-Y, T1)
    T4 = np.multiply(1-Y, np.exp(p * T1n))
    T5 = T4 * Q_diag[:, None]  # N by K
    
    J = np.dot(mu, mu) * 0.5 / C + np.dot(OneN, np.dot(T3 + T5/p, OneK)) / N
    G = Mu / C + np.dot(T5 - T3, Phi) / N
    
    return (J, G.ravel())



In [11]:

    
def obj_pc_latent_loop(mu, Phi, Y, C, p, weighting=True):
    """
        Objective with L2 regularisation and p-classification loss
        
        Input:
            - mu: current latent features, flattened K x D
            - Phi: label embeddings, K x D
            - Y: label matrix, N x K
            - C: regularisation constant, C = 1 / \lambda
            - p: constant for p-classification loss
    """
    K, D = Phi.shape
    N = Y.shape[0]
    assert(mu.shape[0] == N * D)
    assert(p >= 1)
    assert(C > 0)
    
    Mu = mu.reshape(N, D)
    
    J = 0.0  # cost
    G = np.zeros_like(Mu)  # gradient matrix
    if weighting is True:
        KPosAll = np.sum(Y, axis=1)  # number of positive labels for each example, N by 1
        KNegAll = K - KPosAll        # number of negative labels for each example, N by 1
    else:
        KPosAll = np.ones(N)
        KNegAll = np.ones(N)
        
    for n in range(N):
        for k in range(K):
            if Y[n, k] == 1:
                t1 = np.exp(-np.dot(Phi[k, :], Mu[n, :])) / KPosAll[n]
                J += t1
                G[n, :] = G[n, :] - Phi[k, :] * t1
            else:
                t2 = np.exp(p * np.dot(Phi[k, :], Mu[n, :])) / (p * KNegAll[n])
                J += t2
                G[n, :] = G[n, :] + Phi[k, :] * p * t2
                
    J = np.dot(mu, mu) * 0.5 / C + J / N
    G = Mu / C + G / N
    
    return (J, G.ravel())

Check gradient



In [12]:

    
def avgF1(Y_true, Y_pred):
    #THs = [0, 0.05, 0.10, 0.15, 0.2, 0.25, 0.30, 0.35, 0.4, 0.45, 0.5, 0.55, 0.60, 0.65, 0.70, 0.75]  # SPEN THs
    THs = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    F1 = Parallel(n_jobs=-1)(delayed(f1_score_nowarn)(Y_true, Y_pred >= th, average='samples') for th in THs)
    bestix = np.argmax(F1)
    print('\nbest threshold: %g, best F1: %g, #examples: %g' % (THs[bestix], F1[bestix], Y_true.shape[0]))
    return F1[bestix]



In [15]:

    
clf = pkl.load(open(fmodel_f1, 'rb'))



In [19]:

    
Phi = clf.best_estimator_.W



In [ ]:

    
mu0 = 0.001 * np.random.randn(Y_train.shape[0] * Phi.shape[1])
check_grad(lambda mu: obj_pc_latent(mu, Phi, Y_train, C=1, p=2)[0], 
           lambda mu: obj_pc_latent(mu, Phi, Y_train, C=1, p=2)[1], mu0)



In [21]:

    
def cmp_loop_vec(func_loop, func_vec, Phi, Y, p=2):
    print('%15s %15s %15s %15s %15s' % ('C','J_Diff', 'J_loop', 'J_vec', 'G_Diff'))
    mu0 = 0.001 * np.random.randn(Y.shape[0] * Phi.shape[1])
    for e in range(-6, 10):
        C = 10**(e)
        J,  G  = func_loop(mu0, Phi, Y, C, p=p)
        J1, G1 = func_vec( mu0, Phi, Y, C, p=p)
        Gdiff = G1 - G
        print('%15g %15g %15g %15g %15g' % (C, J1 - J, J, J1, np.dot(Gdiff, Gdiff)))



In [22]:

    
cmp_loop_vec(obj_pc_latent_loop, obj_pc_latent, Phi, Y_train, p=2)









    



              C          J_Diff          J_loop           J_vec          G_Diff
          1e-06               0     4.47725e+06     4.47725e+06               0
          1e-05               0          447726          447726     1.26218e-29
         0.0001               0         44773.9         44773.9     1.92593e-34
          0.001               0         4478.74         4478.74     7.30658e-32
           0.01               0         449.224         449.224     4.72832e-33
            0.1     1.42109e-14         46.2724         46.2724     6.14838e-34
              1     1.15463e-14         5.97724         5.97724      5.4617e-35
             10     1.19904e-14         1.94772         1.94772     5.67198e-36
            100     1.19904e-14         1.54477         1.54477     5.97112e-37
           1000     1.19904e-14         1.50448         1.50448     1.45938e-37
          10000     1.19904e-14         1.50045         1.50045     1.16612e-37
         100000     1.19904e-14         1.50004         1.50004     1.13834e-37
          1e+06     1.19904e-14             1.5             1.5     1.13478e-37
          1e+07     1.19904e-14             1.5             1.5     1.13446e-37
          1e+08     1.19904e-14             1.5             1.5     1.13443e-37
          1e+09     1.19904e-14             1.5             1.5     1.13443e-37



In [ ]:

    
class PC_latent(BaseEstimator):
    """All methods are necessary for a scikit-learn estimator"""
    
    def __init__(self, C=1, p=1, weighting=True):
        """Initialisation"""
        
        assert C >  0
        assert p >= 1
        self.C = C
        self.p = p
        self.weighting = weighting
        self.obj_func = obj_pc_latent
        self.trained = False
        
    def fit(self, Phi_train, Y_train):
        """Model fitting by optimising the objective"""
        opt_method = 'L-BFGS-B' #'BFGS' #'Newton-CG'
        options = {'disp': 1, 'maxiter': 10**5, 'maxfun': 10**5} # , 'iprint': 99}
        sys.stdout.write('\nC: %g, p: %g, weighting: %s' % (self.C, self.p, self.weighting))
        sys.stdout.flush()
        
        K, D = Phi_train.shape
        N = Y_train.shape[0]
        mu0 = 0.001 * np.random.randn(K * D)
        opt = minimize(self.obj_func, mu0, args=(Phi_train, Y_train, self.C, self.p, self.weighting), \
                       method=opt_method, jac=True, options=options)
        if opt.success is True:
            self.Mu = np.reshape(opt.x, (N, D))
            self.trained = True
        else:
            sys.stderr.write('Optimisation failed')
            print(opt.items())
            self.trained = False
            
            
    def decision_function(self, Phi_test):
        """Make predictions (score is real number)"""
        
        assert self.trained is True, "Can't make prediction before training"
        return np.dot(self.Mu, Phi_test.T)  # log of prediction score
        
    
    def predict(self, Phi_test):
        return self.decision_function(Phi_test)
    #    """Make predictions (score is boolean)"""   
    #    preds = sigmoid(self.decision_function(X_test))
    #    #return (preds >= 0)
    #    assert self.TH is not None
    #    return preds >= self.TH        
        
    # inherit from BaseEstimator instead of re-implement
    #
    #def get_params(self, deep = True):
    #def set_params(self, **params):



In [ ]:

    
def dump_results(predictor, X_train, Y_train, X_test, Y_test, rankingLoss=False):
    """
        Compute and save performance results
    """
    preds_train = predictor.decision_function(X_train)
    preds_test  = predictor.decision_function(X_test)
    
    print('Training set:')
    perf_dict_train = evaluatePrecision(Y_train, preds_train, verbose=1)
    print()
    print('Test set:')
    perf_dict_test = evaluatePrecision(Y_test, preds_test, verbose=1)
    
    if rankingLoss is True:
        print()
        print('Training set:')
        perf_dict_train.update(evaluateRankingLoss(Y_train, preds_train))
        print(label_ranking_loss(Y_train, preds_train))
        print()
        print('Test set:')
        perf_dict_test.update(evaluateRankingLoss(Y_test, preds_test))
        print(label_ranking_loss(Y_test, preds_test))



In [ ]:

    
#C_set = [1e-3, 3e-3, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100, 300]
#C_set = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]
C_set = [0.01, 0.1, 1, 10, 30, 60, 90, 120, 150]
p_set = [1, 2, 3]
parameters = [{'C': C_set, 'p': p_set, 'weighting': [True]}]
#scorer = {'Prec': make_scorer(avgPrecisionK)}
scorer = {'F1': make_scorer(avgF1)}



In [ ]:

    
if not os.path.exists(fmodel_f1):
    clf = GridSearchCV(MLC_pclassification(), parameters, scoring=scorer, cv=5, n_jobs=1, refit='F1')
    clf.fit(X_train, Y_train)
    pkl.dump(clf, open(fmodel_f1, 'wb'))
else:
    clf = pkl.load(open(fmodel_f1, 'rb'))