Multi-label classification -- top-push loss (dual problem)


In [ ]:
%matplotlib inline
%load_ext line_profiler
%load_ext memory_profiler
%load_ext autoreload
%autoreload 2

import os, sys, time
import pickle as pkl
import numpy as np
import pandas as pd

from scipy.sparse import coo_matrix
from scipy.optimize import minimize
from scipy.optimize import check_grad
from scipy.optimize.slsqp import _minimize_slsqp
import nlopt
import ipopt

from sklearn.base import BaseEstimator
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt
import seaborn as sns

In [ ]:
sys.path.append('src')
from evaluate import avgPrecision, avgPrecisionK, printEvaluation
from datasets import create_dataset_yeast_train, create_dataset_yeast_test, yeast_nLabels
from datasets import create_dataset_emotions_train, create_dataset_emotions_test, emotions_nLabels
from datasets import create_dataset_scene_train, create_dataset_scene_test, scene_nLabels
from datasets import create_dataset_mediamill_subset_train, create_dataset_mediamill_subset_test, mm_nLabels

In [ ]:
datasets = ['yeast', 'emotions', 'scene', 'mediamill']
num_labels = [yeast_nLabels, emotions_nLabels, scene_nLabels, mm_nLabels]
create_dataset_train_funcs = [create_dataset_yeast_train, 
                              create_dataset_emotions_train,
                              create_dataset_scene_train, 
                              create_dataset_mediamill_subset_train]
create_dataset_test_funcs  = [create_dataset_yeast_test,
                              create_dataset_emotions_test,
                              create_dataset_scene_test,
                              create_dataset_mediamill_subset_test]

In [ ]:
data_ix = 1

In [ ]:
dataset_name = datasets[data_ix]
nLabels = num_labels[data_ix]
create_dataset_train = create_dataset_train_funcs[data_ix]
create_dataset_test  = create_dataset_test_funcs [data_ix]
print('Dataset:', dataset_name)

The sigmoid function.


In [ ]:
def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))
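
The exponential in this form overflows (with a runtime warning) for large negative inputs. A minimal, hedged sketch of a numerically stable variant, assuming array input; it is not used elsewhere in this notebook:

In [ ]:
def sigmoid_stable(x):
    """Numerically stable sigmoid: evaluate exp only on non-positive arguments."""
    x = np.asarray(x, dtype=float)
    out = np.empty_like(x)
    pos = x >= 0
    out[pos] = 1.0 / (1.0 + np.exp(-x[pos]))
    ex = np.exp(x[~pos])
    out[~pos] = ex / (1.0 + ex)
    return out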

Top-push loss

Multi-label learning with the top-push loss, optimised in the dual.


In [ ]:
def obj_top_push(w, X, Y, C):
    """
        Objective with L2 regularisation and top push loss
        
        Input:
            - w: current dual variable values (theta), flattened N x K
            - X: feature matrix, N x D
            - Y: label matrix,   N x K
            - C: regularisation constant, consistent with scikit-learn's C = 1 / (N * \lambda);
                 if we normalise the objective J by dividing it by C, then \lambda = 1/C
    """
    N, D = X.shape
    K = Y.shape[1]
    assert(w.shape[0] == N * K)
    assert(C > 0)
    
    T = w.reshape(N, K)  # theta
        
    J = 0.0  # cost
    G = np.zeros_like(T)  # gradient matrix
    KPosAll = np.sum(Y, axis=1)  # number of positive labels for each example, N by 1
    KNegAll = K - KPosAll        # number of negative labels for each example, N by 1
    
    for k in range(K):
        y = Y[:, k]
        
        posVec = np.zeros(N, dtype=np.float)
        negVec = np.zeros(N, dtype=np.float)
        posVec[y == 1] = 1
        negVec[y == 0] = 1
        posVec = np.divide(posVec, KPosAll * N)  # 1 / NK+, N by 1
        negVec = np.divide(negVec, KNegAll * N)  # 1 / NK-, N by 1
        
        a = np.multiply(T[:, k], posVec)  # alpha / NK+ if y_nk = 1 else 0
        b = np.multiply(T[:, k], negVec)  # beta  / NK- if y_nk = 0 else 0
        c = a + b  # gamma
        d = np.sum(X * c[:, None], axis=0)
        
        t1 = T[y == 1, k]
        p1 = posVec[y == 1]
        s1 = -np.log(-t1)
        s2 = np.log(1 + t1)
        J += 0.5 * C * np.dot(d, d) + np.dot(a[y == 1], s1) + np.dot(p1, np.multiply(1+t1, s2))
        
        p2 = negVec[y == 0]
        t2 = np.dot(X, d) * C
        G[y == 1, k] = np.multiply(p1, t2[y == 1] + s1 + s2)
        G[y == 0, k] = np.multiply(p2, t2[y == 0])
        
    return (J, G.ravel())
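
Reading the implementation above back into maths (a sketch of what the code computes; $\theta_{nk}$ are the dual variables, $K_n^+$/$K_n^-$ the number of positive/negative labels of example $n$, and $v_{nk} = 1/(N K_n^+)$ if $y_{nk}=1$, $v_{nk} = 1/(N K_n^-)$ otherwise):

$$
J(\theta) = \sum_{k=1}^{K} \left[ \frac{C}{2} \Big\| \sum_{n=1}^{N} v_{nk}\,\theta_{nk}\,\mathbf{x}_n \Big\|_2^2
          + \sum_{n:\,y_{nk}=1} v_{nk} \Big( -\theta_{nk}\log(-\theta_{nk}) + (1+\theta_{nk})\log(1+\theta_{nk}) \Big) \right]
$$

with gradient entries $\partial J / \partial \theta_{nk} = v_{nk}\,\big(C\,\mathbf{x}_n^\top \mathbf{d}_k - \log(-\theta_{nk}) + \log(1+\theta_{nk})\big)$ for $y_{nk}=1$ and $v_{nk}\,C\,\mathbf{x}_n^\top \mathbf{d}_k$ for $y_{nk}=0$, where $\mathbf{d}_k = \sum_n v_{nk}\,\theta_{nk}\,\mathbf{x}_n$. The box constraints ($-1 < \theta_{nk} < 0$ for positive labels, $\theta_{nk} \ge 0$ for negative labels) and the per-example equality constraints $\sum_k v_{nk}\,\theta_{nk} = 0$ are handled further below.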

In [ ]:
def obj_top_push_loop(w, X, Y, C):
    """
        Objective with L2 regularisation and top push loss
        
        Input:
            - w: current dual variable values (theta), flattened N x K
            - X: feature matrix, N x D
            - Y: label matrix,   N x K
            - C: regularisation constant, consistent with scikit-learn's C = 1 / (N * \lambda);
                 if we normalise the objective J by dividing it by C, then \lambda = 1/C
    """
    N, D = X.shape
    K = Y.shape[1]
    assert(w.shape[0] == N * K)
    assert(C > 0)
    
    T = w.reshape(N, K)  # theta
    
    J = 0.0  # cost
    G = np.zeros_like(T)  # gradient matrix
    KPosAll = np.sum(Y, axis=1)  # number of positive labels for each example, N by 1
    KNegAll = K - KPosAll        # number of negative labels for each example, N by 1
    
    for k in range(K):
        y = Y[:, k]
        
        # cost
        X1 = np.zeros_like(X)
        for n in range(N):
            t = T[n, k] / (N * KPosAll[n]) if y[n] == 1 else T[n, k] / (N * KNegAll[n])
            X1[n, :] = X[n, :] * t
        xs = np.sum(X1, axis=0)
        J += np.dot(xs, xs) * 0.5 * C
        #print(J)
        
        for n in range(N):
            if y[n] == 1:
                #print(T[n, k], n, k)  # (debugging output, very verbose)
                assert(T[n, k] < 0)
                assert(T[n, k] + 1 > 0)
                J += (-T[n, k] * np.log(-T[n, k]) + (1 + T[n, k]) * np.log(1 + T[n, k])) / (N * KPosAll[n])
        
        # gradient
        for n in range(N):
            if y[n] == 1:
                ga = C * np.dot(np.sum(X1, axis=0), X[n, :]) - np.log(-T[n, k]) + np.log(1 + T[n, k])
                G[n, k] = G[n, k] + ga / (N * KPosAll[n])
            else:
                gb = C * np.dot(np.sum(X1, axis=0), X[n, :])
                G[n, k] = G[n, k] + gb / (N * KNegAll[n])
        
    return (J, G.ravel())
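
A small self-contained sanity check (a sketch on tiny synthetic data; the sizes and random labels are arbitrary, and the real datasets are exercised by the cells further below): the vectorised and loop implementations should agree, and the analytic gradient should match finite differences.

In [ ]:
rng = np.random.RandomState(0)
Ns, Ds, Ks = 8, 5, 4                                # tiny synthetic problem
Xs = rng.randn(Ns, Ds)
Ys = np.zeros((Ns, Ks), dtype=int)
Ys[np.arange(Ns), rng.randint(Ks, size=Ns)] = 1     # at least one positive label per example
Ys[np.arange(Ns), rng.randint(Ks, size=Ns)] = 1     # possibly a second one (>= 2 negatives remain)
ws = np.multiply(rng.rand(Ns, Ks), -1 * (2 * Ys - 1)).ravel()  # theta in (-1,0) if y=1 else [0,1)
Jv, Gv = obj_top_push(ws, Xs, Ys, 1)
Jl, Gl = obj_top_push_loop(ws, Xs, Ys, 1)
print(abs(Jv - Jl), np.abs(Gv - Gl).max())
print(check_grad(lambda w: obj_top_push(w, Xs, Ys, 1)[0],
                 lambda w: obj_top_push(w, Xs, Ys, 1)[1], ws))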

In [ ]:
def dual2primal(w, X, Y, C):
    """
        Compute primal variable values given dual variable values
        
        Input:
            - w: current dual variable values (theta), flattened N x K
            - X: feature matrix, N x D
            - Y: label matrix,   N x K
            - C: regularisation constant, consistent with scikit-learn's C = 1 / (N * \lambda);
                 if we normalise the objective J by dividing it by C, then \lambda = 1/C
    """
    N, D = X.shape
    K = Y.shape[1]
    assert(w.shape[0] == N * K)
    assert(C > 0)
    
    W = np.zeros((K, D), dtype=np.float)
    T = w.reshape(N, K)  # theta
    
    KPosAll = np.sum(Y, axis=1)  # number of positive labels for each example, N by 1
    KNegAll = K - KPosAll        # number of negative labels for each example, N by 1
    
    for k in range(K):
        y = Y[:, k]
        
        posVec = np.zeros(N, dtype=np.float)
        negVec = np.zeros(N, dtype=np.float)
        posVec[y == 1] = 1
        negVec[y == 0] = 1
        posVec = np.divide(posVec, KPosAll * N)  # 1 / NK+, N by 1
        negVec = np.divide(negVec, KNegAll * N)  # 1 / NK-, N by 1
        
        t = np.multiply(T[:, k], posVec) + np.multiply(T[:, k], negVec)  # gamma
        W[k, :] = -C * np.sum(X * t[:, None], axis=0)
        
    return W

In [ ]:
def dual2primal_loop(w, X, Y, C):
    """
        Compute primal variable values given dual variable values
        
        Input:
            - w: current dual variable values (theta), flattened N x K
            - X: feature matrix, N x D
            - Y: label matrix,   N x K
            - C: regularisation constant, consistent with scikit-learn's C = 1 / (N * \lambda);
                 if we normalise the objective J by dividing it by C, then \lambda = 1/C
    """
    N, D = X.shape
    K = Y.shape[1]
    assert(w.shape[0] == N * K)
    assert(C > 0)
    
    W = np.zeros((K, D), dtype=np.float)
    T = w.reshape(N, K)  # theta
    
    KPosAll = np.sum(Y, axis=1)  # number of positive labels for each example, N by 1
    KNegAll = K - KPosAll        # number of negative labels for each example, N by 1
    
    for k in range(K):
        y = Y[:, k]
        X1 = X.copy()
        for n in range(N):
            t = T[n, k] / (N * KPosAll[n]) if y[n] == 1 else T[n, k] / (N * KNegAll[n])
            X1[n, :] = X1[n, :] * t
        W[k, :] = -C * np.sum(X1, axis=0)
        
    return W
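
A quick hedged check (reusing the synthetic Xs, Ys, ws from the sanity-check cell above) that the vectorised and loop versions of dual2primal agree:

In [ ]:
W1 = dual2primal(ws, Xs, Ys, 1)
W2 = dual2primal_loop(ws, Xs, Ys, 1)
print(np.abs(W1 - W2).max())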

In [ ]:
#aa = np.array([0,1])
#-1 * (2 * aa - 1)

In [ ]:
def init_var(X, Y, seed=None):
    """
        Initialise the dual variables
        
        Input:
            - X: feature matrix, N x D
            - Y: label matrix,   N x K
        
        Output:
            - w0: flattened (N*K) vector of random numbers with
                  w0_{n,k} \in (-1, 0] if Y_{n,k} = 1, 
                  w0_{n,k} \in [0, 1)  if Y_{n,k} = 0.
    """
    N, D = X.shape
    K = Y.shape[1]
    
    I = -1 * (2 * Y - 1)
    if seed is not None: np.random.seed(seed)
    w0 = np.multiply(np.random.rand(N * K).reshape(N, K), I).ravel()
    
    return w0

In [ ]:
def init_var_feasible(X, Y):
    """
        Initialise the dual variables
        
        Input:
            - X: feature matrix, N x D
            - Y: label matrix,   N x K
        
        Output:
            - w0: flattened (N*K) vector that satisfies all constraints, with
                  w0_{n,k} \in (-1, 0) if Y_{n,k} = 1, 
                  w0_{n,k} \in (0, 1)  if Y_{n,k} = 0.
    """
    N, D = X.shape
    K = Y.shape[1]
    
    #I = -1 * (2 * Y - 1)
    #if seed is not None: np.random.seed(seed)
    #w0 = np.multiply(np.random.rand(N * K).reshape(N, K), I).ravel()
    #return w0
    
    # NOTE:
    # simply letting
    #   alpha_{n,k} = -v  (for any 0 < v < 1)
    #   beta_{n,k}  =  v
    # satisfies the equality constraints

    I = -1 * (2 * Y - 1)
    
    return np.ravel(0.1 * I)

In [ ]:
def obj_constraint(w, n, X, Y):
    """
        Compute the constraints for top push loss
        
        Input:
            - w: weights vector, N*K x 1
            - n: the n-th constraint
            - X: feature matrix, N x D
            - Y: label matrix,   N x K
        
        Output:
            - the value of the n-th equality constraint
              (zero when the constraint is satisfied)
    """
    N, D = X.shape
    K = Y.shape[1]
    T = w.reshape(N, K)  # theta
    assert 0 <= n < N
    
    KPos = np.sum(Y[n, :])  # number of positive labels for the n-th example
    KNeg = K - KPos         # number of negative labels for the n-th example
    yn = Y[n, :]
    
    return (np.sum(T[n, yn == 1]) / KPos + np.sum(T[n, yn == 0]) / KNeg) / N
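
In other words (a restatement of the code above), the $n$-th equality constraint is

$$
g_n(\theta) \;=\; \frac{1}{N}\left( \sum_{k:\,y_{nk}=1} \frac{\theta_{nk}}{K_n^+} + \sum_{k:\,y_{nk}=0} \frac{\theta_{nk}}{K_n^-} \right) \;=\; \sum_{k=1}^{K} v_{nk}\,\theta_{nk} \;=\; 0,
$$

which is linear in $\theta$, so its gradient with respect to $\theta_{nk}$ is simply $v_{nk}$ (and zero with respect to every other example's variables); this is what grad_constraint below returns.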

In [ ]:
def grad_constraint(w, n, X, Y):
    """
        Compute the gradient of constraints for top push loss
        
        Input:
            - w: weights vector, N*K x 1
            - n: the n-th constraint
            - X: feature matrix, N x D
            - Y: label matrix,   N x K
        
        Output:
            - the gradient of the n-th constraint w.r.t. theta_{n,:}, a length-K array
    """
    N, D = X.shape
    K = Y.shape[1]
    T = w.reshape(N, K)  # theta
    assert 0 <= n < N
    
    KPos = np.sum(Y[n, :])  # number of positive labels for the n-th example
    KNeg = K - KPos         # number of negative labels for the n-th example
    yn = Y[n, :]
    
    pos = np.zeros(K, dtype=np.float)
    neg = np.zeros(K, dtype=np.float)
    pos[yn == 1] = 1
    neg[yn == 0] = 1
    pos = np.divide(pos, KPos * N)
    neg = np.divide(neg, KNeg * N)
        
    return pos + neg
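
The feasible starting point and the constraint gradients can be checked on the synthetic data from the sanity-check cell above (a hedged sketch; the helper that embeds the length-K gradient into the full N*K vector is introduced here only for check_grad):

In [ ]:
def _full_cons_grad(w, n, X, Y):
    # embed the gradient w.r.t. theta_{n,:} into a full N*K vector for check_grad
    G = np.zeros((X.shape[0], Y.shape[1]))
    G[n, :] = grad_constraint(w, n, X, Y)
    return G.ravel()

wf = init_var_feasible(Xs, Ys)
print(max(abs(obj_constraint(wf, n, Xs, Ys)) for n in range(Xs.shape[0])))
print(max(check_grad(lambda w: obj_constraint(w, n, Xs, Ys),
                     lambda w: _full_cons_grad(w, n, Xs, Ys), wf)
          for n in range(Xs.shape[0])))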

In [ ]:
class toppush(object):
    
    def __init__(self, X, Y, C):
        """
        Initialisation and computing shared data.
            Input:
                - X: feature matrix, N x D
                - Y: label matrix,   N x K
                - C: regularisation constant
        """
        self.N, self.D = X.shape
        self.K = Y.shape[1]
        assert(C > 0)
        self.C = C
        self.X = X
        self.Y = Y

        #print(X.shape, Y.shape)
        KPosAll = np.sum(Y, axis=1)  # number of positive labels for each example, N by 1
        #print(KPosAll.shape)
        KNegAll = self.K - KPosAll        # number of negative labels for each example, N by 1
        self.coefMat = np.ones((self.N, self.K), dtype=np.float)  # v_{n,k} = 1/NK+ if y_{n,k} = 1 else 1/NK-
        
        for k in range(self.K):
            y = Y[:, k]
            posVec = np.divide(y,   KPosAll * self.N)  # 1 / NK+, N by 1
            negVec = np.divide(1-y, KNegAll * self.N)  # 1 / NK-, N by 1
            self.coefMat[:, k] = posVec + negVec


    def objective(self, x):
        #
        # The callback for calculating the objective
        #
        w = x
        assert(w.shape[0] == self.N * self.K)
        T = w.reshape(self.N, self.K)  # theta
        J = 0.0
        Gama = np.multiply(T, self.coefMat)  # gamma
        for k in range(self.K):
            y = self.Y[:, k]
            d = np.sum(self.X * Gama[:, k][:, None], axis=0)
            a = T[y == 1, k]  # alpha
            J += 0.5 * self.C * np.dot(d, d)
            J += np.dot(self.coefMat[y == 1, k], np.multiply(-a, np.log(-a)) + np.multiply(1+a, np.log(1+a)))
        return J
    

    def gradient(self, w):
        #
        # The callback for calculating the gradient
        #
        T = w.reshape(self.N, self.K)   # theta
        Gama = np.multiply(T, self.coefMat)  # gamma
        G = np.zeros_like(T)  # gradient matrix
        for k in range(self.K):
            y = self.Y[:, k]
            a = T[y == 1, k]  # alpha
            d = np.sum(self.X * Gama[:, k][:, None], axis=0)
            t = np.dot(self.X, d) * self.C
            G[y == 1, k] = np.multiply(self.coefMat[y == 1, k], t[y == 1] - np.log(-a) + np.log(1+a))
            G[y == 0, k] = np.multiply(self.coefMat[y == 0, k], t[y == 0])
        return G.ravel()
    

    def constraints(self, w):
        #
        # The callback for calculating the constraints
        #
        T = w.reshape(self.N, self.K)   # theta
        Gama = np.multiply(T, self.coefMat)  # gamma
        return np.sum(Gama, axis=1)
    

    def jacobian(self, w):
        #
        # The callback for calculating the Jacobian of constraints
        #
        jac = np.tile(np.zeros((self.N, self.K)), (self.N,1))
        ix = [n * (self.N + 1) for n in range(self.N)]  # [0, N+1, 2N+2, ..., N^2-1]
        jac[ix, :] = self.coefMat
        return jac.ravel()

    
    def hessian(self, w, lagrange_multipliers, obj_factor):
        #
        # The callback for calculating the Hessian of both objective and constraints
        #
        M1 = np.repeat(self.X, repeats=self.K, axis=0)  # repeat each row K times, NK by D
        M2 = M1 * self.coefMat.flatten()[:, None]       # scale row (n,k) by v_{n,k}
        M3 = self.C * np.dot(M2, M2.T)                  # C * v_{n,k} v_{m,k'} (x_n . x_m), NK by NK
        
        # the quadratic term does not couple different labels, so zero out cross-label entries
        kix = np.tile(np.arange(self.K), self.N)        # label index of each flattened variable
        M3[kix[:, None] != kix[None, :]] = 0
        
        # second derivative of the entropy terms (positive labels only, to avoid division by zero)
        T = w.reshape(self.N, self.K)   # theta
        A1 = np.zeros_like(T)
        pos = (self.Y == 1)
        A1[pos] = np.divide(self.coefMat[pos], np.multiply(T[pos], 1 + T[pos]))
        
        diagix = np.arange(M3.shape[0])
        M3[diagix, diagix] = M3[diagix, diagix] - A1.ravel()  # add -v_{n,k} / (theta (1 + theta)) on the diagonal
        H = obj_factor * M3  # NK by NK
        
        # Linear constraints: Hessians of all constraints are zeros.
        # Return the lower-triangular entries (the default hessianstructure assumed by cyipopt).
        return H[np.tril_indices(M3.shape[0])]

    
    def intermediate(
            self,
            alg_mod,
            iter_count,
            obj_value,
            inf_pr,
            inf_du,
            mu,
            d_norm,
            regularization_size,
            alpha_du,
            alpha_pr,
            ls_trials
        ):
        #
        # Example for the use of the intermediate callback.
        #
        print("Objective value at iteration #%d is: %g" % (iter_count, obj_value))

Check gradient


In [ ]:
X_train, Y_train = create_dataset_train()
X_test,  Y_test  = create_dataset_test()

In [ ]:
X_train.shape

In [ ]:
%%script false
# testing func
def obj_func(w):
    # f = xy + x^2
    x = w[0]
    y = w[1]
    return (x*y + x*x, np.asarray([y + 2*x, x]))

eps = 1.49e-08
w0 = np.random.rand(2)
w = np.zeros_like(w0)
for i in range(len(w0)):
    wi1 = w0.copy()
    wi2 = w0.copy()
    wi1[i] = wi1[i] - eps
    wi2[i] = wi2[i] + eps
    J1, _ = obj_func(wi1)
    J2, _ = obj_func(wi2)
    w[i] = (J2 - J1) / (2 * eps)
J, w1 = obj_func(w0)
wdiff = w1 - w
print(np.dot(wdiff, wdiff))

In [ ]:
%%script false
C = 1
eps = 1.49e-08
w0 = init_var(X_train[:100, :], Y_train[:100, :])
w = np.zeros_like(w0)
for i in range(len(w0)):
    wi1 = w0.copy()
    wi2 = w0.copy()
    wi1[i] = wi1[i] - eps
    wi2[i] = wi2[i] + eps
    J1, _ = obj_top_push_loop(wi1, X_train[:100, :], Y_train[:100, :], C)
    J2, _ = obj_top_push_loop(wi2, X_train[:100, :], Y_train[:100, :], C)
    w[i] = (J2 - J1) / (2 * eps)
    #print(w[i])
J, w1 = obj_top_push_loop(w0, X_train[:100, :], Y_train[:100, :], C)
diff = w1 - w
np.dot(diff, diff)

In [ ]:
%%script false
C = 1
w0 = init_var(X_train[:100, :], Y_train[:100, :])
check_grad(lambda w: obj_top_push_loop(w, X_train[:100, :], Y_train[:100, :], C)[0], 
           lambda w: obj_top_push_loop(w, X_train[:100, :], Y_train[:100, :], C)[1], w0)

In [ ]:
%%script false
N = X_train.shape[0]
K = Y_train.shape[1]
print('%15s %15s %15s %15s' % ('J_Diff', 'J_loop', 'J_vec', 'G_Diff'))
for e in range(-6, 10):
    C = 10**(e)
    w0 = init_var(X_train, Y_train)
    J,  G  = obj_top_push_loop(w0, X_train, Y_train, C)
    J1, G1 = obj_top_push(w0, X_train, Y_train, C)
    Gdiff = G1 - G
    #print('%-15g %-15g %-15g' % (J1 - J, J, J1))
    print('%15g %15g %15g %15g' % (J1 - J, J, J1, np.dot(Gdiff, Gdiff)))

In [ ]:
%%script false
C = 1
w0 = init_var(X_train, Y_train)
check_grad(lambda w: obj_top_push(w, X_train, Y_train, C)[0], 
           lambda w: obj_top_push(w, X_train, Y_train, C)[1], w0)
#%lprun -f obj_top_push check_grad(lambda w: obj_top_push(w, X_train, Y_train, C)[0], \
#                                  lambda w: obj_top_push(w, X_train, Y_train, C)[1], w0)

In [ ]:
#w0 = init_var(X_train[:100, :], Y_train[:100, :])
#check_grad(lambda w: obj_constraint(w, 0, X_train[:100, :], Y_train[:100, :]), 
#           lambda w: grad_constraint(w, 0, X_train[:100, :], Y_train[:100, :]), w0)

In [ ]:
X_train.shape

In [ ]:
Y_train.shape
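
Before handing the problem to Ipopt, the class's objective and gradient callbacks can be compared against the standalone obj_top_push on a small slice of the training data (a hedged sketch; the slice size is arbitrary, and rows without both positive and negative labels are filtered out to avoid division by zero):

In [ ]:
ok_rows = (Y_train.sum(axis=1) > 0) & (Y_train.sum(axis=1) < Y_train.shape[1])
Xc, Yc = X_train[ok_rows][:20], Y_train[ok_rows][:20]
wc = init_var(Xc, Yc, seed=0)
prob_check = toppush(Xc, Yc, 1)
Jc, Gc = obj_top_push(wc, Xc, Yc, 1)
print(abs(prob_check.objective(wc) - Jc), np.abs(prob_check.gradient(wc) - Gc).max())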

In [ ]:
def minimise_cost_ipopt(X, Y, C):
    """
        Minimise the cost using Interior Point method provided by ipopt
        see https://projects.coin-or.org/Ipopt for details.
        
        Input:
            - X: feature matrix, N x D
            - Y: label matrix,   N x K
            - C: regularisation constant
        
        Output:
            - optx: the optimal solution (N*K x 1) if succeed, None otherwise
            - success: True if succeed else False
    """
    N, D = X.shape
    K = Y.shape[1]
    M = N * K  # number of parameters
    
    w0 = init_var_feasible(X, Y)
    eps = 1e-12
    lb_var = np.array([-1 + eps if wi < 0 else 0 for wi in w0],      dtype=np.float64)  # lower bounds
    ub_var = np.array([ 0 - eps if wi < 0 else 2.0e19 for wi in w0], dtype=np.float64)  # upper bounds
    lb_cons = np.array([0 for n in range(N)])
    ub_cons = np.array([0 for n in range(N)])
    
    prob = ipopt.problem(
        n = len(w0), 
        m = N, 
        problem_obj = toppush(X, Y, C), 
        lb = lb_var, 
        ub = ub_var,
        cl = lb_cons,
        cu = ub_cons
    )
    #prob.addOption('mu_strategy', 'adaptive')
    #prob.addOption('tol', 1e-7)
    #prob.addOption('max_iter', 1e4)
    xopt, info = prob.solve(w0)
    
    print(info.items())
    return (xopt, info['status'] == 0)  # Ipopt status 0 means 'Solve Succeeded'

In [ ]:
nlopt.algorithm_name(nlopt.LD_AUGLAG)

In [ ]:
def minimise_cost_nlopt(X, Y, C):
    """
        Minimise the cost using Augmented Lagrangian algorithm and L-BFGS provided by NLopt,
        see https://nlopt.readthedocs.io/en/latest/NLopt_Algorithms/#augmented-lagrangian-algorithm for details.
        
        Input:
            - X: feature matrix, N x D
            - Y: label matrix,   N x K
            - C: regularisation constant
        
        Output:
            - optx: the optimal solution (N*K x 1) if succeed, None otherwise
            - success: True if succeed else False
    """
    N, D = X.shape
    K = Y.shape[1]
    M = N * K  # number of parameters
    
    obj_func = obj_top_push_loop
    #obj_func = obj_top_push
    
    # objective
    def _func(w, grad):
        obj, G = obj_func(w, X, Y, C)
        if grad.size > 0:
            grad[:] = G  # inplace write, eqv to np.copyto(grad, G)
        return obj
    
    # constraints
    def _cons(w, grad, n):
        if grad.size > 0:
            T = np.zeros((N, K))
            T[n, :] = grad_constraint(w, n, X, Y)
            grad[:] = T.ravel()
        return obj_constraint(w, n, X, Y)
        
    opt = nlopt.opt(nlopt.LD_AUGLAG, M)
    opt.set_min_objective(_func)  # to minimise objective function
    
    local_opt = nlopt.opt(nlopt.LD_LBFGS, M)
    opt.set_local_optimizer(local_opt)
    
    eps = 1e-12
    w0 = init_var_feasible(X, Y)
    lb = np.array([-1 + eps if wi < 0 else 0 for wi in w0],      dtype=np.float64)  # lower bounds
    ub = np.array([ 0 - eps if wi < 0 else np.inf for wi in w0], dtype=np.float64)  # upper bounds
    opt.set_lower_bounds(lb)
    opt.set_upper_bounds(ub)
    
    for n in range(N): 
        # bind n via a default argument; a plain closure would capture the loop variable by reference
        opt.add_equality_constraint(lambda x, grad, n=n: _cons(x, grad, n))
        
    opt.set_ftol_rel(2.220446049250313e-09)  # same as scipy.optimize.minimize for L-BFGS-B
    opt.set_maxtime(2 * 60 * 60)  # time limit: 2 hours
    
    try:
        xopt = opt.optimize(w0)                # optimal solution
        opt_val  = opt.last_optimum_value()    # optimal value
        res_code = opt.last_optimize_result()  # result code
        if res_code == nlopt.SUCCESS:
            return (xopt, True)
        else:
            sys.stderr.write('Optimisation failed: (result code %d)' % res_code)
            return (None, False)
    except:
        sys.stderr.write('Optimisation failed')
        raise

In [ ]:
def minimise_cost_slsqp(X, Y, C):
    """
        Minimise the cost using SLSQP (Sequential Least SQuares Programming) algorithm
        
        Input:
            - X: feature matrix, N x D
            - Y: label matrix,   N x K
            - C: regularisation constant
        
        Output:
            - optx: the optimal solution (N*K x 1) if succeed, None otherwise
            - success: True if succeed else False
        
        NOTE: SLSQP is less practical for optimizing more than a few thousand parameters due to space and 
              time complexity, see https://nlopt.readthedocs.io/en/latest/NLopt_Algorithms/#slsqp for details.
    """

    opt_method = 'SLSQP'
    options = {'disp': True, 'maxiter': 100}
    if options['disp']: 
        print('\nC: %g' % C)

    N = X.shape[0]

    # NOTE: initial point should be feasible, otherwise optimisation will fail
    w0 = init_var_feasible(X, Y)
    #bnds = [(-1, 0) if wi < 0 else (0, None) for wi in w0]  # the lower/left bound is inclusive
    eps = 1e-12
    bnds = [(-1+eps, 0-eps) if wi < 0 else (0, None) for wi in w0]
    cons = [{'type': 'eq', 'fun': obj_constraint, 'args': (n, X, Y)} for n in range(N)]
    #obj_func = obj_top_push_loop
    obj_func = obj_top_push
    opt = minimize(obj_func, w0, args=(X, Y, C), method=opt_method, jac=True, \
                   bounds=bnds, constraints=cons, options=options)
    
    # NOTE: opt.success is of type 'numpy.bool_', not the Python built-in bool,
    #       so an identity check like 'opt.success is True' would fail; a plain truth test is enough.
    if opt.success:
        return (opt.x, True)
    else:
        sys.stderr.write('Optimisation failed: \n')
        #print(opt.items())
        return (None, False)

In [ ]:
N = X_train.shape[0]
K = Y_train.shape[1]

# code in scipy.optimize.slsqp._minimize_slsqp()
meq = N
mieq = 0
m = meq + mieq
la = np.array([1, m]).max()
n = N*K
n1 = n + 1
mineq = m - meq + n1 + n1
len_w = (3*n1+m)*(n1+1)+(n1-meq+1)*(mineq+2) + 2*mineq+(n1+mineq)*(n1-meq) + \
        2*meq + n1 + ((n+1)*n)//2 + 2*m + 3*n + 3*n1 + 1
print(len_w)
print(np.log2(len_w))
#e = 32
#np.zeros(2**(e))
#np.zeros(len_w)
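
To put the number above in perspective (a rough estimate): SLSQP allocates this workspace as a dense float64 array, i.e. 8 bytes per entry.

In [ ]:
print('SLSQP workspace: about %.1f GiB for N = %d, K = %d' % (len_w * 8 / 2**30, N, K))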

In [ ]:
def minimise_cost(X, Y, C):
    #opt_func = minimise_cost_slsqp
    #opt_func = minimise_cost_nlopt
    opt_func = minimise_cost_ipopt
    return opt_func(X, Y, C)

class MLC_toppush(BaseEstimator):
    """All methods are necessary for a scikit-learn estimator"""
    
    def __init__(self, p=1, C=1):
        """Initialisation"""
        
        assert C > 0
        self.p = p  # stored so that BaseEstimator.get_params()/set_params() (used by GridSearchCV) work
        self.C = C
        self.trained = False
        
    def fit(self, X_train, Y_train):
        """Model fitting by optimising the objective"""
        optx, success = minimise_cost(X_train, Y_train, self.C)
        if success is True:
            self.W = dual2primal(optx, X_train, Y_train, self.C)
            self.trained = True
        else:
            self.trained = False
            
            
    def decision_function(self, X_test):
        """Make predictions (score is real number)"""
        
        assert self.trained is True, "Can't make prediction before training"
        D = X_test.shape[1]
        return np.dot(X_test, self.W.T)
        
    
    def predict(self, X_test):
        """Make predictions (score is boolean)"""
        
        preds = self.decision_function(X_test)
        return (preds > 0)
    
    
    def score(self, X, Y):
        """Compute scoring metric"""
        
        allPreds = self.decision_function(X)
        return avgPrecisionK(Y, allPreds)
    
    # inherit from BaseEstimator instead of re-implement
    #
    #def get_params(self, deep = True):
    #def set_params(self, **params):

In [ ]:
#%mprun
model = MLC_toppush()
model.fit(X_train[:50], Y_train[:50])
#%memit model.fit(X_train[:30], Y_train[:30])
#%mprun -f minimize model.fit(X_train[:100], Y_train[:100])
#%mprun -f _minimize_slsqp model.fit(X_train[:10], Y_train[:10])

In [ ]:
parameters = [{'C': [10**(e) for e in range(-6,-1)]}]

clf = GridSearchCV(MLC_toppush(), parameters, cv=5, n_jobs=1)
clf.fit(X_train[:100], Y_train[:100])

print("\nBest parameters set found on development set:")
print(clf.best_params_)

In [ ]:
for mean, std, params in zip(clf.cv_results_['mean_test_score'], clf.cv_results_['std_test_score'], \
                             clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

In [ ]:
clf = model

In [ ]:
preds_train = clf.decision_function(X_train)
preds_test  = clf.decision_function(X_test)

In [ ]:
print('Training set:')
printEvaluation(Y_train, preds_train)
print()
print('Test set:')
printEvaluation(Y_test, preds_test)

Result analysis


In [ ]:
precisions_train = [avgPrecision(Y_train, preds_train, k) for k in range(1, nLabels+1)]
precisions_test  = [avgPrecision(Y_test,  preds_test,  k) for k in range(1, nLabels+1)]

In [ ]:
precisionK_train = avgPrecisionK(Y_train, preds_train)
precisionK_test  = avgPrecisionK(Y_test,  preds_test)

In [ ]:
plt.figure(figsize=[10,5])
plt.plot(precisions_train, ls='--', c='r', label='Train')
plt.plot(precisions_test,  ls='-',  c='g', label='Test')
plt.plot([precisionK_train for k in range(nLabels)], ls=':', c='r', label='Train, Precision@K')
plt.plot([precisionK_test  for k in range(nLabels)], ls=':', c='g', label='Test, Precision@K')
plt.xticks(np.arange(nLabels), np.arange(1,nLabels+1))
plt.xlabel('k')
plt.ylabel('Precision@k')
plt.legend(loc='best')
plt.title('MLC w. Top-push Loss on ' + dataset_name + ' dataset')
plt.savefig(dataset_name + '_tp.svg')