A simple example of multilabel learning


In [ ]:
%matplotlib inline

import os, sys, time
import pickle as pkl
import numpy as np
import pandas as pd
import sklearn as sk
import cython
import itertools

from scipy.io import arff
from scipy.optimize import minimize
from scipy.optimize import check_grad

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score

import matplotlib.pyplot as plt
import seaborn as sns

In [ ]:
data_dir = 'data'
#yeast_ftrain = os.path.join(data_dir, 'yeast/yeast-train.arff')
#yeast_ftest  = os.path.join(data_dir, 'yeast/yeast-test.arff')
#bibtex_ftrain = os.path.join(data_dir, 'bibtex/bibtex-train.arff')
#bibtex_ftest  = os.path.join(data_dir, 'bibtex/bibtex-test.arff')
#fbookmarks = os.path.join(data_dir, 'bookmarks/bookmarks.arff')
mm_ftrain = os.path.join(data_dir, 'mediamill/mediamill-train.arff')
mm_ftest  = os.path.join(data_dir, 'mediamill/mediamill-test.arff')

In [ ]:
SEED = 123456789

Data loading

Load yeast dataset.


In [ ]:
#data_train, meta_train = arff.loadarff(yeast_ftrain)
#data_train, meta_train = arff.loadarff(open(bibtex_ftrain))
#data_bookmarks = arff.loadarff(open(fbookmarks))
data_train, meta_train = arff.loadarff(mm_ftrain)

In [ ]:
#data_test, meta_test = arff.loadarff(yeast_ftest)
#data_test, meta_test = arff.loadarff(bibtex_ftest)
data_test, meta_test = arff.loadarff(mm_ftest)

In [ ]:
type(data_train)

In [ ]:
print(data_train[0])

In [ ]:
len(list(data_train[0]))

In [ ]:
len(list(data_train[0])[120:])

In [ ]:
len(list(data_train[0])[:120])

Features


In [ ]:
#nFeatures = np.array(list(data_train[0])[:-14], dtype=np.float).shape[0]
nFeatures = np.array(list(data_train[0])[:120], dtype=np.float).shape[0]
print('#features:', nFeatures)

In [ ]:
#np.array(list(data_train[0])[:-14], dtype=np.float)

Labels


In [ ]:
#nLabels = np.array(list(data_train[0])[-14:], dtype=np.int).shape[0]
nLabels = np.array(list(data_train[0])[120:], dtype=np.int).shape[0]
print('#labels:', nLabels)

In [ ]:
#np.array(list(data_train[0])[-14:], dtype=np.int)

Data analysis


In [ ]:
print('#training examples:', len(data_train))

In [ ]:
print('#test examples:', len(data_test))

Histogram of #positive labels.


In [ ]:
#nPositives = [np.sum(np.array(list(data_train[ix])[-14:], dtype=np.int)) for ix in range(len(data_train))]
nPositives = [np.sum(np.array(list(data_train[ix])[120:], dtype=np.int)) for ix in range(len(data_train))]

In [ ]:
pd.Series(nPositives).hist(bins=10)

Dataset creation


In [ ]:
def create_dataset(label_ix, data):
    """
        Create the labelled dataset for a given label index
        
        Input:
            - label_ix: label index, number in { 0, ..., # labels }
            - data: original data with features + labels
            
        Output:
            - (Feature, Label) pair (X, y)
              X comprises the features for each example
              y comprises the labels of the corresponding example
    """

    assert(label_ix >= 0)
    assert(label_ix < nLabels)

    N = len(data)
    d = nFeatures
    
    #magic = -14
    magic = 120

    X = np.zeros((N, d), dtype = np.float)
    y = np.zeros(N, dtype = np.int)
       
    for i in range(N):
        X[i, :] = list(data[i])[:magic]
        y[i]    = list(data[i])[magic:][label_ix]

    return X, y

In [ ]:
def create_dataset_v2(data):
    """
        Create the labelled dataset for a given label index
        
        Input:
            - data: original data with features + labels
            
        Output:
            - (Feature, Label) pair (X, y)
              X comprises the features for each example
              Y comprises the labels of the corresponding example
    """

    N = len(data)
    D = nFeatures
    L = nLabels

    #magic = -14
    magic = 120

    X = np.zeros((N, D), dtype = np.float)
    Y = np.zeros((N, L), dtype = np.int)
       
    for i in range(N):
        X[i, :] = list(data[i])[:magic]
        Y[i, :] = list(data[i])[magic:]

    return X, Y

Evaluation

The sigmoid function.


In [ ]:
def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

Loss between a ground truth and a prediction.


In [ ]:
def evalPred(truth, pred, lossType = 'Hamming'):
    """
        Compute loss given ground truth and prediction
        
        Input:
            - truth:    binary array of true labels
            - pred:     real-valued array of predictions
            - lossType: can be subset 0-1, Hamming, ranking, and Precision@K where K = # positive labels.
    """

    assert(len(truth) == len(pred))
    L = len(truth)
    nPos = np.sum(truth)
    
    predBin = np.array((pred > 0), dtype=np.int)
    
    if lossType == 'Subset01':
        return 1 - int(np.all(truth == predBin))
    
    elif lossType == 'Hamming':
        return np.sum(truth != predBin) / L
    
    elif lossType == 'Ranking':
        loss = 0
        for i in range(L-1):
            for j in range(i+1, L):
                if truth[i] > truth[j]:
                    if pred[i] < pred[j]: 
                        loss += 1
                    if pred[i] == pred[j]:
                        loss += 0.5
        #return loss / (nPos * (L-nPos))
        return loss
        
    elif lossType == 'Precision@K':
        # sorted indices of the labels most likely to be +'ve
        idx = np.argsort(pred)[::-1]
        
        # true labels according to the sorted order
        y = truth[idx]
        
        # fraction of +'ves in the top K predictions
        return np.mean(y[:nPos])if nPos > 0 else 0
    
    elif lossType == 'Precision@3':
        # sorted indices of the labels most likely to be +'ve
        idx = np.argsort(pred)[::-1]
        
        # true labels according to the sorted order
        y = truth[idx]
        
        # fraction of +'ves in the top K predictions
        return np.mean(y[:3])
    
    elif lossType == 'Precision@5':
        # sorted indices of the labels most likely to be +'ve
        idx = np.argsort(pred)[::-1]
        
        # true labels according to the sorted order
        y = truth[idx]
        
        # fraction of +'ves in the top K predictions
        return np.mean(y[:5])
    
    else:
        assert(False)

In [ ]:
def avgPrecisionK(allTruths, allPreds):
    losses = []
    lossType = 'Precision@K'
    for i in range(allPreds.shape[0]):
        pred  = allPreds[i, :]
        truth = allTruths[i, :]
        losses.append(evalPred(truth, pred, lossType))
    return np.mean(losses)

In [ ]:
def printEvaluation(allTruths, allPreds):
    
    N = allTruths.shape[0]
    print(N)

    for lossType in ['Precision@K']: 
        # ['Subset01', 'Hamming', 'Ranking', 'Precision@K', 'Precision@3', 'Precision@5']:
        losses = [ ]
        for i in range(allPreds.shape[0]):
            pred  = allPreds[i, :]
            truth = allTruths[i, :]
            losses.append(evalPred(truth, pred, lossType))

            #print(allPreds[i])
            #print(pred)
            #print(truth)
            #break

        #print('%24s: %1.4f' % ('Average %s Loss' % lossType, np.mean(losses)))
        print('%s: %1.4f, %.3f' % ('Average %s' % lossType, np.mean(losses), np.std(losses) / np.sqrt(N)))
        #plt.hist(aucs, bins = 10);

Binary relevance baseline

Train a logistic regression model for each label.


In [ ]:
classifiers = [ LogisticRegression(class_weight = 'balanced', C = 10**0) for i in range(nLabels) ]
#classifiers = [ LogisticRegression(class_weight = 'balanced', C = 10) for i in range(nLabels) ]

In [ ]:
allPreds_train  = [ ]
allPreds_test  = [ ]
allTruths_train = [ ]
allTruths_test = [ ]
coefMat = [ ]
labelIndices = [ ]

for label_ix in range(nLabels):
    print('Training for Label %d' % (label_ix+1))
    X_train, y_train = create_dataset(label_ix, data = data_train)
    X_test, y_test   = create_dataset(label_ix, data = data_test)
    
    allTruths_train.append(y_train)
    allTruths_test.append(y_test) 
    
    assert( (not np.all(y_train == 0)) and (not np.all(y_train == 1)) )

    classifiers[label_ix].fit(X_train, y_train)
    allPreds_train.append(classifiers[label_ix].decision_function(X_train))
    allPreds_test.append(classifiers[label_ix].decision_function(X_test))

    coefMat.append(classifiers[label_ix].coef_.reshape(-1))
    #labelIndices.append(label_ix)
    #print(classifiers[label_ix].coef_)
    #print(classifiers[label_ix].intercept_)

In [ ]:
allTruths_train = np.array(allTruths_train).T
allTruths_test = np.array(allTruths_test).T

allPreds_train  = np.array(allPreds_train).T
allPreds_test  = np.array(allPreds_test).T


print(allPreds_test.shape)
print(allTruths_test.shape)

In [ ]:
#allPreds[0]

In [ ]:
print('Training set:')
printEvaluation(allTruths_train, allPreds_train)

In [ ]:
print('Test set:')
printEvaluation(allTruths_test, allPreds_test)

Result analysis

Coefficient matrix (#Genres, #Songs).


In [ ]:
coefMat = np.array(coefMat).T

In [ ]:
coefMat.shape

In [ ]:
#sns.heatmap(coefMat[:, :30])

Binary relevance with exponential loss

Train a regression model with exponential loss for each label.


In [ ]:
def obj_exp(w, X, y, C):
    """
        Objective with L2 regularisation and exponential loss
        
        Input:
            - w: current weight vector
            - X: feature matrix, N x D
            - y: label vector,   N x 1
            - C: regularisation constant
    """
    assert(len(y) == X.shape[0])
    assert(len(w) == X.shape[1])
    assert(C >= 0)
    
    N, D = X.shape
    
    J = 0.0  # cost
    g = np.zeros_like(w)  # gradient
    
    for n in range(N):
        x = X[n, :]
        prod = np.dot(w, x)
        
        # negative label
        if y[n] == 0:
            t1 = np.exp(prod)
            J += t1
            g = g + t1 * x
        
        # positive label
        else:
            t2 = np.exp(-prod)
            J += t2
            g = g - t2 * x
    
    J = 0.5 * C * np.dot(w, w) + J / N
    g = C * w + g / N
    
    return (J, g)

Check gradient.


In [ ]:
#%%script false
X_train_, y_train_ = create_dataset(3, data = data_train)
w0 = np.random.rand(X_train_.shape[1])
C = 1
check_grad(lambda w: obj_exp(w, X_train_, y_train_, C)[0], \
           lambda w: obj_exp(w, X_train_, y_train_, C)[1], w0)

In [ ]:
params    = [ ]
allPreds_train  = [ ]
allPreds_test  = [ ]
allTruths_train = [ ]
allTruths_test = [ ]
np.random.seed(SEED)
C = 1

for label_ix in range(nLabels):
    #sys.stdout.write('\r%d / %d' % (label_ix + 1, nLabels))
    #sys.stdout.flush()
    print('\r%d / %d ' % (label_ix + 1, nLabels))
    
    X_train, y_train = create_dataset(label_ix, data = data_train)
    X_test, y_test   = create_dataset(label_ix, data = data_test)
    
    allTruths_train.append(y_train)
    allTruths_test.append(y_test)
    
    assert( (not np.all(y_train == 0)) and (not np.all(y_train == 1)) )
        
    opt_method = 'BFGS' #'Newton-CG' 
    #opt_method = 'nelder-mead'
    options = {'disp': True}
    
    w0 = np.random.rand(X_train.shape[1])  # initial guess
    opt = minimize(obj_exp, w0, args=(X_train, y_train, C), method=opt_method, jac=True, options=options)
    
    if opt.success == True:
        w = opt.x
        params.append(w)
        #allPreds.append(sigmoid(np.dot(X_test, w)))
        allPreds_train.append(np.dot(X_train, w))
        allPreds_test.append(np.dot(X_test, w))
    else:
        sys.stderr.write('Optimisation failed, label_ix=%d\n' % label_ix)
        w = np.zeros(X_train.shape[1])
        params.append(w)
        #allPreds_test.append(np.dot(X_test, w))

In [ ]:
allTruths_train = np.array(allTruths_train).T
allTruths_test = np.array(allTruths_test).T

allPreds_train  = np.array(allPreds_train).T
allPreds_test  = np.array(allPreds_test).T

print(allPreds_test.shape)
print(allTruths_test.shape)

In [ ]:
#allPreds[0]

In [ ]:
print('Training set:')
printEvaluation(allTruths_train, allPreds_train)

In [ ]:
print('Test set:')
printEvaluation(allTruths_test, allPreds_test)

Binary relevance with bipartite ranking

Train a bipartite ranking model for each label.


In [ ]:
#%load_ext Cython

In [ ]:
#%%cython -a

import numpy as np
#cimport numpy as np

#cpdef obj_biranking(w, X, y):

def obj_biranking(w, X, y, C):
    """
        Objective with L2 regularisation and bipartite ranking loss
        
        Input:
            - w: current weight vector
            - X: feature matrix, N x D
            - y: label vector,   N x 1
            - C: regularisation constant
    """
    assert(len(y) == X.shape[0])
    assert(len(w) == X.shape[1])
    assert(C >= 0)

    #cdef int nPos, nNeg, i, j
    #cdef double J, term, denom
    nPos = np.sum(y)      # num of positive examples
    nNeg = len(y) - nPos  # num of negative examples
    
    ixPos = np.nonzero(y)[0].tolist()                    # indices positive examples
    ixNeg = list(set(np.arange(len(y))) - set(ixPos))    # indices negative examples
    
    J = 0.0  # cost
    g = np.zeros_like(w)  # gradient    

    scorePos = X[ixPos, :].dot(w)[:,np.newaxis] # nPos x 1
    scoreNeg = X[ixNeg, :].dot(w)[:,np.newaxis] # nNeg x 1
    scoreDif = scorePos - scoreNeg.T            # nPos x nNeg
    #J = np.mean(np.log(1 + np.exp(-scoreDif)))
    J = 0.5 * C * np.dot(w, w) + np.mean(np.log1p(np.exp(-scoreDif)))
    
    A = -1/(1 + np.exp(scoreDif))

    T1 = X[ixPos, :].T.dot(A.sum(axis = 1))
    T2 = X[ixNeg, :].T.dot(A.sum(axis = 0))
    g  = C * w + 1/(nPos * nNeg) * (T1 - T2)
    
    return (J, g)

Check gradient.


In [ ]:
X_train_, y_train_ = create_dataset(6, data = data_train)

In [ ]:
#%%script false
w0 = w = np.random.rand(X_train_.shape[1])
C = 1
check_grad(lambda w: obj_biranking(w, X_train_, y_train_, C)[0], \
           lambda w: obj_biranking(w, X_train_, y_train_, C)[1], w0)

In [ ]:
#1.1331503772158218e-06 * np.sqrt(nLabels)

In [ ]:
params    = [ ]
allPreds_train  = [ ]
allPreds_test  = [ ]
allTruths_train = [ ]
allTruths_test = [ ]
np.random.seed(SEED)
C = 1

for label_ix in range(nLabels):
    #sys.stdout.write('\r%d / %d' % (label_ix + 1, nLabels))
    #sys.stdout.flush()
    print('\r%d / %d ' % (label_ix + 1, nLabels))
    
    X_train, y_train = create_dataset(label_ix, data = data_train)
    X_test, y_test   = create_dataset(label_ix, data = data_test)
    
    allTruths_train.append(y_train)
    allTruths_test.append(y_test)
    
    assert( (not np.all(y_train == 0)) and (not np.all(y_train == 1)) )
        
    opt_method = 'BFGS' #'Newton-CG' 
    #opt_method = 'nelder-mead'
    options = {'disp': True}
    
    w0 = np.random.rand(X_train.shape[1])  # initial guess
    opt = minimize(obj_biranking, w0, args=(X_train, y_train, C), method=opt_method, jac=True, options=options)
    
    if opt.success == True:
        w = opt.x
        params.append(w)
        #allPreds.append(sigmoid(np.dot(X_test, w)))
        allPreds_train.append(np.dot(X_train, w))
        allPreds_test.append(np.dot(X_test, w))
    else:
        sys.stderr.write('Optimisation failed, label_ix=%d\n' % label_ix)
        w = np.zeros(X_train.shape[1])
        params.append(w)
        allPreds_test.append(np.dot(X_test, w))

In [ ]:
allTruths_train = np.array(allTruths_train).T
allTruths_test = np.array(allTruths_test).T

allPreds_train  = np.array(allPreds_train).T
allPreds_test  = np.array(allPreds_test).T

print(allPreds_test.shape)
print(allTruths_test.shape)

In [ ]:
#allPreds[0]

In [ ]:
print('Training set:')
printEvaluation(allTruths_train, allPreds_train)

In [ ]:
print('Test set:')
printEvaluation(allTruths_test, allPreds_test)

Ranking loss

Multi-label learning with ranking loss.


In [ ]:
def obj_ranking_loop(w, X, Y, C):
    """
        Objective with L2 regularisation and ranking loss
        
        Input:
            - w: current weight vector, flattened L x D
            - X: feature matrix, N x D
            - Y: label matrix,   N x L
            - C: regularisation constant
    """
    N, D = X.shape
    L = Y.shape[1]
    assert(w.shape[0] == L * D)
    
    W = w.reshape(L, D)  # reshape weight matrix    
    
    J = 0.0  # cost
    G = np.zeros_like(W)  # gradient matrix
    
    for n in range(N):
        Jn = 0.0
        Gn = np.zeros_like(W)
        x = X[n, :]
        y = Y[n, :]
        nPos = np.sum(y)   # num of positive examples
        nNeg = L - nPos    # num of negative examples
        denom = nPos * nNeg
        
        ixPos = np.nonzero(y)[0].tolist()               # indices positive examples
        ixNeg = list(set(np.arange(L)) - set(ixPos))    # indices negative examples
        
        for i in ixPos:
            for j in ixNeg:
                wDiff = W[i, :] - W[j, :]
                sDiff = np.dot(wDiff, x)
                term = np.exp(sDiff)
                Jn += np.log1p(1.0 / term)
                Gn[i, :] = Gn[i, :] - x / (1 + term)        
        #for j in ixNeg:
        #    for i in ixPos:
        #        wDiff = W[i, :] - W[j, :]
        #        sDiff = np.dot(wDiff, x)
        #        term = np.exp(sDiff)
                Gn[j, :] = Gn[j, :] + x / (1 + term)
                
        J += Jn / denom
        G = G + Gn / denom
        
    J = 0.5 * C * np.dot(w, w) + J / N
    G = C * W + G / N
    
    return (J, G.ravel())

In [ ]:
#np.tile([1,2,3], (3,1)) * np.array([0.1, 0.2, 0.3])[:, None]

In [ ]:
#np.tile([1,2,3], (3,1)) / np.array([0.1, 0.2, 0.3])[:, None]

In [ ]:
#np.tile([1,2,3], (3,1)) * np.array([0.1, 0.2, 0.3])[:,]

In [ ]:
#np.tile([1,2,3], (3,1)) / np.array([0.1, 0.2, 0.3])[:,]

In [ ]:
def obj_ranking(w, X, Y, C):
    """
        Objective with L2 regularisation and ranking loss
        
        Input:
            - w: current weight vector, flattened L x D
            - X: feature matrix, N x D
            - Y: label matrix,   N x L
            - C: regularisation constant
    """
    N, D = X.shape
    L = Y.shape[1]
    assert(w.shape[0] == L * D)
    
    W = w.reshape(L, D)  # reshape weight matrix    
    
    J = 0.0  # cost
    G = np.zeros_like(W)  # gradient matrix
    
    for n in range(N):
        Jn = 0.0
        Gn = np.zeros_like(W)
        x = X[n, :]
        y = Y[n, :]
        nPos = np.sum(y)   # num of positive examples
        nNeg = L - nPos    # num of negative examples
        denom = nPos * nNeg
        
        ixPos = np.nonzero(y)[0].tolist()               # indices positive examples
        ixNeg = list(set(np.arange(L)) - set(ixPos))    # indices negative examples
        
        ixmat = np.array(list(itertools.product(ixPos, ixNeg)))  # shape: ixPos*ixNeg by 2
        dW = W[ixmat[:, 0], :] - W[ixmat[:, 1], :]
        sVec = np.dot(dW, x)
        Jn = np.sum(np.log1p(np.exp(-sVec)))
        
        coeffVec = np.divide(1, 1 + np.exp(sVec))
        coeffPos = pd.DataFrame(coeffVec)
        coeffPos['gid'] = ixmat[:, 0]
        coeffPos = coeffPos.groupby('gid', sort=False).sum()
        coeffNeg = pd.DataFrame(coeffVec)
        coeffNeg['gid'] = ixmat[:, 1]
        coeffNeg = coeffNeg.groupby('gid', sort=False).sum()
        
        #print(coeffPos)
        #print(coeffNeg)
        
        coeffs = np.ones(L)
        coeffs[ixPos] = -coeffPos.loc[ixPos].values.squeeze()
        coeffs[ixNeg] = coeffNeg.loc[ixNeg].values.squeeze()
        
        #print(coeffs)
        Gn = np.tile(x, (L, 1)) * coeffs[:, None]
                        
        J += Jn / denom
        G = G + Gn / denom
        
    J = 0.5 * C * np.dot(w, w) + J / N
    G = C * W + G / N
    
    return (J, G.ravel())

In [ ]:
X_train, Y_train = create_dataset_v2(data = data_train)
X_test,  Y_test  = create_dataset_v2(data = data_test)

Check gradient


In [ ]:
#%%script false
C = 1
w0 = np.random.rand(nFeatures * nLabels)
check_grad(lambda w: obj_ranking(w, X_train[:10], Y_train[:10], C)[0], \
           lambda w: obj_ranking(w, X_train[:10], Y_train[:10], C)[1], w0)

In [ ]:
allTruths_train = Y_train
allTruths_test = Y_test
allPreds_train = None
allPreds_test  = None
np.random.seed(SEED)

opt_method = 'BFGS' #'Newton-CG' 
#opt_method = 'nelder-mead'
options = {'disp': True}

C = 1
w0 = np.random.rand(nFeatures * nLabels)  # initial guess
opt = minimize(obj_ranking, w0, args=(X_train, Y_train, C), method=opt_method, jac=True, options=options)

if opt.success == True:
    w = opt.x
    #allPreds = sigmoid(np.dot(X_test, w.reshape(nLabels, nFeatures).T))
    allPreds_train = np.dot(X_train, w.reshape(nLabels, nFeatures).T)
    allPreds_test = np.dot(X_test, w.reshape(nLabels, nFeatures).T)
else:
    sys.stderr.write('Optimisation failed')

In [ ]:
print(allPreds_test.shape)
print(allTruths_test.shape)

In [ ]:
print('Training set:')
printEvaluation(allTruths_train, allPreds_train)

In [ ]:
print('Test set:')
printEvaluation(allTruths_test, allPreds_test)

p-classification loss

Multi-label learning with p-norm push loss.


In [ ]:
def obj_pnorm_push_loop(w, X, Y, p, C):
    """
        Objective with L2 regularisation and p-norm push loss
        
        Input:
            - w: current weight vector, flattened L x D
            - X: feature matrix, N x D
            - Y: label matrix,   N x L
            - p: constant for p-norm push loss
            - C: regularisation constant
    """
    N, D = X.shape
    L = Y.shape[1]
    assert(w.shape[0] == L * D)
    assert(p >= 1)
    assert(C >= 0)
    
    W = w.reshape(L, D)  # reshape weight matrix
    
    J = 0.0  # cost
    G = np.zeros_like(W)  # gradient matrix
    
    for n in range(N):
        Gn = np.zeros_like(W)
        x = X[n, :]
        y = Y[n, :]
        nPos = np.sum(y)   # num of positive examples
        nNeg = L - nPos    # num of negative examples
        
        for k in range(L):
            wk = W[k, :]
            term = np.dot(wk, x)
            if y[k] == 1:
                term2 = np.exp(-term) / nPos
                J += term2
                Gn[k, :] = -x * term2
            else:
                term2 = np.exp(p * term) / nNeg
                J += term2 / p
                Gn[k, :] = x * term2
        G = G + Gn
        
    J = 0.5 * C * np.dot(w, w) + J / N
    G = C * W + G / N
    
    return (J, G.ravel())

In [ ]:
def obj_pnorm_push_loopn(w, X, Y, p, C):
    """
        Objective with L2 regularisation and p-norm push loss
        
        Input:
            - w: current weight vector, flattened L x D
            - X: feature matrix, N x D
            - Y: label matrix,   N x L
            - p: constant for p-norm push loss
            - C: regularisation constant
    """
    N, D = X.shape
    L = Y.shape[1]
    assert(w.shape[0] == L * D)
    assert(p >= 1)
    assert(C >= 0)
    
    W = w.reshape(L, D)  # reshape weight matrix
    
    J = 0.0  # cost
    G = np.zeros_like(W)  # gradient matrix
    
    for n in range(N):
        Gn = np.zeros_like(W)
        x = X[n, :]
        y = Y[n, :]
        nPos = np.sum(y)   # num of positive examples
        nNeg = L - nPos    # num of negative examples
        
        ixPos = np.nonzero(y)[0].tolist()               # indices positive examples
        ixNeg = list(set(np.arange(L)) - set(ixPos))    # indices negative examples
        
        scalingPos = np.exp(   -np.dot(W[ixPos, :], x)) / nPos
        scalingNeg = np.exp(p * np.dot(W[ixNeg, :], x)) / nNeg
        
        Gn[ixPos, :] = np.tile(-x, (nPos,1)) * scalingPos[:, None]  # scaling each row of a matrix
        Gn[ixNeg, :] = np.tile( x, (nNeg,1)) * scalingNeg[:, None]  #         with a different scalar
        
        J += np.sum(scalingPos) + np.sum(scalingNeg) / p
        G = G + Gn
        
    J = 0.5 * C * np.dot(w, w) + J / N
    G = C * W + G / N
    
    return (J, G.ravel())

In [ ]:
def obj_pnorm_push(w, X, Y, p, C):
    """
        Objective with L2 regularisation and p-norm push loss
        
        Input:
            - w: current weight vector, flattened L x D
            - X: feature matrix, N x D
            - Y: label matrix,   N x L
            - p: constant for p-norm push loss
            - C: regularisation constant
    """
    N, D = X.shape
    L = Y.shape[1]
    assert(w.shape[0] == L * D)
    assert(p >= 1)
    assert(C >= 0)
    
    W = w.reshape(L, D)  # reshape weight matrix
    
    J = 0.0  # cost
    G = np.zeros_like(W)  # gradient matrix
    
    for k in range(nLabels):
        wk = W[k, :]
        Yk = Y[:, k]
        sPosVec = np.dot(X[Yk == 1, :], wk)      # Nk+ by 1
        sNegVec = np.dot(X[Yk == 0, :], wk)      # NK- by 1
        #nPosVec = np.sum(Y[Yk == 1, :], axis=1)  # Nk+ by 1
        #nNegVec = np.sum(Y[Yk == 0, :], axis=1)  # NK- by 1
        nPosVec = np.sum(Y[Yk == 1, :], axis=1) + 0.1 # Nk+ by 1 with smoothing
        nNegVec = np.sum(Y[Yk == 0, :], axis=1) + 0.1 # NK- by 1 with smoothing
        
        #nPosVec = np.ones_like(sPosVec) * N
        #nNegVec = np.ones_like(sNegVec) * N
        lossPos = np.divide(np.exp(-sPosVec), nPosVec)     # NK+ by 1
        lossNeg = np.divide(np.exp(p * sNegVec), nNegVec)  # NK- by 1
        
        J += np.sum(lossPos) + np.sum(lossNeg / p)
        
        GradPos = -X[Yk == 1, :] * lossPos[:, None]
        GradNeg =  X[Yk == 0, :] * lossNeg[:, None]
        
        G[k, :] = np.sum(GradPos, axis=0) + np.sum(GradNeg, axis=0)
                
    J = 0.5 * C * np.dot(w, w) + J / N
    G = C * W + G / N
    
    return (J, G.ravel())

In [ ]:
X_train, Y_train = create_dataset_v2(data = data_train)
X_test,  Y_test  = create_dataset_v2(data = data_test)

Check gradient


In [ ]:
%%script false
p = 1
C = 1
w0 = np.random.rand(nFeatures * nLabels)
check_grad(lambda w: obj_pnorm_push(w, X_train, Y_train, p, C)[0], \
           lambda w: obj_pnorm_push(w, X_train, Y_train, p, C)[1], w0)

In [ ]:
allTruths_train = Y_train
allTruths_test = Y_test
allPreds_train = None
allPreds_test  = None
np.random.seed(SEED)

p = 1  # [1, 10]
C = 1  # [0, 1]
opt_method = 'BFGS' #'Newton-CG' 
#opt_method = 'nelder-mead'
options = {'disp': True}

w0 = np.random.rand(nFeatures * nLabels)  # initial guess
opt = minimize(obj_pnorm_push, w0, args=(X_train, Y_train, p, C), method=opt_method, jac=True, options=options)

if opt.success == True:
    w = opt.x
    allPreds_train = np.dot(X_train, w.reshape(nLabels, nFeatures).T)
    allPreds_test = np.dot(X_test, w.reshape(nLabels, nFeatures).T)
else:
    sys.stderr.write('Optimisation failed')

In [ ]:
print(allPreds_test.shape)
print(allTruths_test.shape)

In [ ]:
print('Training set:')
printEvaluation(allTruths_train, allPreds_train)

In [ ]:
print('Test set:')
printEvaluation(allTruths_test, allPreds_test)

Results for different hyper-parameter configurations


In [ ]:
#make_pipeline(preprocessing.StandardScaler(), svm.SVC(C=1))
#cross_val_score(clf, iris.data, iris.target, cv=cv)

In [ ]:
#%%script false

precisions_train = dict()
precisions_test = dict()

allTruths_test  = Y_test
allTruths_train = Y_train

p_set = [1, 3, 10, 30]
C_set = [0.1, 0.3, 1, 3, 10, 30, 100, 300, 1000, 3000]
opt_method = 'BFGS' #'Newton-CG' 

for p in p_set:
    for C in C_set:
        print('-------------------------------------')
        print('p in loss: {}, C for regularisation: {}'.format(p, C))
        allPreds       = None
        allPreds_train = None

        w0 = np.random.rand(nFeatures * nLabels)  # initial guess
        opt = minimize(obj_pnorm_push, w0, args=(X_train, Y_train, p, C), method=opt_method, jac=True)

        if opt.success == True:
            w = opt.x
            allPreds_test  = np.dot(X_test,  w.reshape(nLabels, nFeatures).T)
            allPreds_train = np.dot(X_train, w.reshape(nLabels, nFeatures).T)
            precisions_train[(p,C)] = avgPrecisionK(allTruths_train, allPreds_train)
            precisions_test[(p,C)]  = avgPrecisionK(allTruths_test,  allPreds_test)
        else:
            sys.stderr.write('Optimisation failed')
            precisions_train[(p,C)] = 0
            precisions_test[(p,C)]  = 0

        print('%20s %.4f' % ('Average Precision@K on training set: ', precisions_train[(p,C)]))
        print('%20s %.4f\n' % ('Average Precision@K on test set: ', precisions_test[(p,C)]))

In [ ]:
#%%script false
fig = plt.figure(figsize=[10, 8])
ax = plt.subplot(1,1,1)
#colors = itertools.cycle(['r', 'g'])
styles = itertools.cycle(['-', '--', ':', '-.'])
for p in p_set:
    ls = styles.__next__()
    plt.plot(np.arange(len(C_set)), [precisions_train[(p,C)] for C in C_set], \
             ls=ls, c='r', label='p=%d'%p + ', train')
    plt.plot(np.arange(len(C_set)), [precisions_test[(p,C)] for C in C_set], \
             ls=ls, c='g',  label='p=%d'%p + ', test')
plt.plot(np.arange(len(C_set)), [0.5149 for C in C_set], ls='-', c='b', label='Logistic Regression, test')
plt.legend(loc='best')
plt.xticks(np.arange(len(C_set)), C_set, fontsize=10, rotation=0, horizontalalignment='center')
plt.xlabel('Regularisation Constant')
plt.ylabel('Average Precision@K')
plt.title('Performance on Yeast dataset, multi-label learning with p-norm push loss', fontsize=15)
fig.savefig('pnorm.svg')

Top push loss

Methods to compute $v = (\sum_{n=1}^N A_{N \times K})^\top (\sum_{n=1}^N A_{N \times K})$:

  1. Let $\mathbf{x} = (\sum_{n=1}^N A_{N \times K})$, then $v = \mathbf{x}^\top \mathbf{x}$
  2. Let $C_{N \times N} = A_{N \times K} A_{N \times K}^\top$, then $v = \sum_{n=1}^N \sum_{m=1}^N C_{n,m}$,
  3. or $v = \mathbf{1}_N^\top C_{N \times N} \mathbf{1}_N$.

NOTE: if $D_{K \times K} = A_{N \times K}^\top A_{N \times K}$, then $v \ne \sum_{k=1}^K \sum_{l=1}^L D_{K \times K}$, this can be checked with a simple $A_{2x2} = [A_{11}, A_{12}; A_{21}, A_{22}]$ matrix.

Example:


In [ ]:
A = np.random.rand(15).reshape(3,5)
A

In [ ]:
x = np.sum(A, axis=0)
#x

In [ ]:
C = np.dot(A, A.T)
one = np.ones(A.shape[0])

In [ ]:
np.dot(x, x)

In [ ]:
np.sum(np.sum(C))

In [ ]:
np.dot(np.dot(one, C), one)

In [ ]:
D = np.dot(A.T, A)
np.sum(np.sum(D))

Speed test.


In [ ]:
A = np.random.rand(20000000).reshape(10000,2000)
#A = np.random.rand(10000000).reshape(2000,10000)

In [ ]:
%%timeit
x = np.sum(A, axis=0)
np.dot(x, x)

In [ ]:
%%timeit
C = np.dot(A, A.T)
np.sum(np.sum(C))

In [ ]:
%%timeit
C = np.dot(A, A.T)
one = np.ones(A.shape[0])
np.dot(np.dot(one, C), one)