Multi-label classification -- gradient check for the p-classification loss
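
For reference, the per-example p-classification loss (commonly attributed to Ertekin & Rudin, 2011; the risk_pclassification implementations imported below are assumed to follow this form) is

$$\ell_p(f, y) = \frac{1}{p}\, e^{-p f}\, \mathbf{1}[y = +1] + e^{f}\, \mathbf{1}[y = -1],$$

so larger p penalises low scores on positive labels more heavily, and p = 1 recovers the exponential loss.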


In [ ]:
%load_ext line_profiler
%load_ext autoreload
%autoreload 2

import os, sys, time
import numpy as np

from scipy.optimize import check_grad
from scipy.sparse import issparse, csc_matrix, csr_matrix

In [ ]:
sys.path.append('src/')
sys.path.append('src/models')
#from MLC import objective, risk_pclassification, DataHelper
#from NSR import objective_clf, risk_pclassification, DataHelper, obj_clf_loop
#from MTC import objective, risk_pclassification, DataHelper
from MTC_L1 import objective, risk_pclassification, DataHelper
from tools import create_dataset, dataset_names, nLabels_dict

In [ ]:
dataset_names

In [ ]:
data_ix = 0

In [ ]:
dataset_name = dataset_names[data_ix]
nLabels = nLabels_dict[dataset_name]
print(dataset_name, nLabels)

In [ ]:
data_dir = 'data'
SEED = 918273645

Load data.


In [ ]:
X_train, Y_train = create_dataset(dataset_name, train_data=True, shuffle=True, random_state=SEED)
X_test,  Y_test  = create_dataset(dataset_name, train_data=False)

Feature normalisation.


In [ ]:
X_train_mean = np.mean(X_train, axis=0).reshape((1, -1))
X_train_std = np.std(X_train, axis=0).reshape((1, -1)) + 10 ** (-6)
X_train -= X_train_mean
X_train /= X_train_std
X_test  -= X_train_mean
X_test  /= X_train_std
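
A quick sanity check (an illustrative addition, not part of the original pipeline): the standardised training features should have near-zero mean and near-unit standard deviation; features that were constant keep a standard deviation near zero because of the 1e-6 smoothing term.

In [ ]:
print('max |mean|:    %g' % np.abs(np.mean(X_train, axis=0)).max())
print('max |std - 1|: %g' % np.abs(np.std(X_train, axis=0) - 1).max())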

In [ ]:
def print_dataset_info(X_train, Y_train, X_test, Y_test):
    N_train, D = X_train.shape
    K = Y_train.shape[1]
    N_test = X_test.shape[0]
    print('%-45s %s' % ('Number of training examples:', '{:,}'.format(N_train)))
    print('%-45s %s' % ('Number of test examples:', '{:,}'.format(N_test)))
    print('%-45s %s' % ('Number of features:', '{:,}'.format(D)))
    print('%-45s %s' % ('Number of labels:', '{:,}'.format(K)))
    avgK_train = np.mean(np.sum(Y_train, axis=1))
    avgK_test  = np.mean(np.sum(Y_test, axis=1))
    print('%-45s %.3f (%.2f%%)' % ('Average number of positive labels (train):', avgK_train, 100*avgK_train / K))
    print('%-45s %.3f (%.2f%%)' % ('Average number of positive labels (test):', avgK_test, 100*avgK_test / K))
    #print('%-45s %.4f%%' % ('Average label occurrence (train):', np.mean(np.sum(Y_train, axis=0)) / N_train))
    #print('%-45s %.4f%%' % ('Average label occurrence (test):', np.mean(np.sum(Y_test, axis=0)) / N_test))
    print('%-45s %.3f%%' % ('Sparsity (percent) (train):', 100 * np.sum(Y_train) / np.prod(Y_train.shape)))
    print('%-45s %.3f%%' % ('Sparsity (percent) (test):', 100 * np.sum(Y_test) / np.prod(Y_test.shape)))

In [ ]:
print('%-45s %s' % ('Dataset:', dataset_name))
print_dataset_info(X_train, Y_train, X_test, Y_test)

Check the gradient. Note that objective() returns the loss and writes the gradient into dw in place, so the gradient callback passed to check_grad simply returns dw computed during the preceding objective call.


In [ ]:
%%script false
# NOTE: this cell is disabled; obj_pclassification appears to belong to an
# earlier module interface and is not imported above.
PU = np.zeros((Y_train.shape[0], 3), dtype=Y_train.dtype)
PU[[0, 1, 2, 10], [0, 1, 1, 2]] = 1
upl_ix = [[2, 3, 4], [5, 6, 7, 8, 9], [10, 11], [12, 13, 14, 15]]
w0 = 0.001 * np.random.randn((Y_train.shape[1] + 3) * X_train.shape[1] + 1)
loss = 'both'
check_grad(
    lambda w: obj_pclassification(w, X_train, Y_train, C1=10, C2=1, C3=2, p=3, loss_type=loss,
                                  PU=PU, user_playlist_indices=upl_ix)[0],
    lambda w: obj_pclassification(w, X_train, Y_train, C1=10, C2=1, C3=2, p=3, loss_type=loss,
                                  PU=PU, user_playlist_indices=upl_ix)[1],
    w0)

In [ ]:
%%script false
cliques = [[2, 3, 4], [5, 6, 7, 8, 9], [10, 11], [12, 13]]
#cliques = None
w0 = 0.001 * np.random.randn(Y_train.shape[1] * X_train.shape[1] + 1)
#w0 = np.zeros(Y_train.shape[1] * X_train.shape[1] + 1)
dw = np.zeros_like(w0)
loss = 'example'
bs = 5 if loss == 'label' else 100
Y_train = csr_matrix(Y_train)
data_helper_example = None if loss == 'label' else DataHelper(Y_train, ax=0, batch_size=bs)
data_helper_label = None if loss == 'example' else DataHelper(Y_train, ax=1, batch_size=bs)
#%lprun -f accumulate_risk \
#%lprun -f objective \
check_grad(lambda w: objective(w, dw, X_train, Y_train, C1=10, C2=1, C3=2, p=3, loss_type=loss, cliques=cliques, \
                               data_helper_example=data_helper_example, data_helper_label=data_helper_label), \
           lambda w: dw, w0)

In [ ]:
%%script false
cliques = [[2, 3, 4], [5, 6, 7, 8, 9], [10, 11], [12, 13]]
#cliques = None
w0 = 0.001 * np.random.randn(Y_train.shape[1] * (X_train.shape[1] + 1))
#w0 = np.zeros(Y_train.shape[1] * (X_train.shape[1] + 1))
dw = np.zeros_like(w0)
bs = 5
Y_train = csr_matrix(Y_train)
data_helper = DataHelper(Y_train, ax=1, batch_size=bs)
#%lprun -f accumulate_risk \
#%lprun -f objective \
check_grad(lambda w: objective(w, dw, X_train, Y_train, C1=10, C3=2, p=3, \
                               cliques=cliques, data_helper=data_helper), \
           lambda w: dw, w0)

In [ ]:
def check_grad_loop(obj, grad, w0):
    """Central-difference gradient check: estimate the gradient of obj at w0
    one coordinate at a time and return the 2-norm of its difference from
    the analytic gradient grad(w0)."""
    eps = 1.49e-08  # ~sqrt(machine epsilon), the same default scipy's check_grad uses
    w = np.zeros_like(w0)  # numerical gradient estimate
    for i in range(len(w0)):
        if (i + 1) % 10 == 0:
            sys.stdout.write('\r%d / %d' % (i + 1, len(w0)))
        wi1 = w0.copy()
        wi2 = w0.copy()
        wi1[i] -= eps
        wi2[i] += eps
        J1 = obj(wi1)
        J2 = obj(wi2)
        w[i] = (J2 - J1) / (2 * eps)  # central difference in coordinate i
    w1 = grad(w0)
    diff = w1 - w
    return np.sqrt(np.dot(diff, diff))
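
As a quick sanity check (an illustrative addition), check_grad_loop should return a value near zero on a toy quadratic whose gradient is known in closed form.

In [ ]:
# f(w) = w.w / 2 has exact gradient w, so the discrepancy should be ~1e-7
w_toy = np.random.randn(50)
check_grad_loop(lambda w: 0.5 * np.dot(w, w), lambda w: w, w_toy)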

In [ ]:
%%script false
# objective(w, dw, X, Y, C, p, cliques, data_helper, UF=None, njobs=1, verbose=0, fnpy=None)

cliques = [[0], [1], [2, 3, 4], [5, 6, 7, 8, 9], [10, 11], [12, 13]]
#cliques = [[0], [1, 2], [3, 4, 5]]
Ycsc = csc_matrix(Y_train)
data_helper = DataHelper(Ycsc, cliques)

UF = np.zeros((X_train.shape[0], len(cliques)))
for u in range(len(cliques)):
    clq = cliques[u]
    UF[:, u] = Ycsc[:, clq].sum(axis=1).A.reshape(-1)
# standardise the user features once all columns have been filled
UF_mean = np.mean(UF, axis=0).reshape((1, -1))
UF_std = np.std(UF, axis=0).reshape((1, -1)) + 10 ** (-6)
UF -= UF_mean
UF /= UF_std
w0 = 0.001 * np.random.randn((len(cliques) + Y_train.shape[1] + 1) * (X_train.shape[1] + len(cliques) - 1))

#UF = None
#w0 = 0.001 * np.random.randn((len(cliques) + Y_train.shape[1] + 1) * X_train.shape[1])

dw = np.zeros_like(w0)

#%lprun -f accumulate_risk \
#%lprun -f objective \
check_grad(lambda w: objective(w, dw, X_train, Y_train, C=10, p=3, cliques=cliques, \
                               data_helper=data_helper, UF=UF), \
           lambda w: dw, w0)

In [ ]:
%%script false
check_grad_loop(lambda w: objective(w, dw, X_train, Y_train, C=10, p=3, cliques=cliques, \
                                    data_helper=data_helper, UF=UF), \
                lambda w: dw, w0)

In [ ]:
#%%script false
# objective(w, dw, X, Y, p, cliques, data_helper, verbose=0, fnpy=None)

cliques = [[0], [1], [2, 3, 4], [5, 6, 7, 8, 9], [10, 11], [12, 13]]
#cliques = [[0], [1, 2], [3, 4, 5]]
Ycsc = csc_matrix(Y_train)
data_helper = DataHelper(Ycsc, cliques)

w0 = 0.001 * np.random.randn((len(cliques) + Y_train.shape[1] + 1) * X_train.shape[1])
dw = np.zeros_like(w0)

#%lprun -f accumulate_risk \
#%lprun -f objective \
check_grad(lambda w: objective(w, dw, X_train, Y_train, p=6, cliques=cliques, data_helper=data_helper), \
           lambda w: dw, w0)

In [ ]:
import gzip
import pickle as pkl
pkldir = 'data/%s/setting1' % dataset_name
pkl.dump(X_train, gzip.open(os.path.join(pkldir, 'X_train.pkl.gz'), 'wb'))
pkl.dump(csc_matrix(Y_train), gzip.open(os.path.join(pkldir, 'Y_train.pkl.gz'), 'wb'))
pkl.dump(cliques, gzip.open(os.path.join(pkldir, 'cliques_train.pkl.gz'), 'wb'))
pkl.dump(X_test, gzip.open(os.path.join(pkldir, 'X_dev.pkl.gz'), 'wb'))
pkl.dump(csc_matrix(Y_test), gzip.open(os.path.join(pkldir, 'Y_dev.pkl.gz'), 'wb'))
pkl.dump(cliques, gzip.open(os.path.join(pkldir, 'cliques_trndev.pkl.gz'), 'wb'))
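
A minimal round-trip check (an illustrative addition) that the dumps just written can be loaded back with the expected shapes:

In [ ]:
X_chk = pkl.load(gzip.open(os.path.join(pkldir, 'X_train.pkl.gz'), 'rb'))
Y_chk = pkl.load(gzip.open(os.path.join(pkldir, 'Y_train.pkl.gz'), 'rb'))
assert X_chk.shape == X_train.shape and Y_chk.shape == Y_train.shape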

In [ ]:
dataset_name

In [ ]:
%%script false
print('%-20s  %-20s' % ('J1-J2', '|G1-G2|'))
#cliques = [[0], [1], [2, 3, 4], [5, 6, 7, 8, 9], [10, 11], [12, 13]]
cliques = [np.arange(Y_train.shape[1])]
Y_train = csc_matrix(Y_train)
data_helper = DataHelper(Y_train, cliques)
for i in range(10):
    w0 = 0.001 * np.random.randn((len(cliques) + Y_train.shape[1] + 1) * X_train.shape[1])
    dw1 = np.zeros_like(w0)
    dw2 = np.zeros_like(w0)
    J1 = objective_clf(w0, dw1, X_train, Y_train, C=10, p=3, cliques=cliques, data_helper=data_helper)
    J2 = obj_clf_loop(w0, dw2, X_train, Y_train, C=10, p=3, cliques=cliques, data_helper=data_helper)
    print('%-20s  %-20s' % ('%g' % (J1-J2), '%g' % np.sqrt(np.dot(dw1-dw2, dw1-dw2))))

In [ ]:
import numpy as np
import sys
from scipy.optimize import check_grad

#m, n, d = 100, 50, 20
m, n, d = 1500, 14, 103
X = np.random.randn(m, d)
w0 = 0.001 * np.random.randn(n, m).reshape(-1)

def obj(w):
    # J(W) = t' X X' t / 2, where t = column sums of W (W is n x m, X is m x d)
    assert w.shape == (n * m,)
    W = w.reshape(n, m)
    T = W.sum(axis=0)
    return T.dot(X).dot(X.T).dot(T) / 2

def grad(w):
    # since t_j = sum_i W_ij, dJ/dW_ij = (X X' t)_j for every row i,
    # so the gradient is n stacked copies of X X' t
    assert w.shape == (n * m,)
    W = w.reshape(n, m)
    T = W.sum(axis=0)
    Tg = X.dot(X.T).dot(T).reshape(-1)
    return np.tile(Tg, (n, 1)).ravel()

# `ad` builds gradient (and Hessian) functions via automatic differentiation;
# this can be very slow for a w of this dimension
from ad import gh
jac, hessian = gh(obj)

In [ ]:
check_grad(obj, jac, w0)

In [ ]:
check_grad(obj, grad, w0)

In [ ]:
check_grad_loop(obj, grad, w0)
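
Both check_grad and check_grad_loop return the absolute 2-norm of the difference between the numerical and analytic gradients; on high-dimensional problems it is often more informative to also scale this by the gradient norm (an illustrative addition):

In [ ]:
err = check_grad(obj, grad, w0)
print('absolute: %g  relative: %g' % (err, err / np.linalg.norm(grad(w0))))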