In [1]:
%matplotlib inline
%load_ext line_profiler
%load_ext memory_profiler
%load_ext autoreload
%autoreload 2
import os, sys, time
import pickle as pkl
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.optimize import check_grad
from scipy.special import logsumexp
from scipy.special import expit as sigmoid
from sklearn.base import BaseEstimator
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, f1_score, make_scorer, label_ranking_loss
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import seaborn as sns
from joblib import Parallel, delayed
In [32]:
sys.path.append('src')
from evaluate import avgPrecisionK, evaluatePrecision, evaluateF1, evaluateRankingLoss, f1_score_nowarn, calcLoss
from datasets import create_dataset, dataset_names, nLabels_dict
from util import plot_loss
In [3]:
dataset_names
Out[3]:
In [4]:
data_ix = 2
In [5]:
dataset_name = dataset_names[data_ix]
nLabels = nLabels_dict[dataset_name]
print(dataset_name, nLabels)
In [6]:
data_dir = 'data'
SEED = 918273645
fmodel_base = os.path.join(data_dir, 'bp-' + dataset_name + '-base.pkl')
fmodel_prec = os.path.join(data_dir, 'bp-' + dataset_name + '-prec.pkl')
fperf_base = os.path.join(data_dir, 'perf-bp-base.pkl')
fperf_prec = os.path.join(data_dir, 'perf-bp-prec.pkl')
Load dataset.
In [7]:
X_train, Y_train = create_dataset(dataset_name, train_data=True, shuffle=True, random_state=SEED)
X_test, Y_test = create_dataset(dataset_name, train_data=False)
Feature normalisation.
In [8]:
X_train_mean = np.mean(X_train, axis=0).reshape((1, -1))
X_train_std = np.std(X_train, axis=0).reshape((1, -1)) + 10 ** (-6)
X_train -= X_train_mean
X_train /= X_train_std
X_test -= X_train_mean
X_test /= X_train_std
In [9]:
def print_dataset_info(X_train, Y_train, X_test, Y_test):
N_train, D = X_train.shape
K = Y_train.shape[1]
N_test = X_test.shape[0]
print('%-45s %s' % ('Number of training examples:', '{:,}'.format(N_train)))
print('%-45s %s' % ('Number of test examples:', '{:,}'.format(N_test)))
print('%-45s %s' % ('Number of features:', '{:,}'.format(D)))
print('%-45s %s' % ('Number of labels:', '{:,}'.format(K)))
avgK_train = np.mean(np.sum(Y_train, axis=1))
avgK_test = np.mean(np.sum(Y_test, axis=1))
print('%-45s %.3f (%.2f%%)' % ('Average number of positive labels (train):', avgK_train, 100*avgK_train / K))
print('%-45s %.3f (%.2f%%)' % ('Average number of positive labels (test):', avgK_test, 100*avgK_test / K))
#print('%-45s %.4f%%' % ('Average label occurrence (train):', np.mean(np.sum(Y_train, axis=0)) / N_train))
#print('%-45s %.4f%%' % ('Average label occurrence (test):', np.mean(np.sum(Y_test, axis=0)) / N_test))
print('%-45s %.3f%%' % ('Sparsity (percent) (train):', 100 * np.sum(Y_train) / np.prod(Y_train.shape)))
print('%-45s %.3f%%' % ('Sparsity (percent) (test):', 100 * np.sum(Y_test) / np.prod(Y_test.shape)))
In [10]:
print('%-45s %s' % ('Dataset:', dataset_name))
print_dataset_info(X_train, Y_train, X_test, Y_test)
Multi-label learning with bottom push loss.
In [11]:
def obj_bottompush(w, X, Y, C, r=1, weighting=True):
"""
Objective of bottom push loss for examples
Input:
- w: current weight vector, flattened L x D
- X: feature matrix, N x D
- Y: label matrix, N x K
- r: parameter for log-sum-exp approximation
- weighting: if True, divide K+ in top-push loss
"""
N, D = X.shape
K = Y.shape[1]
assert(w.shape[0] == K * D)
assert(r > 0)
W = w.reshape(K, D) # theta
J = 0.0 # cost
G = np.zeros_like(W) # gradient matrix
if weighting is True:
KNegAll = np.sum(1-Y, axis=1) # number of negative labels for each example, N by 1
else:
KNegAll = np.ones(N)
A1_diag = 1.0 / KNegAll
A1Y = (1-Y) * A1_diag[:, None]
T1 = np.dot(X, W.T) # N by K
m0 = 0.5 * (np.max(T1) + np.min(T1))
m1 = r * m0
#print('----------------')
#print(np.min(T1), np.max(T1), m0)
#print(np.min(r*T1), np.max(r*T1), m1)
#print(np.min(r * T1 - m1), np.max(r * T1 - m1))
T2 = np.multiply(Y, np.exp(-r * T1 + m1)) # N by K
B1_tilde_diag = np.dot(T2, np.ones(K))
#print(np.max(B_tilde_diag), np.min(B_tilde_diag)) # big numbers here, can cause overflow in T3
#T3 = np.exp(-T1 + m0) * np.power(B_tilde_diag, 1.0 / r)[:, None]
#T4 = np.multiply(AY, np.log1p(T3))
T3 = (T1 - m0) + (1.0 / r) * np.log(B1_tilde_diag)[:, None]
#print(np.min(T3), np.max(T3))
m2 = 0.5 * (np.min(T3) + np.max(T3))
#T4 = np.logaddexp(0, T3)
T4 = np.logaddexp(-m2, T3-m2) + m2
T5 = np.multiply(A1Y, T4)
J = np.dot(w, w) * 0.5 / C + np.dot(np.ones(N), np.dot(T5, np.ones(K))) / N
#T5 = 1.0 / (1.0 + np.divide(1.0, T3))
#T5 = np.divide(T3, 1 + T3)
T6 = np.exp(T3 - T4)
G1 = np.dot(np.multiply(A1Y, T6).T, X)
#print(np.max(T2), np.min(T2), np.max(T7), np.min(T7))
O1_diag = np.dot(np.multiply(1-Y, T6), np.ones(K))
T7 = A1_diag * (1.0 / B1_tilde_diag) * O1_diag
T8 = T2 * T7[:, None]
G2 = np.dot(T8.T, -X)
G = W / C + (G1 + G2) / N
return (J, G.ravel())
In [12]:
def obj_bottompush_loop(w, X, Y, C=1, r=1, weighting=True):
"""
Objective of bottom push loss for examples
Input:
- w: current weight vector, flattened L x D
- X: feature matrix, N x D
- Y: label matrix, N x K
- C: regularisation constant, C = 1 / lambda
- r: parameter for log-sum-exp approximation
"""
N, D = X.shape
K = Y.shape[1]
assert(w.shape[0] == K * D)
assert(r > 0)
W = w.reshape(K, D) # theta
J = 0.0 # cost
G = np.zeros_like(W) # gradient matrix
if weighting is True:
KNegAll = np.sum(1-Y, axis=1) # number of negative labels for each example, N by 1
else:
KNegAll = np.ones(N)
for n in range(N):
for k in range(K):
if Y[n, k] == 0:
s1 = np.sum([np.exp(-r * np.dot(W[i, :] - W[k, :], X[n, :])) for i in range(K) if Y[n, i] == 1])
J += np.log1p(np.power(s1, 1.0 / r)) / KNegAll[n]
J = np.dot(w, w) * 0.5 / C + J / N
for k in range(K):
for n in range(N):
if Y[n, k] == 0:
t1 = np.sum([np.exp(-r * np.dot(W[i, :] - W[k, :], X[n, :])) for i in range(K) if Y[n, i] == 1])
t2 = 1.0 / (1 + np.power(t1, -1.0 / r))
G[k, :] = G[k, :] + X[n, :] * t2 / KNegAll[n]
else:
sk = 0.0
for k1 in range(K):
if Y[n, k1] == 0:
t3 = np.sum([np.exp(-r * np.dot(W[i,:] - W[k1, :], X[n, :])) \
for i in range(K) if Y[n, i] == 1])
t4 = np.exp(-r * np.dot(W[k, :] - W[k1, :], X[n, :]))
sk += t4 / (np.power(t3, 1.0 - 1.0 / r) + t3)
G[k, :] = G[k, :] - X[n, :] * sk / KNegAll[n]
G = W / C + G / N
return (J, G.ravel())
Check gradient
In [13]:
#w0 = 0.001 * np.random.randn(Y_train.shape[1] * X_train.shape[1])
#check_grad(lambda w: obj_bottompush(w, X_train, Y_train, C=10, r=8)[0],
# lambda w: obj_bottompush(w, X_train, Y_train, C=10, r=8)[1], w0)
In [14]:
def cmp_loop_vec(func_loop, func_vec, X_train, Y_train, r=8):
print('%15s %15s %15s %15s %15s' % ('C','J_Diff', 'J_loop', 'J_vec', 'G_Diff'))
w0 = 0.001 * np.random.randn(Y_train.shape[1] * X_train.shape[1])
for e in range(-6, 10):
C = 10**(e)
#w0 = init_var(X_train, Y_train)
J, G = func_loop(w0, X_train, Y_train, C, r=r)
J1, G1 = func_vec(w0, X_train, Y_train, C, r=r)
Gdiff = G1 - G
#print('%-15g %-15g %-15g' % (J1 - J, J, J1))
print('%15g %15g %15g %15g %15g' % (C, J1 - J, J, J1, np.dot(Gdiff, Gdiff)))
In [15]:
def check_grad_loop(func, X_train, Y_train, r=4):
w0 = 0.001 * np.random.randn(Y_train.shape[1] * X_train.shape[1])
eps = 1.49e-08
w = np.zeros_like(w0)
for i in range(len(w0)):
sys.stdout.write('\r%d / %d' % (i+1, len(w0)))
wi1 = w0.copy()
wi2 = w0.copy()
wi1[i] = wi1[i] - eps
wi2[i] = wi2[i] + eps
J1, _ = func(wi1, X_train, Y_train, r=r)
J2, _ = func(wi2, X_train, Y_train, r=r)
w[i] = (J2 - J1) / (2 * eps)
#print(w[i])
J, w1 = obj_toppush_loop(w0, X_train, Y_train, C)
diff = w1 - w
return np.sqrt(np.dot(diff, diff))
In [16]:
#cmp_loop_vec(obj_bottompush_loop, obj_bottompush, X_train, Y_train)
In [26]:
class MLC_bottompush(BaseEstimator):
"""All methods are necessary for a scikit-learn estimator"""
def __init__(self, C=1, r=1, weighting=True):
"""Initialisation"""
assert C > 0
assert r > 0
assert type(weighting) == bool
self.C = C
self.r = r
self.weighting = weighting
self.obj_func = obj_bottompush
self.trained = False
def fit(self, X_train, Y_train):
"""Model fitting by optimising the objective"""
opt_method = 'L-BFGS-B' #'BFGS' #'Newton-CG'
options = {'disp': 1, 'maxiter': 10**5, 'maxfun': 10**5} # , 'iprint': 99}
print('\nC: %g, r: %g, weighting: %s' % (self.C, self.r, self.weighting))
N, D = X_train.shape
K = Y_train.shape[1]
#w0 = np.random.rand(K * D) - 0.5 # initial guess in range [-1, 1]
w0 = 0.001 * np.random.randn(K * D)
opt = minimize(self.obj_func, w0, args=(X_train, Y_train, self.C, self.r, self.weighting), \
method=opt_method, jac=True, options=options)
if opt.success is True:
self.W = np.reshape(opt.x, (K, D))
self.trained = True
else:
sys.stderr.write('Optimisation failed')
print(opt.items())
self.trained = False
def decision_function(self, X_test):
"""Make predictions (score is real number)"""
assert self.trained is True, "Can't make prediction before training"
D = X_test.shape[1]
return np.dot(X_test, self.W.T)
def predict(self, X_test):
return self.decision_function(X_test)
# """Make predictions (score is boolean)"""
# preds = sigmoid(self.decision_function(X_test))
# #return (preds >= 0)
# assert self.TH is not None
# return preds >= self.TH
# inherit from BaseEstimator instead of re-implement
#
#def get_params(self, deep = True):
#def set_params(self, **params):
In [18]:
def dump_results(predictor, X_train, Y_train, X_test, Y_test, fname, rankingLoss=False):
"""
Compute and save performance results
"""
preds_train = predictor.decision_function(X_train)
preds_test = predictor.decision_function(X_test)
print('Training set:')
perf_dict_train = evaluatePrecision(Y_train, preds_train, verbose=1)
print()
print('Test set:')
perf_dict_test = evaluatePrecision(Y_test, preds_test, verbose=1)
if rankingLoss is True:
print()
print('Training set:')
perf_dict_train.update(evaluateRankingLoss(Y_train, preds_train))
print(label_ranking_loss(Y_train, preds_train))
print()
print('Test set:')
perf_dict_test.update(evaluateRankingLoss(Y_test, preds_test))
print(label_ranking_loss(Y_test, preds_test))
# compute F1 score w.r.t. different thresholds
#TH1 = predictor.cv_results_['mean_test_TH'][clf.best_index_]
#TH2 = np.mean(Y_train, axis=0)
#TH3 = np.mean(TH2)
#preds_train_bin = sigmoid(preds_train)
#preds_test_bin = sigmoid(preds_test)
#F1_train1 = f1_score_nowarn(Y_train, sigmoid(preds_train) >= TH1, average='samples')
#F1_test1 = f1_score_nowarn(Y_test, sigmoid(preds_test) >= TH1, average='samples')
#print('\nTrain: %.4f, %f' % (F1_train1, f1_score(Y_train, sigmoid(preds_train) >= TH1, average='samples')))
#print('\nTest : %.4f, %f' % (F1_test1, f1_score(Y_test, sigmoid(preds_test) >= TH1, average='samples')))
#F1_train2 = f1_score_nowarn(Y_train, (preds_train_bin - TH2) >= 0, average='samples')
#F1_test2 = f1_score_nowarn(Y_test, (preds_test_bin - TH2) >= 0, average='samples')
#print('\nTrain: %.4f, %f' % (F1_train2, f1_score(Y_train, (preds_train_bin - TH2) >= 0, average='samples')))
#print('\nTest : %.4f, %f' % (F1_test2, f1_score(Y_test, (preds_test_bin - TH2) >= 0, average='samples')))
#F1_train3 = f1_score_nowarn(Y_train, preds_train_bin >= TH3, average='samples')
#F1_test3 = f1_score_nowarn(Y_test, preds_test_bin >= TH3, average='samples')
#print('\nTrain: %.4f, %f' % (F1_train3, f1_score(Y_train, preds_train_bin >= TH3, average='samples')))
#print('\nTest : %.4f, %f' % (F1_test3, f1_score(Y_test, preds_test_bin >= TH3, average='samples')))
#perf_dict_train.update({'F1': [(F1_train1,), (F1_train2,), (F1_train3,)]})
#perf_dict_test.update( {'F1': [(F1_test1,), (F1_test2,), (F1_test3,)]})
#perf_dict_train.update({'F1': [(F1_train2,), (F1_train3,)]})
#perf_dict_test.update( {'F1': [(F1_test2,), (F1_test3,)]})
perf_dict = {'Train': perf_dict_train, 'Test': perf_dict_test}
if os.path.exists(fname):
_dict = pkl.load(open(fname, 'rb'))
if dataset_name not in _dict:
_dict[dataset_name] = perf_dict
else:
_dict = {dataset_name: perf_dict}
pkl.dump(_dict, open(fname, 'wb'))
print()
print(pkl.load(open(fname, 'rb')))
In [19]:
old_settings = np.seterr(all='ignore') # seterr to known value
np.seterr(all='raise')
#np.seterr(all='ignore')
#np.seterr(**old_settings) # restore settings
Out[19]:
In [20]:
#%memit model.fit(X_train[:30], Y_train[:30])
#%mprun -f minimize model.fit(X_train[:100], Y_train[:100])
#%mprun -f _minimize_slsqp model.fit(X_train[:10], Y_train[:10])
Default model.
In [21]:
%%script false
if os.path.exists(fmodel_base):
clf = pkl.load(open(fmodel_base, 'rb'))
else:
clf = clf = MLC_hybrid()
clf.fit(X_train, Y_train)
pkl.dump(clf, open(fmodel_base, 'wb'))
In [22]:
#dump_results(clf, X_train, Y_train, X_test, Y_test, fperf_base)
Cross validation w.r.t. average precision@K.
In [23]:
#ranges = range(-6, 7)
#ranges = range(-6, 5)
#parameters = [{'C': sorted([10**(e) for e in ranges] + [3 * 10**(e) for e in ranges]),
parameters = [{'C': [1e-3, 3e-3, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100, 300, 1e3, 3e3],
'r': [8],
'weighting': [True, False],
}]
scorer = {'Prec': make_scorer(avgPrecisionK)}
In [27]:
if not os.path.exists(fmodel_prec):
clf = GridSearchCV(MLC_bottompush(), parameters, scoring=scorer, cv=5, n_jobs=1, refit='Prec')
clf.fit(X_train, Y_train)
#pkl.dump(clf, open(fmodel_prec, 'wb'))
else:
clf = pkl.load(open(fmodel_prec, 'rb'))
In [28]:
dump_results(clf, X_train, Y_train, X_test, Y_test, fperf_prec, rankingLoss=True)
In [ ]:
pkl.dump(clf, open(fmodel_prec, 'wb'))
In [29]:
preds_train = clf.decision_function(X_train)
tploss_train = calcLoss(Y_train, preds_train, 'TopPush', njobs=4)
pak_train = calcLoss(Y_train, preds_train, 'Precision@K', njobs=4)
In [30]:
preds_test = clf.decision_function(X_test)
tploss_test = calcLoss(Y_test, preds_test, 'TopPush', njobs=4)
pak_test = calcLoss(Y_test, preds_test, 'Precision@K', njobs=4)
In [33]:
plot_loss(tploss_train, pak_train, 'Training set (' + dataset_name + ')')
In [ ]:
tploss_train.shape
In [ ]:
np.mean(tploss_train != 1-pak_train)
In [ ]:
tploss_test.shape
In [34]:
plot_loss(tploss_test, pak_test, 'Test set (' + dataset_name + ')')
In [ ]:
clf2 = MLC_hybrid(C=300, C1=2, r=8, weighting=True)
clf2.fit(X_train, Y_train)
In [ ]:
dump_results(clf2, X_train, Y_train, X_test, Y_test, fperf_prec, rankingLoss=False)
In [ ]:
#f1_score_nowarn(Y_test, clf.decision_function(X_test) > 0, average='samples')