In [ ]:
%matplotlib inline
%load_ext line_profiler
%load_ext autoreload
%autoreload 2
import os, sys, time
import pickle as pkl
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.optimize import check_grad
from scipy.special import expit as sigmoid
from sklearn.base import BaseEstimator
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, make_scorer, label_ranking_loss
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import Parallel, delayed
In [ ]:
sys.path.append('src')
from evaluate import avgPrecisionK, evaluatePrecision, evaluateF1, evaluateRankingLoss, f1_score_nowarn, calcLoss
from datasets import create_dataset, dataset_names, nLabels_dict
In [ ]:
dataset_names
In [ ]:
data_ix = 2
In [ ]:
dataset_name = dataset_names[data_ix]
nLabels = nLabels_dict[dataset_name]
print(dataset_name, nLabels)
In [ ]:
data_dir = 'data'
SEED = 918273645
fmodel_base = os.path.join(data_dir, 'pn-' + dataset_name + '-base.pkl')
fmodel_prec = os.path.join(data_dir, 'pn-' + dataset_name + '-prec.pkl')
fmodel_f1 = os.path.join(data_dir, 'pn-' + dataset_name + '-f1.pkl')
Load data.
In [ ]:
X_train, Y_train = create_dataset(dataset_name, train_data=True, shuffle=True, random_state=SEED)
X_test, Y_test = create_dataset(dataset_name, train_data=False)
Feature normalisation.
In [ ]:
X_train_mean = np.mean(X_train, axis=0).reshape((1, -1))
X_train_std = np.std(X_train, axis=0).reshape((1, -1)) + 10 ** (-6)
X_train -= X_train_mean
X_train /= X_train_std
X_test -= X_train_mean
X_test /= X_train_std
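A quick sanity check of the scaling (a sketch; the exact print format is arbitrary): after standardising with the training statistics, training features should have roughly zero mean and unit standard deviation (near-constant features keep a standard deviation close to zero).
In [ ]:
# sanity check: training features should now be ~zero mean, ~unit std per dimension
print('train feature means in [%g, %g]' % (X_train.mean(axis=0).min(), X_train.mean(axis=0).max()))
print('train feature stds  in [%g, %g]' % (X_train.std(axis=0).min(), X_train.std(axis=0).max()))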
In [ ]:
def print_dataset_info(X_train, Y_train, X_test, Y_test):
N_train, D = X_train.shape
K = Y_train.shape[1]
N_test = X_test.shape[0]
print('%-45s %s' % ('Number of training examples:', '{:,}'.format(N_train)))
print('%-45s %s' % ('Number of test examples:', '{:,}'.format(N_test)))
print('%-45s %s' % ('Number of features:', '{:,}'.format(D)))
print('%-45s %s' % ('Number of labels:', '{:,}'.format(K)))
avgK_train = np.mean(np.sum(Y_train, axis=1))
avgK_test = np.mean(np.sum(Y_test, axis=1))
print('%-45s %.3f (%.2f%%)' % ('Average number of positive labels (train):', avgK_train, 100*avgK_train / K))
print('%-45s %.3f (%.2f%%)' % ('Average number of positive labels (test):', avgK_test, 100*avgK_test / K))
#print('%-45s %.4f%%' % ('Average label occurrence (train):', np.mean(np.sum(Y_train, axis=0)) / N_train))
#print('%-45s %.4f%%' % ('Average label occurrence (test):', np.mean(np.sum(Y_test, axis=0)) / N_test))
print('%-45s %.3f%%' % ('Label density (train):', 100 * np.sum(Y_train) / np.prod(Y_train.shape)))
print('%-45s %.3f%%' % ('Label density (test):', 100 * np.sum(Y_test) / np.prod(Y_test.shape)))
In [ ]:
print('%-45s %s' % ('Dataset:', dataset_name))
print_dataset_info(X_train, Y_train, X_test, Y_test)
Multi-label learning with p-norm push loss.
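For reference, the objective minimised by obj_pnorm below can be written as follows, where $K^+_n$ and $K^-_n$ are the numbers of positive and negative labels of example $n$, and $\mathbf{w}_k$ is the $k$-th row of $W$:
$$
J(W) = \frac{1}{2C}\lVert W \rVert_F^2
+ \frac{1}{N}\sum_{n=1}^{N}
\left(\frac{1}{K^+_n}\sum_{k:\,Y_{nk}=1} e^{-\mathbf{w}_k^\top \mathbf{x}_n}\right)^{p}
\left(\frac{1}{K^-_n}\sum_{k:\,Y_{nk}=0} e^{\,p\,\mathbf{w}_k^\top \mathbf{x}_n}\right)
$$
With weighting=False the $1/K^+_n$ and $1/K^-_n$ factors are replaced by 1.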
In [ ]:
def obj_pnorm(w, X, Y, C, p, weighting=True):
"""
Objective with L2 regularisation and p-norm push loss
Input:
- w: current weight vector, flattened L x D
- X: feature matrix, N x D
- Y: label matrix, N x L
- C: regularisation constant, is consistent with scikit-learn C = 1 / (N * \lambda)
- p: constant for p-norm push loss
"""
N, D = X.shape
K = Y.shape[1]
assert(w.shape[0] == K * D)
assert(p >= 1)
assert(C > 0)
W = w.reshape(K, D) # reshape weight matrix
OneN = np.ones(N) # N by 1
OneK = np.ones(K) # K by 1
if weighting is True:
#KPosAll = np.sum(Y, axis=1) # number of positive labels for each example, N by 1
KPosAll = np.dot(Y, OneK)
KNegAll = K - KPosAll # number of negative labels for each example, N by 1
else:
KPosAll = np.ones(N)
KNegAll = np.ones(N)
A_diag = np.divide(1, KPosAll) # N by 1
P_diag = np.divide(1, KNegAll) # N by 1
T1 = np.dot(X, W.T) # N by K
T1p = np.multiply(Y, T1)
T2 = np.multiply(Y, np.exp(-T1p)) # N by K
T3 = T2 * A_diag[:, None] # N by K
T4 = np.dot(T3, OneK) # N by 1
#T1n = np.multiply(1-Y, T1)
T1n = T1 - T1p
T5 = np.multiply(1-Y, np.exp(p * T1n)) # N by K
T6 = T5 * P_diag[:, None] # N by K
T7 = np.dot(T6, OneK) # N by 1
T8 = np.power(T4, p-1) # N by 1
T9 = np.power(T4, p) # N by 1
J = np.dot(w, w) * 0.5 / C + np.dot(OneN, T9 * T7) / N
T10 = T8 * T7 # N by 1
G1 = p * np.dot((T3 * T10[:, None]).T, -X) # K by D
G2 = p * np.dot((T6 * T9[:, None]).T, X) # K by D
G = W / C + (G1 + G2) / N
return (J, G.ravel())
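The gradient assembled above (G = W / C + (G1 + G2) / N) corresponds to, writing $s_n$ and $t_n$ for the two bracketed averages in the objective (T4 and T7 in the code):
$$
\frac{\partial J}{\partial \mathbf{w}_k} = \frac{\mathbf{w}_k}{C}
+ \frac{p}{N}\sum_{n=1}^{N}\left(
\mathbb{1}[Y_{nk}=0]\, s_n^{p}\, \frac{e^{\,p\,\mathbf{w}_k^\top \mathbf{x}_n}}{K^-_n}
- \mathbb{1}[Y_{nk}=1]\, s_n^{p-1}\, t_n\, \frac{e^{-\mathbf{w}_k^\top \mathbf{x}_n}}{K^+_n}
\right)\mathbf{x}_n
$$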
In [ ]:
def obj_pnorm_loop(w, X, Y, C, p, weighting=True):
"""
Objective with L2 regularisation and p-classification loss
Input:
- w: current weight vector, flattened L x D
- X: feature matrix, N x D
- Y: label matrix, N x L
- C: regularisation constant, is consistent with scikit-learn C = 1 / (N * \lambda)
- p: constant for p-classification push loss
"""
N, D = X.shape
K = Y.shape[1]
assert(w.shape[0] == K * D)
assert(p >= 1)
assert(C > 0)
W = w.reshape(K, D) # reshape weight matrix
J = 0.0 # cost
G = np.zeros_like(W) # gradient matrix
if weighting is True:
KPosAll = np.sum(Y, axis=1) # number of positive labels for each example, N by 1
KNegAll = K - KPosAll # number of negative labels for each example, N by 1
else:
KPosAll = np.ones(N)
KNegAll = np.ones(N)
pscores = np.zeros(N)
nscores = np.zeros(N)
for n in range(N):
pscore = 0.0
nscore = 0.0
for k in range(K):
score = np.dot(W[k, :], X[n, :])
if Y[n, k] == 1:
t1 = np.exp(-score) / KPosAll[n]
pscore += t1
else:
t2 = np.exp(p * score) / KNegAll[n]
nscore += t2
pscores[n] = pscore
nscores[n] = nscore
for n in range(N):
for k in range(K):
score = np.dot(W[k, :], X[n, :])
if Y[n, k] == 1:
G[k, :] = G[k,:] - (np.power(pscores[n], p-1) * nscores[n] * np.exp(-score) / KPosAll[n]) * X[n,:]
else:
G[k, :] = G[k,:] + (np.power(pscores[n], p) * np.exp(p * score) / KNegAll[n]) * X[n,:]
J = np.dot(w, w) * 0.5 / C + np.sum(np.power(pscores, p) * nscores) / N
G = W / C + G * p / N
return (J, G.ravel())
Check gradient
In [ ]:
#w0 = 0.001 * np.random.randn(Y_train.shape[1] * X_train.shape[1])
#check_grad(lambda w: obj_pnorm(w, X_train, Y_train, C=1, p=3)[0],
# lambda w: obj_pnorm(w, X_train, Y_train, C=1, p=3)[1], w0)
In [ ]:
#w0 = 0.001 * np.random.randn(Y_train.shape[1] * X_train.shape[1])
#check_grad(lambda w: obj_pnorm_loop(w, X_train, Y_train, C=1, p=2)[0],
# lambda w: obj_pnorm_loop(w, X_train, Y_train, C=1, p=2)[1], w0)
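The checks above are expensive on the full data; a cheaper sketch is to run check_grad on a tiny synthetic problem (the sizes below are arbitrary). A small returned error, roughly 1e-4 or less, indicates the analytic gradient agrees with finite differences.
In [ ]:
# a minimal sketch: check_grad on a tiny random problem so it runs in seconds
Xs = np.random.randn(20, 5)
Ys = np.zeros((20, 4))
Ys[np.arange(20), np.random.randint(0, 4, size=20)] = 1 # exactly one positive label per example
w0 = 0.001 * np.random.randn(Ys.shape[1] * Xs.shape[1])
print(check_grad(lambda w: obj_pnorm(w, Xs, Ys, C=1, p=3)[0],
                 lambda w: obj_pnorm(w, Xs, Ys, C=1, p=3)[1], w0))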
In [ ]:
def cmp_loop_vec(func_loop, func_vec, X_train, Y_train, p=1):
print('%15s %15s %15s %15s %15s' % ('C','J_Diff', 'J_loop', 'J_vec', 'G_Diff'))
w0 = 0.001 * np.random.randn(Y_train.shape[1] * X_train.shape[1])
for e in range(-6, 10):
C = 10**(e)
J, G = func_loop(w0, X_train, Y_train, C, p=p)
J1, G1 = func_vec( w0, X_train, Y_train, C, p=p)
Gdiff = G1 - G
print('%15g %15g %15g %15g %15g' % (C, J1 - J, J, J1, np.dot(Gdiff, Gdiff)))
In [ ]:
#cmp_loop_vec(obj_pnorm_loop, obj_pnorm, X_train, Y_train, p=8)
Line profiling
In [ ]:
#C = 10; p = 2
#w0 = 0.001 * np.random.randn(X_train.shape[1] * nLabels)
#%lprun -f obj_pnorm check_grad(lambda w: obj_pnorm(w, X_train, Y_train, C, p)[0], \
#                               lambda w: obj_pnorm(w, X_train, Y_train, C, p)[1], w0)
Class definition.
In [ ]:
class MLC_pnorm(BaseEstimator):
"""All methods are necessary for a scikit-learn estimator"""
def __init__(self, C=1, p=1, weighting=True):
"""Initialisation"""
assert C > 0
assert p >= 1
self.C = C
self.p = p
self.weighting = weighting
self.obj_func = obj_pnorm
self.trained = False
def fit(self, X_train, Y_train):
"""Model fitting by optimising the objective"""
opt_method = 'L-BFGS-B' #'BFGS' #'Newton-CG'
options = {'disp': 1, 'maxiter': 10**5, 'maxfun': 10**5} # , 'iprint': 99}
sys.stdout.write('\nC: %g, p: %g, weighting: %s\n' % (self.C, self.p, self.weighting))
sys.stdout.flush()
N, D = X_train.shape
K = Y_train.shape[1]
#w0 = np.random.rand(K * D + 1) - 0.5 # initial guess in range [-1, 1]
w0 = 0.001 * np.random.randn(K * D)
opt = minimize(self.obj_func, w0, args=(X_train, Y_train, self.C, self.p, self.weighting), \
method=opt_method, jac=True, options=options)
if opt.success is True:
self.W = np.reshape(opt.x, (K, D))
self.trained = True
else:
sys.stderr.write('Optimisation failed\n')
print(opt.items())
self.trained = False
def decision_function(self, X_test):
"""Make predictions (score is real number)"""
assert self.trained is True, "Can't make prediction before training"
assert X_test.shape[1] == self.W.shape[1]
return np.dot(X_test, self.W.T) # log of prediction score
def predict(self, X_test):
return self.decision_function(X_test)
# """Make predictions (score is boolean)"""
# preds = sigmoid(self.decision_function(X_test))
# #return (preds >= 0)
# assert self.TH is not None
# return preds >= self.TH
# inherit from BaseEstimator instead of re-implement
#
#def get_params(self, deep = True):
#def set_params(self, **params):
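Below, a quick smoke test of the estimator interface on tiny synthetic data (a sketch only; the sizes and hyper-parameters are arbitrary and unrelated to the experiments that follow):
In [ ]:
# smoke test: fit on a small random problem and check the output shape
Xs = np.random.randn(50, 5)
Ys = np.zeros((50, 4))
Ys[np.arange(50), np.random.randint(0, 4, size=50)] = 1 # one positive label per example
toy = MLC_pnorm(C=1, p=2)
toy.fit(Xs, Ys)
print(toy.decision_function(Xs).shape) # expect (50, 4)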
In [ ]:
def avgF1(Y_true, Y_pred):
#THs = [0, 0.05, 0.10, 0.15, 0.2, 0.25, 0.30, 0.35, 0.4, 0.45, 0.5, 0.55, 0.60, 0.65, 0.70, 0.75] # SPEN THs
THs = [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85]
F1 = Parallel(n_jobs=-1)(delayed(f1_score_nowarn)(Y_true, Y_pred >= th, average='samples') for th in THs)
bestix = np.argmax(F1)
print('best threshold: %g, best F1: %g, #examples: %g' % (THs[bestix], F1[bestix], Y_true.shape[0]))
return F1[bestix]
In [ ]:
def avgF1_0(Y_true, Y_pred):
F1 = f1_score_nowarn(Y_true, Y_pred >= 0, average='samples')
print('F1: %g, #examples: %g' % (F1, Y_true.shape[0]))
return F1
In [ ]:
clf = MLC_pnorm(C=100, p=6, weighting=True)
clf.fit(X_train, Y_train)
print(avgF1(Y_train, clf.decision_function(X_train)))
print(avgF1(Y_test, clf.decision_function(X_test)))
In [ ]:
C_set = [0.01, 0.1, 1, 10, 100, 1000] # bibtex, bookmarks level 1
p_set = [1, 2, 3, 4, 5, 6]
parameters = [{'C': C_set, 'p': p_set, 'weighting': [True]}]
#scorer = {'Prec': make_scorer(avgPrecisionK)}
scorer = {'F1': make_scorer(avgF1)}
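# note: make_scorer(avgF1) calls the estimator's predict(), which here returns real-valued
# decision scores, so avgF1 searches its threshold grid over those scores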
In [ ]:
clf = GridSearchCV(MLC_pnorm(), parameters, scoring=scorer, cv=5, n_jobs=1, refit='F1')
clf.fit(X_train, Y_train)
#pkl.dump(clf, open(fmodel_f1, 'wb'))
In [ ]:
clf.cv_results_['mean_test_F1'].reshape(len(C_set), len(p_set))
In [ ]:
clf.best_params_
In [ ]:
C_set = [100, 300, 600, 900, 1000] # bibtex level 2
p_set = [3, 4, 5, 6, 7, 8]
parameters = [{'C': C_set, 'p': p_set, 'weighting': [True]}]
In [ ]:
clf2 = GridSearchCV(MLC_pnorm(), parameters, scoring=scorer, cv=5, n_jobs=1, refit='F1')
clf2.fit(X_train, Y_train)
#pkl.dump(clf, open(fmodel_f1, 'wb'))
In [ ]:
clf2.cv_results_['mean_test_F1'].reshape(len(C_set), len(p_set))
In [ ]:
clf2.best_params_
In [ ]:
C_set = [] # TODO: fill in the C values to search for this level before running this grid
p_set = [4, 5, 6, 7, 8]
parameters = [{'C': C_set, 'p': p_set, 'weighting': [True]}]
In [ ]:
clf3 = GridSearchCV(MLC_pnorm(), parameters, scoring=scorer, cv=5, n_jobs=1, refit='F1')
clf3.fit(X_train, Y_train)
#pkl.dump(clf, open(fmodel_f1, 'wb'))
In [ ]:
clf3.cv_results_['mean_test_F1'].reshape(len(C_set), len(p_set))
In [ ]:
clf3.best_params_
In [ ]:
print(avgF1(Y_train, clf2.decision_function(X_train)))
print(avgF1(Y_test, clf2.decision_function(X_test)))
In [ ]:
pkl.dump(clf2, open(fmodel_f1, 'wb'))
In [ ]:
print('Train (' + dataset_name + '):', avgF1(Y_train, clf2.decision_function(X_train))); print()
print('Test (' + dataset_name + '):', avgF1(Y_test, clf2.decision_function(X_test)))
In [ ]:
# use the test-set threshold of the best hyper-parameters from the cross-validation above
# (0.8 for both the bibtex and bookmarks datasets)
threshold = 0.7
print('average F1:', f1_score_nowarn(Y_test, clf2.decision_function(X_test) >= threshold, average='samples'))
In [ ]:
for C in np.arange(1, 10):
fname = os.path.join(data_dir, 'pnorm/pn-bookmarks-%g-f1.pkl' % C)
pn = pkl.load(open(fname, 'rb'))
print('C: %g' % C)
print(pn.cv_results_['mean_test_F1'])
input('Press Enter to continue')
In [ ]: