In [ ]:
%matplotlib inline
import os, sys, time
import pickle as pkl
import numpy as np
import pandas as pd
import sklearn as sk
import cython
import itertools
from scipy.io import arff
from scipy.optimize import minimize
from scipy.optimize import check_grad
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
In [ ]:
data_dir = 'data'
#yeast_ftrain = os.path.join(data_dir, 'yeast/yeast-train.arff')
#yeast_ftest = os.path.join(data_dir, 'yeast/yeast-test.arff')
#bibtex_ftrain = os.path.join(data_dir, 'bibtex/bibtex-train.arff')
#bibtex_ftest = os.path.join(data_dir, 'bibtex/bibtex-test.arff')
#fbookmarks = os.path.join(data_dir, 'bookmarks/bookmarks.arff')
mm_ftrain = os.path.join(data_dir, 'mediamill/mediamill-train.arff')
mm_ftest = os.path.join(data_dir, 'mediamill/mediamill-test.arff')
In [ ]:
SEED = 123456789
Load the Mediamill dataset (the Yeast and Bibtex loaders are kept commented out above).
In [ ]:
#data_train, meta_train = arff.loadarff(yeast_ftrain)
#data_train, meta_train = arff.loadarff(open(bibtex_ftrain))
#data_bookmarks = arff.loadarff(open(fbookmarks))
data_train, meta_train = arff.loadarff(mm_ftrain)
In [ ]:
#data_test, meta_test = arff.loadarff(yeast_ftest)
#data_test, meta_test = arff.loadarff(bibtex_ftest)
data_test, meta_test = arff.loadarff(mm_ftest)
In [ ]:
type(data_train)
In [ ]:
print(data_train[0])
In [ ]:
len(list(data_train[0]))
In [ ]:
len(list(data_train[0])[120:])
In [ ]:
len(list(data_train[0])[:120])
Features
In [ ]:
#nFeatures = np.array(list(data_train[0])[:-14], dtype=np.float).shape[0]
nFeatures = np.array(list(data_train[0])[:120], dtype=float).shape[0]
print('#features:', nFeatures)
In [ ]:
#np.array(list(data_train[0])[:-14], dtype=np.float)
Labels
In [ ]:
#nLabels = np.array(list(data_train[0])[-14:], dtype=np.int).shape[0]
nLabels = np.array(list(data_train[0])[120:], dtype=int).shape[0]
print('#labels:', nLabels)
In [ ]:
#np.array(list(data_train[0])[-14:], dtype=np.int)
In [ ]:
print('#training examples:', len(data_train))
In [ ]:
print('#test examples:', len(data_test))
Histogram of #positive labels.
In [ ]:
#nPositives = [np.sum(np.array(list(data_train[ix])[-14:], dtype=np.int)) for ix in range(len(data_train))]
nPositives = [np.sum(np.array(list(data_train[ix])[120:], dtype=int)) for ix in range(len(data_train))]
In [ ]:
pd.Series(nPositives).hist(bins=10)
In [ ]:
def create_dataset(label_ix, data):
"""
Create the labelled dataset for a given label index
Input:
- label_ix: label index, number in {0, ..., nLabels-1}
- data: original data with features + labels
Output:
- (Feature, Label) pair (X, y)
X comprises the features for each example
y comprises the labels of the corresponding example
"""
assert(label_ix >= 0)
assert(label_ix < nLabels)
N = len(data)
d = nFeatures
#magic = -14
magic = 120
X = np.zeros((N, d), dtype=float)
y = np.zeros(N, dtype=int)
for i in range(N):
X[i, :] = list(data[i])[:magic]
y[i] = list(data[i])[magic:][label_ix]
return X, y
In [ ]:
def create_dataset_v2(data):
"""
Create the labelled dataset with all labels
Input:
- data: original data with features + labels
Output:
- (Feature, Label) pair (X, y)
X comprises the features for each example
Y comprises the labels of the corresponding example
"""
N = len(data)
D = nFeatures
L = nLabels
#magic = -14
magic = 120
X = np.zeros((N, D), dtype=float)
Y = np.zeros((N, L), dtype=int)
for i in range(N):
X[i, :] = list(data[i])[:magic]
Y[i, :] = list(data[i])[magic:]
return X, Y
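The row-by-row Python conversion above is slow on a dataset of Mediamill's size. Below is a possible vectorised alternative via pandas; it is a sketch with a hypothetical name, assuming the label columns come back from scipy's ARFF loader as byte strings b'0'/b'1' (its usual behaviour for nominal {0,1} attributes).
In [ ]:
def create_dataset_v2_fast(data):
    """Vectorised variant of create_dataset_v2 (untested sketch)."""
    df = pd.DataFrame(data)
    X = df.iloc[:, :nFeatures].values.astype(float)
    # nominal labels load as bytes, so compare against b'1' instead of casting
    Y = (df.iloc[:, nFeatures:] == b'1').values.astype(int)
    return X, Y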
The sigmoid function.
In [ ]:
def sigmoid(x):
return 1.0 / (1.0 + np.exp(-x))
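Note that np.exp(-x) overflows for large negative x; scipy.special.expit computes the same function stably and could be swapped in:
In [ ]:
from scipy.special import expit
x_demo = np.array([-1000., 0., 1000.])  # sigmoid(x_demo) would trigger an overflow warning
expit(x_demo)                           # array([0. , 0.5, 1. ])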
Loss between a ground truth and a prediction.
In [ ]:
def evalPred(truth, pred, lossType = 'Hamming'):
"""
Compute loss given ground truth and prediction
Input:
- truth: binary array of true labels
- pred: real-valued array of predictions
- lossType: one of 'Subset01', 'Hamming', 'Ranking', 'Precision@K' (K = #positive labels), 'Precision@3', 'Precision@5'
"""
assert(len(truth) == len(pred))
L = len(truth)
nPos = np.sum(truth)
predBin = np.array((pred > 0), dtype=int)
if lossType == 'Subset01':
return 1 - int(np.all(truth == predBin))
elif lossType == 'Hamming':
return np.sum(truth != predBin) / L
elif lossType == 'Ranking':
    # count mis-ordered (positive, negative) pairs, whichever index comes first
    loss = 0
    for i in range(L-1):
        for j in range(i+1, L):
            if truth[i] == truth[j]:
                continue
            hi, lo = (i, j) if truth[i] > truth[j] else (j, i)
            if pred[hi] < pred[lo]:
                loss += 1
            elif pred[hi] == pred[lo]:
                loss += 0.5
    #return loss / (nPos * (L-nPos))
    return loss
elif lossType == 'Precision@K':
# sorted indices of the labels most likely to be +'ve
idx = np.argsort(pred)[::-1]
# true labels according to the sorted order
y = truth[idx]
# fraction of +'ves in the top K predictions
return np.mean(y[:nPos]) if nPos > 0 else 0
elif lossType == 'Precision@3':
# sorted indices of the labels most likely to be +'ve
idx = np.argsort(pred)[::-1]
# true labels according to the sorted order
y = truth[idx]
# fraction of +'ves in the top K predictions
return np.mean(y[:3])
elif lossType == 'Precision@5':
# sorted indices of the labels most likely to be +'ve
idx = np.argsort(pred)[::-1]
# true labels according to the sorted order
y = truth[idx]
# fraction of +'ves in the top K predictions
return np.mean(y[:5])
else:
    raise ValueError('unknown lossType: %s' % lossType)
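A quick sanity check of evalPred on a small hand-worked example:
In [ ]:
truth_demo = np.array([1, 0, 1, 0])
pred_demo = np.array([0.9, 0.2, -0.3, -0.8])  # binarised: [1, 1, 0, 0]
print(evalPred(truth_demo, pred_demo, 'Hamming'))      # 2 of 4 labels wrong -> 0.5
print(evalPred(truth_demo, pred_demo, 'Subset01'))     # not an exact match -> 1
print(evalPred(truth_demo, pred_demo, 'Ranking'))      # one mis-ordered pos-neg pair -> 1
print(evalPred(truth_demo, pred_demo, 'Precision@K'))  # top-2 contains one positive -> 0.5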
In [ ]:
def avgPrecisionK(allTruths, allPreds):
losses = []
lossType = 'Precision@K'
for i in range(allPreds.shape[0]):
pred = allPreds[i, :]
truth = allTruths[i, :]
losses.append(evalPred(truth, pred, lossType))
return np.mean(losses)
In [ ]:
def printEvaluation(allTruths, allPreds):
N = allTruths.shape[0]
print('#examples:', N)
for lossType in ['Precision@K']:
# ['Subset01', 'Hamming', 'Ranking', 'Precision@K', 'Precision@3', 'Precision@5']:
losses = [ ]
for i in range(allPreds.shape[0]):
pred = allPreds[i, :]
truth = allTruths[i, :]
losses.append(evalPred(truth, pred, lossType))
#print(allPreds[i])
#print(pred)
#print(truth)
#break
#print('%24s: %1.4f' % ('Average %s Loss' % lossType, np.mean(losses)))
print('%s: %1.4f, %.3f' % ('Average %s' % lossType, np.mean(losses), np.std(losses) / np.sqrt(N)))
#plt.hist(aucs, bins = 10);
Train a logistic regression model for each label.
In [ ]:
classifiers = [ LogisticRegression(class_weight = 'balanced', C = 10**0) for i in range(nLabels) ]
#classifiers = [ LogisticRegression(class_weight = 'balanced', C = 10) for i in range(nLabels) ]
In [ ]:
allPreds_train = [ ]
allPreds_test = [ ]
allTruths_train = [ ]
allTruths_test = [ ]
coefMat = [ ]
labelIndices = [ ]
for label_ix in range(nLabels):
print('Training for Label %d' % (label_ix+1))
X_train, y_train = create_dataset(label_ix, data = data_train)
X_test, y_test = create_dataset(label_ix, data = data_test)
allTruths_train.append(y_train)
allTruths_test.append(y_test)
assert( (not np.all(y_train == 0)) and (not np.all(y_train == 1)) )
classifiers[label_ix].fit(X_train, y_train)
allPreds_train.append(classifiers[label_ix].decision_function(X_train))
allPreds_test.append(classifiers[label_ix].decision_function(X_test))
coefMat.append(classifiers[label_ix].coef_.reshape(-1))
#labelIndices.append(label_ix)
#print(classifiers[label_ix].coef_)
#print(classifiers[label_ix].intercept_)
In [ ]:
allTruths_train = np.array(allTruths_train).T
allTruths_test = np.array(allTruths_test).T
allPreds_train = np.array(allPreds_train).T
allPreds_test = np.array(allPreds_test).T
print(allPreds_test.shape)
print(allTruths_test.shape)
In [ ]:
#allPreds[0]
In [ ]:
print('Training set:')
printEvaluation(allTruths_train, allPreds_train)
In [ ]:
print('Test set:')
printEvaluation(allTruths_test, allPreds_test)
Coefficient matrix, shape (#features, #labels).
In [ ]:
coefMat = np.array(coefMat).T
In [ ]:
coefMat.shape
In [ ]:
#sns.heatmap(coefMat[:, :30])
Train a regression model with exponential loss for each label.
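Encoding labels as $s_n = 2y_n - 1 \in \{-1, +1\}$, the objective implemented below is
$$J(\mathbf{w}) = \frac{C}{2}\,\mathbf{w}^\top\mathbf{w} + \frac{1}{N}\sum_{n=1}^N \exp\!\left(-s_n\,\mathbf{w}^\top\mathbf{x}_n\right).$$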
In [ ]:
def obj_exp(w, X, y, C):
"""
Objective with L2 regularisation and exponential loss
Input:
- w: current weight vector
- X: feature matrix, N x D
- y: label vector, N x 1
- C: regularisation constant
"""
assert(len(y) == X.shape[0])
assert(len(w) == X.shape[1])
assert(C >= 0)
N, D = X.shape
J = 0.0 # cost
g = np.zeros_like(w) # gradient
for n in range(N):
x = X[n, :]
prod = np.dot(w, x)
# negative label
if y[n] == 0:
t1 = np.exp(prod)
J += t1
g = g + t1 * x
# positive label
else:
t2 = np.exp(-prod)
J += t2
g = g - t2 * x
J = 0.5 * C * np.dot(w, w) + J / N
g = C * w + g / N
return (J, g)
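The per-example loop is slow in pure Python. Using the $\pm 1$ encoding above, a vectorised sketch that computes the same cost and gradient (up to floating-point rounding):
In [ ]:
def obj_exp_vec(w, X, y, C):
    """Vectorised equivalent of obj_exp (sketch)."""
    s = 2.0 * y - 1.0                    # map {0,1} labels to {-1,+1}
    t = np.exp(-s * X.dot(w))            # per-example exponential losses
    J = 0.5 * C * np.dot(w, w) + np.mean(t)
    g = C * w - X.T.dot(s * t) / len(y)  # gradient of the same objective
    return (J, g)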
Check gradient.
In [ ]:
#%%script false
X_train_, y_train_ = create_dataset(3, data = data_train)
w0 = np.random.rand(X_train_.shape[1])
C = 1
check_grad(lambda w: obj_exp(w, X_train_, y_train_, C)[0], \
lambda w: obj_exp(w, X_train_, y_train_, C)[1], w0)
In [ ]:
params = [ ]
allPreds_train = [ ]
allPreds_test = [ ]
allTruths_train = [ ]
allTruths_test = [ ]
np.random.seed(SEED)
C = 1
for label_ix in range(nLabels):
#sys.stdout.write('\r%d / %d' % (label_ix + 1, nLabels))
#sys.stdout.flush()
print('\r%d / %d ' % (label_ix + 1, nLabels))
X_train, y_train = create_dataset(label_ix, data = data_train)
X_test, y_test = create_dataset(label_ix, data = data_test)
allTruths_train.append(y_train)
allTruths_test.append(y_test)
assert( (not np.all(y_train == 0)) and (not np.all(y_train == 1)) )
opt_method = 'BFGS' #'Newton-CG'
#opt_method = 'nelder-mead'
options = {'disp': True}
w0 = np.random.rand(X_train.shape[1]) # initial guess
opt = minimize(obj_exp, w0, args=(X_train, y_train, C), method=opt_method, jac=True, options=options)
if opt.success:
w = opt.x
params.append(w)
#allPreds.append(sigmoid(np.dot(X_test, w)))
allPreds_train.append(np.dot(X_train, w))
allPreds_test.append(np.dot(X_test, w))
else:
    sys.stderr.write('Optimisation failed, label_ix=%d\n' % label_ix)
    # fall back to zero weights so the prediction arrays keep one column per label
    w = np.zeros(X_train.shape[1])
    params.append(w)
    allPreds_train.append(np.dot(X_train, w))
    allPreds_test.append(np.dot(X_test, w))
In [ ]:
allTruths_train = np.array(allTruths_train).T
allTruths_test = np.array(allTruths_test).T
allPreds_train = np.array(allPreds_train).T
allPreds_test = np.array(allPreds_test).T
print(allPreds_test.shape)
print(allTruths_test.shape)
In [ ]:
#allPreds[0]
In [ ]:
print('Training set:')
printEvaluation(allTruths_train, allPreds_train)
In [ ]:
print('Test set:')
printEvaluation(allTruths_test, allPreds_test)
Train a bipartite ranking model for each label.
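For a single label, with positive example indices $P$ and negative indices $\bar{P}$, the objective below is the mean logistic ranking loss over positive-negative pairs:
$$J(\mathbf{w}) = \frac{C}{2}\,\mathbf{w}^\top\mathbf{w} + \frac{1}{|P|\,|\bar{P}|}\sum_{i \in P}\sum_{j \in \bar{P}} \log\!\left(1 + e^{-(\mathbf{w}^\top\mathbf{x}_i - \mathbf{w}^\top\mathbf{x}_j)}\right).$$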
In [ ]:
#%load_ext Cython
In [ ]:
#%%cython -a
import numpy as np
#cimport numpy as np
#cpdef obj_biranking(w, X, y):
def obj_biranking(w, X, y, C):
"""
Objective with L2 regularisation and bipartite ranking loss
Input:
- w: current weight vector
- X: feature matrix, N x D
- y: label vector, N x 1
- C: regularisation constant
"""
assert(len(y) == X.shape[0])
assert(len(w) == X.shape[1])
assert(C >= 0)
#cdef int nPos, nNeg, i, j
#cdef double J, term, denom
nPos = np.sum(y) # num of positive examples
nNeg = len(y) - nPos # num of negative examples
ixPos = np.nonzero(y)[0].tolist() # indices positive examples
ixNeg = list(set(np.arange(len(y))) - set(ixPos)) # indices negative examples
J = 0.0 # cost
g = np.zeros_like(w) # gradient
scorePos = X[ixPos, :].dot(w)[:,np.newaxis] # nPos x 1
scoreNeg = X[ixNeg, :].dot(w)[:,np.newaxis] # nNeg x 1
scoreDif = scorePos - scoreNeg.T # nPos x nNeg
#J = np.mean(np.log(1 + np.exp(-scoreDif)))
J = 0.5 * C * np.dot(w, w) + np.mean(np.log1p(np.exp(-scoreDif)))
A = -1/(1 + np.exp(scoreDif))
T1 = X[ixPos, :].T.dot(A.sum(axis = 1))
T2 = X[ixNeg, :].T.dot(A.sum(axis = 0))
g = C * w + 1/(nPos * nNeg) * (T1 - T2)
return (J, g)
Check gradient.
In [ ]:
X_train_, y_train_ = create_dataset(6, data = data_train)
In [ ]:
#%%script false
w0 = np.random.rand(X_train_.shape[1])
C = 1
check_grad(lambda w: obj_biranking(w, X_train_, y_train_, C)[0], \
lambda w: obj_biranking(w, X_train_, y_train_, C)[1], w0)
In [ ]:
#1.1331503772158218e-06 * np.sqrt(nLabels)
In [ ]:
params = [ ]
allPreds_train = [ ]
allPreds_test = [ ]
allTruths_train = [ ]
allTruths_test = [ ]
np.random.seed(SEED)
C = 1
for label_ix in range(nLabels):
#sys.stdout.write('\r%d / %d' % (label_ix + 1, nLabels))
#sys.stdout.flush()
print('\r%d / %d ' % (label_ix + 1, nLabels))
X_train, y_train = create_dataset(label_ix, data = data_train)
X_test, y_test = create_dataset(label_ix, data = data_test)
allTruths_train.append(y_train)
allTruths_test.append(y_test)
assert( (not np.all(y_train == 0)) and (not np.all(y_train == 1)) )
opt_method = 'BFGS' #'Newton-CG'
#opt_method = 'nelder-mead'
options = {'disp': True}
w0 = np.random.rand(X_train.shape[1]) # initial guess
opt = minimize(obj_biranking, w0, args=(X_train, y_train, C), method=opt_method, jac=True, options=options)
if opt.success:
w = opt.x
params.append(w)
#allPreds.append(sigmoid(np.dot(X_test, w)))
allPreds_train.append(np.dot(X_train, w))
allPreds_test.append(np.dot(X_test, w))
else:
    sys.stderr.write('Optimisation failed, label_ix=%d\n' % label_ix)
    w = np.zeros(X_train.shape[1])
    params.append(w)
    allPreds_train.append(np.dot(X_train, w))  # keep train/test shapes consistent
    allPreds_test.append(np.dot(X_test, w))
In [ ]:
allTruths_train = np.array(allTruths_train).T
allTruths_test = np.array(allTruths_test).T
allPreds_train = np.array(allPreds_train).T
allPreds_test = np.array(allPreds_test).T
print(allPreds_test.shape)
print(allTruths_test.shape)
In [ ]:
#allPreds[0]
In [ ]:
print('Training set:')
printEvaluation(allTruths_train, allPreds_train)
In [ ]:
print('Test set:')
printEvaluation(allTruths_test, allPreds_test)
Multi-label learning with ranking loss.
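With one weight vector $\mathbf{w}_k$ per label, and $P_n$ ($\bar{P}_n$) the positive (negative) labels of example $n$, the objective implemented below is
$$J(\mathbf{W}) = \frac{C}{2}\,\|\mathbf{w}\|^2 + \frac{1}{N}\sum_{n=1}^N \frac{1}{|P_n|\,|\bar{P}_n|}\sum_{i \in P_n}\sum_{j \in \bar{P}_n} \log\!\left(1 + e^{-(\mathbf{w}_i - \mathbf{w}_j)^\top\mathbf{x}_n}\right).$$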
In [ ]:
def obj_ranking_loop(w, X, Y, C):
"""
Objective with L2 regularisation and ranking loss
Input:
- w: current weight vector, flattened L x D
- X: feature matrix, N x D
- Y: label matrix, N x L
- C: regularisation constant
"""
N, D = X.shape
L = Y.shape[1]
assert(w.shape[0] == L * D)
W = w.reshape(L, D) # reshape weight matrix
J = 0.0 # cost
G = np.zeros_like(W) # gradient matrix
for n in range(N):
Jn = 0.0
Gn = np.zeros_like(W)
x = X[n, :]
y = Y[n, :]
nPos = np.sum(y) # num of positive examples
nNeg = L - nPos # num of negative examples
denom = nPos * nNeg
ixPos = np.nonzero(y)[0].tolist() # indices positive examples
ixNeg = list(set(np.arange(L)) - set(ixPos)) # indices negative examples
for i in ixPos:
for j in ixNeg:
wDiff = W[i, :] - W[j, :]
sDiff = np.dot(wDiff, x)
term = np.exp(sDiff)
Jn += np.log1p(1.0 / term)
Gn[i, :] = Gn[i, :] - x / (1 + term)
Gn[j, :] = Gn[j, :] + x / (1 + term)  # the pair (i, j) updates both rows with the same term
J += Jn / denom
G = G + Gn / denom
J = 0.5 * C * np.dot(w, w) + J / N
G = C * W + G / N
return (J, G.ravel())
In [ ]:
#np.tile([1,2,3], (3,1)) * np.array([0.1, 0.2, 0.3])[:, None]
In [ ]:
#np.tile([1,2,3], (3,1)) / np.array([0.1, 0.2, 0.3])[:, None]
In [ ]:
#np.tile([1,2,3], (3,1)) * np.array([0.1, 0.2, 0.3])[:,]
In [ ]:
#np.tile([1,2,3], (3,1)) / np.array([0.1, 0.2, 0.3])[:,]
In [ ]:
def obj_ranking(w, X, Y, C):
"""
Objective with L2 regularisation and ranking loss
Input:
- w: current weight vector, flattened L x D
- X: feature matrix, N x D
- Y: label matrix, N x L
- C: regularisation constant
"""
N, D = X.shape
L = Y.shape[1]
assert(w.shape[0] == L * D)
W = w.reshape(L, D) # reshape weight matrix
J = 0.0 # cost
G = np.zeros_like(W) # gradient matrix
for n in range(N):
Jn = 0.0
Gn = np.zeros_like(W)
x = X[n, :]
y = Y[n, :]
nPos = np.sum(y) # num of positive examples
nNeg = L - nPos # num of negative examples
denom = nPos * nNeg
ixPos = np.nonzero(y)[0].tolist() # indices positive examples
ixNeg = list(set(np.arange(L)) - set(ixPos)) # indices negative examples
ixmat = np.array(list(itertools.product(ixPos, ixNeg))) # shape: ixPos*ixNeg by 2
dW = W[ixmat[:, 0], :] - W[ixmat[:, 1], :]
sVec = np.dot(dW, x)
Jn = np.sum(np.log1p(np.exp(-sVec)))
coeffVec = np.divide(1, 1 + np.exp(sVec))
coeffPos = pd.DataFrame(coeffVec)
coeffPos['gid'] = ixmat[:, 0]
coeffPos = coeffPos.groupby('gid', sort=False).sum()
coeffNeg = pd.DataFrame(coeffVec)
coeffNeg['gid'] = ixmat[:, 1]
coeffNeg = coeffNeg.groupby('gid', sort=False).sum()
#print(coeffPos)
#print(coeffNeg)
coeffs = np.ones(L)
coeffs[ixPos] = -coeffPos.loc[ixPos].values.squeeze()
coeffs[ixNeg] = coeffNeg.loc[ixNeg].values.squeeze()
#print(coeffs)
Gn = np.tile(x, (L, 1)) * coeffs[:, None]
J += Jn / denom
G = G + Gn / denom
J = 0.5 * C * np.dot(w, w) + J / N
G = C * W + G / N
return (J, G.ravel())
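The pandas groupby above only sums coeffVec per label index; np.bincount performs the same aggregation with less overhead. A self-contained sketch with hypothetical toy values (the _demo names are illustrative only):
In [ ]:
L_demo = 4
ixPos_demo, ixNeg_demo = [0, 2], [1, 3]
ixmat_demo = np.array(list(itertools.product(ixPos_demo, ixNeg_demo)))
coeffVec_demo = np.array([0.1, 0.2, 0.3, 0.4])
sumPos = np.bincount(ixmat_demo[:, 0], weights=coeffVec_demo, minlength=L_demo)
sumNeg = np.bincount(ixmat_demo[:, 1], weights=coeffVec_demo, minlength=L_demo)
coeffs_demo = sumNeg - sumPos  # negative sums on positive labels, positive sums on negatives
print(coeffs_demo)             # [-0.3  0.4 -0.7  0.6]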
In [ ]:
X_train, Y_train = create_dataset_v2(data = data_train)
X_test, Y_test = create_dataset_v2(data = data_test)
Check gradient
In [ ]:
#%%script false
C = 1
w0 = np.random.rand(nFeatures * nLabels)
check_grad(lambda w: obj_ranking(w, X_train[:10], Y_train[:10], C)[0], \
lambda w: obj_ranking(w, X_train[:10], Y_train[:10], C)[1], w0)
In [ ]:
allTruths_train = Y_train
allTruths_test = Y_test
allPreds_train = None
allPreds_test = None
np.random.seed(SEED)
opt_method = 'BFGS' #'Newton-CG'
#opt_method = 'nelder-mead'
options = {'disp': True}
C = 1
w0 = np.random.rand(nFeatures * nLabels) # initial guess
opt = minimize(obj_ranking, w0, args=(X_train, Y_train, C), method=opt_method, jac=True, options=options)
if opt.success:
w = opt.x
#allPreds = sigmoid(np.dot(X_test, w.reshape(nLabels, nFeatures).T))
allPreds_train = np.dot(X_train, w.reshape(nLabels, nFeatures).T)
allPreds_test = np.dot(X_test, w.reshape(nLabels, nFeatures).T)
else:
sys.stderr.write('Optimisation failed\n')
In [ ]:
print(allPreds_test.shape)
print(allTruths_test.shape)
In [ ]:
print('Training set:')
printEvaluation(allTruths_train, allPreds_train)
In [ ]:
print('Test set:')
printEvaluation(allTruths_test, allPreds_test)
Multi-label learning with p-norm push loss.
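With $P_n$ ($\bar{P}_n$) the positive (negative) labels of example $n$, the objective implemented by the loop variants below (the vectorised variant adds a small smoothing constant to the per-example label counts) is
$$J(\mathbf{W}) = \frac{C}{2}\,\|\mathbf{w}\|^2 + \frac{1}{N}\sum_{n=1}^N \left[\frac{1}{|P_n|}\sum_{k \in P_n} e^{-\mathbf{w}_k^\top\mathbf{x}_n} + \frac{1}{p\,|\bar{P}_n|}\sum_{k \in \bar{P}_n} e^{p\,\mathbf{w}_k^\top\mathbf{x}_n}\right].$$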
In [ ]:
def obj_pnorm_push_loop(w, X, Y, p, C):
"""
Objective with L2 regularisation and p-norm push loss
Input:
- w: current weight vector, flattened L x D
- X: feature matrix, N x D
- Y: label matrix, N x L
- p: constant for p-norm push loss
- C: regularisation constant
"""
N, D = X.shape
L = Y.shape[1]
assert(w.shape[0] == L * D)
assert(p >= 1)
assert(C >= 0)
W = w.reshape(L, D) # reshape weight matrix
J = 0.0 # cost
G = np.zeros_like(W) # gradient matrix
for n in range(N):
Gn = np.zeros_like(W)
x = X[n, :]
y = Y[n, :]
nPos = np.sum(y) # num of positive examples
nNeg = L - nPos # num of negative examples
for k in range(L):
wk = W[k, :]
term = np.dot(wk, x)
if y[k] == 1:
term2 = np.exp(-term) / nPos
J += term2
Gn[k, :] = -x * term2
else:
term2 = np.exp(p * term) / nNeg
J += term2 / p
Gn[k, :] = x * term2
G = G + Gn
J = 0.5 * C * np.dot(w, w) + J / N
G = C * W + G / N
return (J, G.ravel())
In [ ]:
def obj_pnorm_push_loopn(w, X, Y, p, C):
"""
Objective with L2 regularisation and p-norm push loss
Input:
- w: current weight vector, flattened L x D
- X: feature matrix, N x D
- Y: label matrix, N x L
- p: constant for p-norm push loss
- C: regularisation constant
"""
N, D = X.shape
L = Y.shape[1]
assert(w.shape[0] == L * D)
assert(p >= 1)
assert(C >= 0)
W = w.reshape(L, D) # reshape weight matrix
J = 0.0 # cost
G = np.zeros_like(W) # gradient matrix
for n in range(N):
Gn = np.zeros_like(W)
x = X[n, :]
y = Y[n, :]
nPos = np.sum(y) # num of positive examples
nNeg = L - nPos # num of negative examples
ixPos = np.nonzero(y)[0].tolist() # indices positive examples
ixNeg = list(set(np.arange(L)) - set(ixPos)) # indices negative examples
scalingPos = np.exp( -np.dot(W[ixPos, :], x)) / nPos
scalingNeg = np.exp(p * np.dot(W[ixNeg, :], x)) / nNeg
Gn[ixPos, :] = np.tile(-x, (nPos,1)) * scalingPos[:, None] # scaling each row of a matrix
Gn[ixNeg, :] = np.tile( x, (nNeg,1)) * scalingNeg[:, None] # with a different scalar
J += np.sum(scalingPos) + np.sum(scalingNeg) / p
G = G + Gn
J = 0.5 * C * np.dot(w, w) + J / N
G = C * W + G / N
return (J, G.ravel())
In [ ]:
def obj_pnorm_push(w, X, Y, p, C):
"""
Objective with L2 regularisation and p-norm push loss
Input:
- w: current weight vector, flattened L x D
- X: feature matrix, N x D
- Y: label matrix, N x L
- p: constant for p-norm push loss
- C: regularisation constant
"""
N, D = X.shape
L = Y.shape[1]
assert(w.shape[0] == L * D)
assert(p >= 1)
assert(C >= 0)
W = w.reshape(L, D) # reshape weight matrix
J = 0.0 # cost
G = np.zeros_like(W) # gradient matrix
for k in range(L):
wk = W[k, :]
Yk = Y[:, k]
sPosVec = np.dot(X[Yk == 1, :], wk) # Nk+ by 1
sNegVec = np.dot(X[Yk == 0, :], wk) # NK- by 1
#nPosVec = np.sum(Y[Yk == 1, :], axis=1) # Nk+ by 1
#nNegVec = np.sum(Y[Yk == 0, :], axis=1) # NK- by 1
nPosVec = np.sum(Y[Yk == 1, :], axis=1) + 0.1 # Nk+ by 1 with smoothing
nNegVec = np.sum(Y[Yk == 0, :], axis=1) + 0.1 # NK- by 1 with smoothing
#nPosVec = np.ones_like(sPosVec) * N
#nNegVec = np.ones_like(sNegVec) * N
lossPos = np.divide(np.exp(-sPosVec), nPosVec) # NK+ by 1
lossNeg = np.divide(np.exp(p * sNegVec), nNegVec) # NK- by 1
J += np.sum(lossPos) + np.sum(lossNeg / p)
GradPos = -X[Yk == 1, :] * lossPos[:, None]
GradNeg = X[Yk == 0, :] * lossNeg[:, None]
G[k, :] = np.sum(GradPos, axis=0) + np.sum(GradNeg, axis=0)
J = 0.5 * C * np.dot(w, w) + J / N
G = C * W + G / N
return (J, G.ravel())
In [ ]:
X_train, Y_train = create_dataset_v2(data = data_train)
X_test, Y_test = create_dataset_v2(data = data_test)
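Since the two loop variants are algebraically identical, a quick agreement check on a small slice is a useful sanity test; obj_pnorm_push itself will differ slightly because of the +0.1 smoothing of the label counts. A sketch:
In [ ]:
w_chk = np.random.rand(nFeatures * nLabels)
J1, g1 = obj_pnorm_push_loop(w_chk, X_train[:5], Y_train[:5], 1, 1)
J2, g2 = obj_pnorm_push_loopn(w_chk, X_train[:5], Y_train[:5], 1, 1)
print(np.isclose(J1, J2), np.allclose(g1, g2))  # expect: True True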
Check gradient
In [ ]:
%%script false
p = 1
C = 1
w0 = np.random.rand(nFeatures * nLabels)
check_grad(lambda w: obj_pnorm_push(w, X_train, Y_train, p, C)[0], \
lambda w: obj_pnorm_push(w, X_train, Y_train, p, C)[1], w0)
In [ ]:
allTruths_train = Y_train
allTruths_test = Y_test
allPreds_train = None
allPreds_test = None
np.random.seed(SEED)
p = 1 # [1, 10]
C = 1 # [0, 1]
opt_method = 'BFGS' #'Newton-CG'
#opt_method = 'nelder-mead'
options = {'disp': True}
w0 = np.random.rand(nFeatures * nLabels) # initial guess
opt = minimize(obj_pnorm_push, w0, args=(X_train, Y_train, p, C), method=opt_method, jac=True, options=options)
if opt.success:
w = opt.x
allPreds_train = np.dot(X_train, w.reshape(nLabels, nFeatures).T)
allPreds_test = np.dot(X_test, w.reshape(nLabels, nFeatures).T)
else:
sys.stderr.write('Optimisation failed\n')
In [ ]:
print(allPreds_test.shape)
print(allTruths_test.shape)
In [ ]:
print('Training set:')
printEvaluation(allTruths_train, allPreds_train)
In [ ]:
print('Test set:')
printEvaluation(allTruths_test, allPreds_test)
In [ ]:
#make_pipeline(preprocessing.StandardScaler(), svm.SVC(C=1))
#cross_val_score(clf, iris.data, iris.target, cv=cv)
In [ ]:
#%%script false
precisions_train = dict()
precisions_test = dict()
allTruths_test = Y_test
allTruths_train = Y_train
p_set = [1, 3, 10, 30]
C_set = [0.1, 0.3, 1, 3, 10, 30, 100, 300, 1000, 3000]
opt_method = 'BFGS' #'Newton-CG'
for p in p_set:
for C in C_set:
print('-------------------------------------')
print('p in loss: {}, C for regularisation: {}'.format(p, C))
allPreds = None
allPreds_train = None
w0 = np.random.rand(nFeatures * nLabels) # initial guess
opt = minimize(obj_pnorm_push, w0, args=(X_train, Y_train, p, C), method=opt_method, jac=True)
if opt.success:
w = opt.x
allPreds_test = np.dot(X_test, w.reshape(nLabels, nFeatures).T)
allPreds_train = np.dot(X_train, w.reshape(nLabels, nFeatures).T)
precisions_train[(p,C)] = avgPrecisionK(allTruths_train, allPreds_train)
precisions_test[(p,C)] = avgPrecisionK(allTruths_test, allPreds_test)
else:
sys.stderr.write('Optimisation failed\n')
precisions_train[(p,C)] = 0
precisions_test[(p,C)] = 0
print('%20s %.4f' % ('Average Precision@K on training set: ', precisions_train[(p,C)]))
print('%20s %.4f\n' % ('Average Precision@K on test set: ', precisions_test[(p,C)]))
In [ ]:
#%%script false
fig = plt.figure(figsize=[10, 8])
ax = plt.subplot(1,1,1)
#colors = itertools.cycle(['r', 'g'])
styles = itertools.cycle(['-', '--', ':', '-.'])
for p in p_set:
ls = next(styles)
plt.plot(np.arange(len(C_set)), [precisions_train[(p,C)] for C in C_set], \
ls=ls, c='r', label='p=%d'%p + ', train')
plt.plot(np.arange(len(C_set)), [precisions_test[(p,C)] for C in C_set], \
ls=ls, c='g', label='p=%d'%p + ', test')
plt.plot(np.arange(len(C_set)), [0.5149 for C in C_set], ls='-', c='b', label='Logistic Regression, test')
plt.legend(loc='best')
plt.xticks(np.arange(len(C_set)), C_set, fontsize=10, rotation=0, horizontalalignment='center')
plt.xlabel('Regularisation Constant')
plt.ylabel('Average Precision@K')
plt.title('Performance on Mediamill dataset, multi-label learning with p-norm push loss', fontsize=15)
fig.savefig('pnorm.svg')
Methods to compute $v = \left(\sum_{n=1}^N \mathbf{a}_n\right)^\top \left(\sum_{n=1}^N \mathbf{a}_n\right)$, where $\mathbf{a}_n$ is the $n$-th row of $A \in \mathbb{R}^{N \times K}$; equivalently $v = \mathbf{1}^\top A A^\top \mathbf{1}$, the sum of all entries of $A A^\top$.
NOTE: if $D = A^\top A$, then in general $v \ne \sum_{k=1}^K \sum_{l=1}^K D_{kl}$; this can be checked with a simple $2 \times 2$ matrix $A = [A_{11}, A_{12}; A_{21}, A_{22}]$.
Example:
In [ ]:
A = np.random.rand(15).reshape(3,5)
A
In [ ]:
x = np.sum(A, axis=0)
#x
In [ ]:
C = np.dot(A, A.T)
one = np.ones(A.shape[0])
In [ ]:
np.dot(x, x)
In [ ]:
np.sum(np.sum(C))
In [ ]:
np.dot(np.dot(one, C), one)
In [ ]:
D = np.dot(A.T, A)
np.sum(np.sum(D))
Speed test.
In [ ]:
A = np.random.rand(20000000).reshape(10000,2000)
#A = np.random.rand(10000000).reshape(2000,10000)
In [ ]:
%%timeit
x = np.sum(A, axis=0)
np.dot(x, x)
In [ ]:
%%timeit
C = np.dot(A, A.T)
np.sum(np.sum(C))
In [ ]:
%%timeit
C = np.dot(A, A.T)
one = np.ones(A.shape[0])
np.dot(np.dot(one, C), one)