In [ ]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os, sys, time
import pickle as pkl
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, make_scorer, f1_score, label_ranking_loss
import matplotlib.pyplot as plt
import seaborn as sns
In [ ]:
sys.path.append('src')
from evaluate import avgPrecisionK, evaluatePrecision, evaluateF1, evaluateRankingLoss, f1_score_nowarn
from datasets import create_dataset, dataset_names, nLabels_dict
In [ ]:
dataset_names
In [ ]:
data_ix = 3
In [ ]:
dataset_name = dataset_names[data_ix]
nLabels = nLabels_dict[dataset_name]
print(dataset_name, nLabels)
In [ ]:
data_dir = 'data'
SEED = 918273645
fmodel_prec = os.path.join(data_dir, 'br-' + dataset_name + '-prec.pkl')
fmodel_f1 = os.path.join(data_dir, 'br-' + dataset_name + '-f1.pkl')
fmodel_base = os.path.join(data_dir, 'br-' + dataset_name + '-base.pkl')
fperf_prec = os.path.join(data_dir, 'perf-lr-prec.pkl')
fperf_f1 = os.path.join(data_dir, 'perf-lr-f1.pkl')
fperf_base = os.path.join(data_dir, 'perf-lr-base.pkl')
Load data.
In [ ]:
X_train, Y_train = create_dataset(dataset_name=dataset_name, train_data=True, shuffle=True, random_state=SEED)
X_test, Y_test = create_dataset(dataset_name=dataset_name, train_data=False)
Feature normalisation.
In [ ]:
X_train_mean = np.mean(X_train, axis=0).reshape((1, -1))
X_train_std = np.std(X_train, axis=0).reshape((1, -1)) + 10 ** (-6)
X_train -= X_train_mean
X_train /= X_train_std
X_test -= X_train_mean
X_test /= X_train_std
Use the estimated probability for each label as the predicted score for any example.
In [ ]:
#probs = np.mean(Y_train, axis=0)
In [ ]:
#probs
In [ ]:
#preds = np.tile(probs, (X_test.shape[0], 1))
In [ ]:
#evaluatePrecision(Y_test, preds, verbose=1)
In [ ]:
#evaluateRankingLoss(Y_test, preds, n_jobs=4)
Train a logistic regression model for each label.
Note that OneVsRestClassifier can be either a multiclass classifier or a multilabel classifier, see this binary relevance example on yeast dataset.
NOTE: To do cross validation (i.e. fit a GridSearchCV), one has to put OneVsRestClassifier into a class wrapper, as the constructor of OneVsRestClassifier doesn't have a parameter C
.
In [ ]:
class BinaryRelevance(BaseEstimator):
"""
Independent logistic regression based on OneVsRestClassifier wrapper.
"""
def __init__(self, C=1, n_jobs=-1):
assert C > 0
self.C = C
self.n_jobs = n_jobs
self.trained = False
def fit(self, X_train, Y_train):
assert X_train.shape[0] == Y_train.shape[0]
# don't make two changes at the same time
#self.estimator = OneVsRestClassifier(LogisticRegression(class_weight='balanced', C=self.C))
self.estimator = OneVsRestClassifier(LogisticRegression(C=self.C), n_jobs=self.n_jobs)
self.estimator.fit(X_train, Y_train)
self.trained = True
def decision_function(self, X_test):
assert self.trained is True
return self.estimator.decision_function(X_test)
def predict(self, X_test, binarise=False):
preds = self.decision_function(X_test)
return preds >= 0.5 if binarise is True else preds
In [ ]:
def print_cv_results(clf):
if hasattr(clf, 'best_params_'):
print("\nBest parameters set found on development set:")
print(clf.best_params_)
if hasattr(clf, 'cv_results_'):
for mean, std, params in zip(clf.cv_results_['mean_test_score'], \
clf.cv_results_['std_test_score'], \
clf.cv_results_['params']):
print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
In [ ]:
def dump_results(predictor, X_train, Y_train, X_test, Y_test, fname):
"""
Compute and save performance results
"""
preds_train = predictor.decision_function(X_train)
preds_test = predictor.decision_function(X_test)
print('Training set:')
perf_dict_train = evaluatePrecision(Y_train, preds_train)
print()
print('Test set:')
perf_dict_test = evaluatePrecision(Y_test, preds_test)
print()
print('Training set:')
perf_dict_train.update(evaluateRankingLoss(Y_train, preds_train))
print(label_ranking_loss(Y_train, preds_train))
print()
print('Test set:')
perf_dict_test.update(evaluateRankingLoss(Y_test, preds_test))
print(label_ranking_loss(Y_test, preds_test))
F1_train = f1_score_nowarn(Y_train, preds_train >= 0.5, average='samples')
F1_test = f1_score_nowarn(Y_test, preds_test >= 0.5, average='samples')
print('\nF1 Train: %.4f, %f' % (F1_train, f1_score(Y_train, preds_train >= 0.5, average='samples')))
print('\nF1 Test : %.4f %f' % (F1_test, f1_score(Y_test, preds_test >= 0.5, average='samples')))
perf_dict_train.update({'F1': (F1_train,)})
perf_dict_test.update({'F1': (F1_test,)})
perf_dict = {'Train': perf_dict_train, 'Test': perf_dict_test}
if os.path.exists(fname):
_dict = pkl.load(open(fname, 'rb'))
if dataset_name not in _dict:
_dict[dataset_name] = perf_dict
else:
_dict = {dataset_name: perf_dict}
pkl.dump(_dict, open(fname, 'wb'))
print()
print(pkl.load(open(fname, 'rb')))
In [ ]:
clf = BinaryRelevance(n_jobs=3)
clf.fit(X_train, Y_train)
In [ ]:
Y_pred = clf.decision_function(X_test) >= 0
In [ ]:
f1_score_nowarn(Y_test, Y_pred, average='samples')
In [ ]:
f1_score_nowarn(Y_test, Y_pred, average='macro')
In [ ]:
pkl.dump(clf, open(fmodel_base, 'wb'))
In [ ]:
def avgF1(Y_true, Y_pred):
F1 = f1_score_nowarn(Y_true, Y_pred >= 0, average='samples')
print('\nF1: %g, #examples: %g' % (F1, Y_true.shape[0]))
return F1
In [ ]:
C_set = [1e-6, 3e-6, 1e-5, 3e-5, 1e-4, 3e-4, 1e-3, 3e-3, 0.01, 0.03, 0.1, 0.3,
1, 3, 10, 30, 100, 300, 1e3, 3e3, 1e4, 3e4, 1e5, 3e5, 1e6, 3e6]
parameters = [{'C': C_set}]
scorer = {'F1': make_scorer(avgF1)}
Cross validation according to F1.
In [ ]:
if os.path.exists(fmodel_f1):
clf = pkl.load(open(fmodel_f1, 'rb'))
else:
clf = GridSearchCV(BinaryRelevance(), parameters, cv=5, scoring=scorer, verbose=2, n_jobs=10, refit='F1')
clf.fit(X_train, Y_train)
pkl.dump(clf, open(fmodel_f1, 'wb'))
In [ ]:
f1_score_nowarn(Y_test, clf.decision_function(X_test) >= 0, average='samples')
In [ ]:
clf.best_params_
In [ ]:
print_cv_results(clf)
In [ ]:
dump_results(clf, X_train, Y_train, X_test, Y_test, fperf_prec)
Cross validation according to F1.
In [ ]:
plot_loss_of_clf(clf, X_train, Y_train, X_test, Y_test)
In [ ]:
from evaluate import calcLoss
from matplotlib.ticker import NullFormatter
def plot_loss_of_clf(clf, X_train, Y_train, X_test, Y_test):
preds_train = clf.decision_function(X_train)
tploss_train = calcLoss(Y_train, preds_train, 'TopPush', njobs=4)
pak_train = calcLoss(Y_train, preds_train, 'Precision@K', njobs=4)
preds_test = clf.decision_function(X_test)
tploss_test = calcLoss(Y_test, preds_test, 'TopPush', njobs=4)
pak_test = calcLoss(Y_test, preds_test, 'Precision@K', njobs=4)
#plot_loss(tploss_train, pak_train, 'Training set (' + dataset_name + ')')
plot_loss(tploss_test, pak_test, 'Test set (' + dataset_name + ')')
In [ ]:
def plot_loss(loss, pak, title):
# the data
x = loss
y = 1 - pak
print('away from diagonal portion:', np.mean(loss != 1-pak))
nullfmt = NullFormatter() # no labels
# definitions for the axes
left, width = 0.1, 0.65
bottom, height = 0.1, 0.65
bottom_h = left_h = left + width + 0.02
rect_scatter = [left, bottom, width, height]
rect_histx = [left, bottom_h, width, 0.2]
rect_histy = [left_h, bottom, 0.2, height]
# start with a rectangular Figure
plt.figure(1, figsize=(8, 8))
axScatter = plt.axes(rect_scatter)
axHistx = plt.axes(rect_histx)
axHisty = plt.axes(rect_histy)
# no labels
axHistx.xaxis.set_major_formatter(nullfmt)
axHisty.yaxis.set_major_formatter(nullfmt)
# the scatter plot:
axScatter.scatter(x, y, color='b', alpha=0.5)
axScatter.plot([0, 1], [0, 1], ls='--', color='g')
axScatter.set_xlabel('Top push loss', fontdict={'fontsize': 12})
axScatter.set_ylabel('1 - precision@K', fontdict={'fontsize': 12})
# now determine nice limits by hand:
#binwidth = 0.25
#xymax = np.max([np.max(np.fabs(x)), np.max(np.fabs(y))])
#lim = (int(xymax/binwidth) + 1) * binwidth
#axScatter.set_xlim((-lim, lim))
#axScatter.set_ylim((-lim, lim))
#bins = np.arange(-lim, lim + binwidth, binwidth)
axHistx.hist(x, bins=10, color='g', alpha=0.3)
axHistx.set_yscale('log')
axHisty.hist(y, bins=10, color='g', alpha=0.3, orientation='horizontal')
axHisty.set_xscale('log')
#axHistx.set_xlim(axScatter.get_xlim())
#axHisty.set_ylim(axScatter.get_ylim())
axHistx.set_title(title, fontdict={'fontsize': 15}, loc='center')
In [ ]:
%%script false
# NOTE: binary predictions (by predict()) are required for this method to work
if os.path.exists(fmodel_f1):
clf = pkl.load(open(fmodel_f1, 'rb'))
else:
scorer = make_scorer(f1_score_nowarn, average='samples')
clf = GridSearchCV(BinaryRelevance(), parameters, cv=5, scoring=scorer, verbose=2, n_jobs=6)
clf.fit(X_train, Y_train)
pkl.dump(clf, open(fmodel_f1, 'wb'))
print_cv_results(clf)
In [ ]:
#dump_results(clf, X_train, Y_train, X_test, Y_test, fperf_f1)
Plain logistic regression.
In [ ]:
if os.path.exists(fmodel_base):
clf = pkl.load(open(fmodel_base, 'rb'))
else:
clf = OneVsRestClassifier(LogisticRegression(verbose=1))
clf.fit(X_train, Y_train)
pkl.dump(clf, open(fmodel_base, 'wb'))
In [ ]:
dump_results(clf, X_train, Y_train, X_test, Y_test, fperf_base)
Cross validation for classifier of each label.
In [ ]:
%%script false
allPreds_train = [ ]
allPreds_test = [ ]
allTruths_train = [ ]
allTruths_test = [ ]
coefMat = [ ]
labelIndices = [ ]
ranges = range(-6, 7)
parameters = [{'C': sorted([10**(e) for e in ranges] + [3 * 10**(e) for e in ranges])}]
scoring = 'average_precision' # 'accuracy' #'precision_macro'
for label_ix in range(nLabels):
print('Training for Label %d' % (label_ix+1))
y_train = Y_train[:, label_ix]
y_test = Y_test [:, label_ix]
allTruths_train.append(y_train)
allTruths_test.append(y_test)
assert( (not np.all(y_train == 0)) and (not np.all(y_train == 1)) )
# searching for a baseline in (Lin et al.) with:
# test F1 on bibtex 0.372, 26.8
# test F1 on bookmarks 0.307, 0.219
# test F1 on delicious 0.265, 0.102
# test F1 on bibtex: 0.3730, 0.277
# test F1 on bookmarks: 0.2912, 0.2072
# test F1 on delicious: 0.1899, 0.1268
#clf = LogisticRegression(C=100)
# test F1 on bookmarks: 0.2928, 0.2109
#clf = LogisticRegression(C=60)
# test F1 on bibtex: 0.4282
#clf = GridSearchCV(LogisticRegression(class_weight='balanced'), parameters, cv=5, scoring=scoring)
# test F1 on bibtex: < 0.3
# test F1 on bookmarks: 0.2981, 0.2281
# test F1 on delicious: 0.1756, 0.0861
#clf = LogisticRegression()
# test F1 on bibtex: 0.4342
#clf = LogisticRegression(class_weight='balanced')
# test F1 on bibtex: 0.3018
#clf = GridSearchCV(LogisticRegression(), parameters, cv=5, scoring=scoring)
# test F1 on bibtex: 0.3139
#clf = GridSearchCV(LogisticRegression(), parameters, scoring=scoring)
# test F1 on bibtex: 0.4252
#clf = GridSearchCV(LogisticRegression(class_weight='balanced'), parameters, scoring=scoring)
# test F1 on bibtex: 0.3598
#clf = LogisticRegression(C=10)
# test F1 on bibtex: 0.3670
#clf = LogisticRegression(C=30)
estimator = LogisticRegression(class_weight='balanced')#, solver='lbfgs')
clf = GridSearchCV(estimator, parameters, cv=5, scoring=scoring, n_jobs=4)
clf.fit(X_train, y_train)
print("Best parameters set found on development set:")
print(clf.best_params_)
print()
allPreds_train.append(clf.decision_function(X_train))
allPreds_test.append(clf.decision_function(X_test))
allTruths_train = np.array(allTruths_train).T
allTruths_test = np.array(allTruths_test).T
allPreds_train = np.array(allPreds_train).T
allPreds_test = np.array(allPreds_test).T
print(allPreds_test.shape)
print(allTruths_test.shape)
In [ ]:
#coefMat = np.array(coefMat).T
#coefMat.shape
#sns.heatmap(coefMat[:, :30])
In [ ]:
#precisions_train = [avgPrecision(allTruths_train, allPreds_train, k) for k in range(1, nLabels+1)]
#precisions_test = [avgPrecision(allTruths_test, allPreds_test, k) for k in range(1, nLabels+1)]
In [ ]:
#precisionK_train = avgPrecisionK(allTruths_train, allPreds_train)
#precisionK_test = avgPrecisionK(allTruths_test, allPreds_test)
In [ ]:
%%script false
plt.figure(figsize=[10,5])
plt.plot(precisions_train, ls='--', c='r', label='Train')
plt.plot(precisions_test, ls='-', c='g', label='Test')
plt.plot([precisionK_train for k in range(nLabels)], ls='-', c='r', label='Train, Precision@K')
plt.plot([precisionK_test for k in range(nLabels)], ls='-', c='g', label='Test, Precision@K')
plt.xticks(np.arange(nLabels), np.arange(1,nLabels+1))
plt.xlabel('k')
plt.ylabel('Precision@k')
plt.legend(loc='best')
plt.title('Independent Logistic Regression on ' + dataset_name + ' dataset')
plt.savefig(dataset_name + '_lr.svg')