Multi-label classification -- binary relevance baseline


In [ ]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import os, sys, time
import pickle as pkl
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, make_scorer, f1_score, label_ranking_loss

import matplotlib.pyplot as plt
import seaborn as sns

In [ ]:
sys.path.append('src')
from evaluate import avgPrecisionK, evaluatePrecision, evaluateF1, evaluateRankingLoss, f1_score_nowarn
from datasets import create_dataset, dataset_names, nLabels_dict

In [ ]:
dataset_names

In [ ]:
data_ix = 3

In [ ]:
dataset_name = dataset_names[data_ix]
nLabels = nLabels_dict[dataset_name]
print(dataset_name, nLabels)

In [ ]:
data_dir = 'data'
SEED = 918273645
fmodel_prec = os.path.join(data_dir, 'br-' + dataset_name + '-prec.pkl')
fmodel_f1 = os.path.join(data_dir, 'br-' + dataset_name + '-f1.pkl')
fmodel_base = os.path.join(data_dir, 'br-' + dataset_name + '-base.pkl')
fperf_prec = os.path.join(data_dir, 'perf-lr-prec.pkl')
fperf_f1 = os.path.join(data_dir, 'perf-lr-f1.pkl')
fperf_base = os.path.join(data_dir, 'perf-lr-base.pkl')

Load data.


In [ ]:
X_train, Y_train = create_dataset(dataset_name=dataset_name, train_data=True, shuffle=True, random_state=SEED)
X_test,  Y_test  = create_dataset(dataset_name=dataset_name, train_data=False)

Feature normalisation.


In [ ]:
X_train_mean = np.mean(X_train, axis=0).reshape((1, -1))
X_train_std = np.std(X_train, axis=0).reshape((1, -1)) + 10 ** (-6)
X_train -= X_train_mean
X_train /= X_train_std
X_test  -= X_train_mean
X_test  /= X_train_std

Naive baseline

Use the empirical probability of each label in the training set as the predicted score for every example; the commented cells below sketch the idea, and a consolidated runnable version follows them.


In [ ]:
#probs = np.mean(Y_train, axis=0)

In [ ]:
#probs

In [ ]:
#preds = np.tile(probs, (X_test.shape[0], 1))

In [ ]:
#evaluatePrecision(Y_test, preds, verbose=1)

In [ ]:
#evaluateRankingLoss(Y_test, preds, n_jobs=4)
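
The cells above are kept commented out; below is a consolidated, runnable sketch of the same idea, using sklearn's label_ranking_loss in place of the notebook's evaluateRankingLoss helper.


In [ ]:
# Naive baseline sketch: score every test example with the per-label training frequencies.
probs = np.mean(Y_train, axis=0)                # empirical probability of each label
preds = np.tile(probs, (X_test.shape[0], 1))    # identical score vector for every test example
print('Ranking loss:', label_ranking_loss(Y_test, preds))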

Binary relevance baseline

Train a logistic regression model for each label.

Note that OneVsRestClassifier acts as either a multiclass or a multilabel classifier depending on the target format; see the binary relevance example on the yeast dataset.

NOTE: to cross validate the regularisation constant (i.e. to fit a GridSearchCV), we wrap OneVsRestClassifier in a small estimator class, since its constructor does not expose a parameter C directly (the nested parameter name estimator__C would be an alternative; see the aside below).
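
As an aside, a minimal sketch of that nested-parameter alternative (not used in this notebook; the C values here are illustrative only):


In [ ]:
# Aside: GridSearchCV can reach the C of the wrapped LogisticRegression through
# OneVsRestClassifier's nested parameter name 'estimator__C'.
ovr_cv = GridSearchCV(OneVsRestClassifier(LogisticRegression()),
                      param_grid={'estimator__C': [0.01, 0.1, 1, 10]},
                      scoring='f1_samples', cv=5)
# ovr_cv.fit(X_train, Y_train)  # not run here; the BinaryRelevance wrapper below is used instead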


In [ ]:
class BinaryRelevance(BaseEstimator):
    """
        Independent logistic regression per label, based on the OneVsRestClassifier wrapper.
    """
    
    def __init__(self, C=1, n_jobs=-1):
        assert C > 0
        self.C = C
        self.n_jobs = n_jobs
        self.trained = False
        
    def fit(self, X_train, Y_train):
        assert X_train.shape[0] == Y_train.shape[0]
        # don't make two changes at the same time
        #self.estimator = OneVsRestClassifier(LogisticRegression(class_weight='balanced', C=self.C))
        self.estimator = OneVsRestClassifier(LogisticRegression(C=self.C), n_jobs=self.n_jobs)
        self.estimator.fit(X_train, Y_train)
        self.trained = True
        return self
        
    def decision_function(self, X_test):
        assert self.trained is True
        return self.estimator.decision_function(X_test)
    
    def predict(self, X_test, binarise=False):
        preds = self.decision_function(X_test)
        # decision_function() returns signed margins, so the binarisation threshold is 0
        return preds >= 0 if binarise else preds

In [ ]:
def print_cv_results(clf):
    if hasattr(clf, 'best_params_'):
        print("\nBest parameters set found on development set:")
        print(clf.best_params_)
    if hasattr(clf, 'cv_results_'):
        # with a dict of scorers, cv_results_ keys are suffixed by the metric name instead of 'score'
        key = 'mean_test_score' if 'mean_test_score' in clf.cv_results_ else 'mean_test_%s' % clf.refit
        for mean, std, params in zip(clf.cv_results_[key],
                                     clf.cv_results_[key.replace('mean', 'std')],
                                     clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

In [ ]:
def dump_results(predictor, X_train, Y_train, X_test, Y_test, fname):
    """
        Compute and save performance results
    """
    preds_train = predictor.decision_function(X_train)
    preds_test  = predictor.decision_function(X_test)
    
    print('Training set:')
    perf_dict_train = evaluatePrecision(Y_train, preds_train)
    print()
    print('Test set:')
    perf_dict_test = evaluatePrecision(Y_test, preds_test)
    
    print()
    print('Training set:')
    perf_dict_train.update(evaluateRankingLoss(Y_train, preds_train))
    print(label_ranking_loss(Y_train, preds_train))
    print()
    print('Test set:')
    perf_dict_test.update(evaluateRankingLoss(Y_test, preds_test))
    print(label_ranking_loss(Y_test, preds_test))
    
    # decision_function() scores are signed margins, so binarise at 0;
    # sklearn's f1_score is printed alongside f1_score_nowarn as a sanity check
    F1_train = f1_score_nowarn(Y_train, preds_train >= 0, average='samples')
    F1_test  = f1_score_nowarn(Y_test, preds_test >= 0, average='samples')
    print('\nF1 Train: %.4f, %f' % (F1_train, f1_score(Y_train, preds_train >= 0, average='samples')))
    print('\nF1 Test : %.4f, %f' % (F1_test, f1_score(Y_test, preds_test >= 0, average='samples')))
    
    perf_dict_train.update({'F1': (F1_train,)})
    perf_dict_test.update({'F1': (F1_test,)})
    
    perf_dict = {'Train': perf_dict_train, 'Test': perf_dict_test}
    if os.path.exists(fname):
        _dict = pkl.load(open(fname, 'rb'))
        # keep any results already saved for this dataset
        if dataset_name not in _dict:
            _dict[dataset_name] = perf_dict
    else:
        _dict = {dataset_name: perf_dict}
    pkl.dump(_dict, open(fname, 'wb'))
    
    print()
    print(pkl.load(open(fname, 'rb')))

In [ ]:
clf = BinaryRelevance(n_jobs=3)
clf.fit(X_train, Y_train)

In [ ]:
Y_pred = clf.decision_function(X_test) >= 0

In [ ]:
f1_score_nowarn(Y_test, Y_pred, average='samples')

In [ ]:
f1_score_nowarn(Y_test, Y_pred, average='macro')

In [ ]:
pkl.dump(clf, open(fmodel_base, 'wb'))

In [ ]:
def avgF1(Y_true, Y_pred):
    # make_scorer calls predict(), which returns decision scores by default, so binarise at 0
    F1 = f1_score_nowarn(Y_true, Y_pred >= 0, average='samples')
    print('\nF1: %g, #examples: %d' % (F1, Y_true.shape[0]))
    return F1

In [ ]:
C_set = [1e-6, 3e-6, 1e-5, 3e-5, 1e-4, 3e-4, 1e-3, 3e-3, 0.01, 0.03, 0.1, 0.3,
         1, 3, 10, 30, 100, 300, 1e3, 3e3, 1e4, 3e4, 1e5, 3e5, 1e6, 3e6]
parameters = [{'C': C_set}]
scorer = {'F1': make_scorer(avgF1)}

Cross validation according to F1.


In [ ]:
if os.path.exists(fmodel_f1):
    clf = pkl.load(open(fmodel_f1, 'rb'))
else:
    clf = GridSearchCV(BinaryRelevance(), parameters, cv=5, scoring=scorer, verbose=2, n_jobs=10, refit='F1')
    clf.fit(X_train, Y_train)
    pkl.dump(clf, open(fmodel_f1, 'wb'))

In [ ]:
f1_score_nowarn(Y_test, clf.decision_function(X_test) >= 0, average='samples')

In [ ]:
clf.best_params_

In [ ]:
print_cv_results(clf)

In [ ]:
dump_results(clf, X_train, Y_train, X_test, Y_test, fperf_prec)

Plot the per-example top push loss against 1 - precision@K, with marginal histograms (the call follows the helper definitions below).


In [ ]:
from evaluate import calcLoss
from matplotlib.ticker import NullFormatter

def plot_loss_of_clf(clf, X_train, Y_train, X_test, Y_test):
    preds_train = clf.decision_function(X_train)
    tploss_train = calcLoss(Y_train, preds_train, 'TopPush', njobs=4)
    pak_train = calcLoss(Y_train, preds_train, 'Precision@K', njobs=4)
    preds_test = clf.decision_function(X_test)
    tploss_test = calcLoss(Y_test, preds_test, 'TopPush', njobs=4)
    pak_test = calcLoss(Y_test, preds_test, 'Precision@K', njobs=4)
    #plot_loss(tploss_train, pak_train, 'Training set (' + dataset_name + ')')
    
    plot_loss(tploss_test, pak_test, 'Test set (' + dataset_name + ')')

In [ ]:
def plot_loss(loss, pak, title):
    # the data
    x = loss
    y = 1 - pak
    
    print('fraction of points off the diagonal:', np.mean(loss != 1 - pak))

    nullfmt = NullFormatter()         # no labels

    # definitions for the axes
    left, width = 0.1, 0.65
    bottom, height = 0.1, 0.65
    bottom_h = left_h = left + width + 0.02

    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.2]
    rect_histy = [left_h, bottom, 0.2, height]

    # start with a rectangular Figure
    plt.figure(1, figsize=(8, 8))

    axScatter = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)

    # no labels
    axHistx.xaxis.set_major_formatter(nullfmt)
    axHisty.yaxis.set_major_formatter(nullfmt)

    # the scatter plot:
    axScatter.scatter(x, y, color='b', alpha=0.5)
    axScatter.plot([0, 1], [0, 1], ls='--', color='g')
    axScatter.set_xlabel('Top push loss', fontdict={'fontsize': 12})
    axScatter.set_ylabel('1 - precision@K', fontdict={'fontsize': 12})

    # now determine nice limits by hand:
    #binwidth = 0.25
    #xymax = np.max([np.max(np.fabs(x)), np.max(np.fabs(y))])
    #lim = (int(xymax/binwidth) + 1) * binwidth

    #axScatter.set_xlim((-lim, lim))
    #axScatter.set_ylim((-lim, lim))

    #bins = np.arange(-lim, lim + binwidth, binwidth)

    axHistx.hist(x, bins=10, color='g', alpha=0.3)
    axHistx.set_yscale('log')
    axHisty.hist(y, bins=10, color='g', alpha=0.3, orientation='horizontal')
    axHisty.set_xscale('log')

    #axHistx.set_xlim(axScatter.get_xlim())
    #axHisty.set_ylim(axScatter.get_ylim())

    axHistx.set_title(title, fontdict={'fontsize': 15}, loc='center')

In [ ]:
plot_loss_of_clf(clf, X_train, Y_train, X_test, Y_test)

In [ ]:
%%script false
# NOTE: binary predictions (by predict()) are required for this method to work 
if os.path.exists(fmodel_f1):
    clf = pkl.load(open(fmodel_f1, 'rb'))
else:
    scorer = make_scorer(f1_score_nowarn, average='samples')
    clf = GridSearchCV(BinaryRelevance(), parameters, cv=5, scoring=scorer, verbose=2, n_jobs=6)
    clf.fit(X_train, Y_train)
    pkl.dump(clf, open(fmodel_f1, 'wb'))
print_cv_results(clf)

In [ ]:
#dump_results(clf, X_train, Y_train, X_test, Y_test, fperf_f1)

Plain logistic regression.


In [ ]:
if os.path.exists(fmodel_base):
    clf = pkl.load(open(fmodel_base, 'rb'))
else:
    clf = OneVsRestClassifier(LogisticRegression(verbose=1))
    clf.fit(X_train, Y_train)
    pkl.dump(clf, open(fmodel_base, 'wb'))

In [ ]:
dump_results(clf, X_train, Y_train, X_test, Y_test, fperf_base)

Cross validation for the classifier of each label (the cell below is disabled with %%script false).


In [ ]:
%%script false
allPreds_train  = [ ]
allPreds_test  = [ ]
allTruths_train = [ ]
allTruths_test = [ ]
coefMat = [ ]
labelIndices = [ ]

ranges = range(-6, 7)
parameters = [{'C': sorted([10**(e) for e in ranges] + [3 * 10**(e) for e in ranges])}]
scoring = 'average_precision' # 'accuracy' #'precision_macro'

for label_ix in range(nLabels):
    print('Training for Label %d' % (label_ix+1))
    
    y_train = Y_train[:, label_ix]
    y_test  = Y_test [:, label_ix]
    
    allTruths_train.append(y_train)
    allTruths_test.append(y_test) 
    
    assert( (not np.all(y_train == 0)) and (not np.all(y_train == 1)) )
    
    # searching for a baseline to match the numbers reported in (Lin et al.):
    # test F1 on bibtex 0.372, 26.8 
    # test F1 on bookmarks 0.307, 0.219
    # test F1 on delicious 0.265, 0.102
    
    # test F1 on bibtex: 0.3730, 0.277
    # test F1 on bookmarks: 0.2912, 0.2072
    # test F1 on delicious: 0.1899, 0.1268
    #clf = LogisticRegression(C=100)
    
    # test F1 on bookmarks: 0.2928, 0.2109
    #clf = LogisticRegression(C=60)    
    
    # test F1 on bibtex: 0.4282
    #clf = GridSearchCV(LogisticRegression(class_weight='balanced'), parameters, cv=5, scoring=scoring)
    
    # test F1 on bibtex: < 0.3
    # test F1 on bookmarks: 0.2981, 0.2281
    # test F1 on delicious: 0.1756, 0.0861
    #clf = LogisticRegression()  
    
    # test F1 on bibtex: 0.4342
    #clf = LogisticRegression(class_weight='balanced') 
    
    # test F1 on bibtex: 0.3018
    #clf = GridSearchCV(LogisticRegression(), parameters, cv=5, scoring=scoring)
    
    # test F1 on bibtex: 0.3139
    #clf = GridSearchCV(LogisticRegression(), parameters, scoring=scoring)
    
    # test F1 on bibtex: 0.4252
    #clf = GridSearchCV(LogisticRegression(class_weight='balanced'), parameters, scoring=scoring)
    
    # test F1 on bibtex: 0.3598
    #clf = LogisticRegression(C=10) 
    
    # test F1 on bibtex: 0.3670
    #clf = LogisticRegression(C=30)
    
    estimator = LogisticRegression(class_weight='balanced')#, solver='lbfgs')
    clf = GridSearchCV(estimator, parameters, cv=5, scoring=scoring, n_jobs=4)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print(clf.best_params_)
    print()
    
    allPreds_train.append(clf.decision_function(X_train))
    allPreds_test.append(clf.decision_function(X_test))
    
allTruths_train = np.array(allTruths_train).T
allTruths_test = np.array(allTruths_test).T

allPreds_train  = np.array(allPreds_train).T
allPreds_test  = np.array(allPreds_test).T

print(allPreds_test.shape)
print(allTruths_test.shape)

Result analysis


In [ ]:
#coefMat = np.array(coefMat).T
#coefMat.shape
#sns.heatmap(coefMat[:, :30])

In [ ]:
#precisions_train = [avgPrecision(allTruths_train, allPreds_train, k) for k in range(1, nLabels+1)]
#precisions_test  = [avgPrecision(allTruths_test,  allPreds_test,  k) for k in range(1, nLabels+1)]

In [ ]:
#precisionK_train = avgPrecisionK(allTruths_train, allPreds_train)
#precisionK_test  = avgPrecisionK(allTruths_test,  allPreds_test)

In [ ]:
%%script false
plt.figure(figsize=[10,5])
plt.plot(precisions_train, ls='--', c='r', label='Train')
plt.plot(precisions_test,  ls='-',  c='g', label='Test')
plt.plot([precisionK_train for k in range(nLabels)], ls='-', c='r', label='Train, Precision@K')
plt.plot([precisionK_test  for k in range(nLabels)], ls='-', c='g', label='Test, Precision@K')
plt.xticks(np.arange(nLabels), np.arange(1,nLabels+1))
plt.xlabel('k')
plt.ylabel('Precision@k')
plt.legend(loc='best')
plt.title('Independent Logistic Regression on ' + dataset_name + ' dataset')
plt.savefig(dataset_name + '_lr.svg')