In [ ]:
%matplotlib inline
%load_ext line_profiler
%load_ext autoreload
%autoreload 2
import os, sys, time
import pickle as pkl
import numpy as np
import pandas as pd
from scipy.optimize import check_grad
from scipy.special import expit as sigmoid
from scipy.sparse import issparse, csr_matrix
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import Parallel, delayed
In [ ]:
sys.path.append('src/')
sys.path.append('src/models')
from MLC import MLC, objective, risk_pclassification, DataHelper
from tools import create_dataset, dataset_names, nLabels_dict, f1_score_nowarn
In [ ]:
dataset_names
In [ ]:
data_ix = 3
In [ ]:
dataset_name = dataset_names[data_ix]
nLabels = nLabels_dict[dataset_name]
print(dataset_name, nLabels)
In [ ]:
data_dir = 'data'
SEED = 918273645
fmodel_base = os.path.join(data_dir, 'pc-' + dataset_name + '-base.pkl')
fmodel_prec = os.path.join(data_dir, 'pc-' + dataset_name + '-prec.pkl')
fmodel_f1 = os.path.join(data_dir, 'pc-' + dataset_name + '-f1.pkl')
Load data.
In [ ]:
X_train, Y_train = create_dataset(dataset_name, train_data=True, shuffle=True, random_state=SEED)
X_test, Y_test = create_dataset(dataset_name, train_data=False)
In [ ]:
Y_train = csr_matrix(Y_train)
Feature normalisation.
In [ ]:
X_train_mean = np.mean(X_train, axis=0).reshape((1, -1))
X_train_std = np.std(X_train, axis=0).reshape((1, -1)) + 10 ** (-6)
X_train -= X_train_mean
X_train /= X_train_std
X_test -= X_train_mean
X_test /= X_train_std
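As a quick sanity check (a minimal sketch using only the arrays defined above), the standardised training features should be approximately zero-mean; the column standard deviations should be close to one except for (near-)constant features, which the `1e-6` term keeps finite.
In [ ]:
# Sanity check: standardised training features should be ~zero-mean with (near) unit std
print('max |column mean| (train):', np.abs(np.mean(X_train, axis=0)).max())
print('column std range (train): [%g, %g]' % (np.std(X_train, axis=0).min(), np.std(X_train, axis=0).max()))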
In [ ]:
def print_dataset_info(X_train, Y_train, X_test, Y_test):
    """Print basic statistics of a multi-label dataset."""
    N_train, D = X_train.shape
    K = Y_train.shape[1]
    N_test = X_test.shape[0]
    print('%-45s %s' % ('Number of training examples:', '{:,}'.format(N_train)))
    print('%-45s %s' % ('Number of test examples:', '{:,}'.format(N_test)))
    print('%-45s %s' % ('Number of features:', '{:,}'.format(D)))
    print('%-45s %s' % ('Number of labels:', '{:,}'.format(K)))
    # average number of positive labels per example
    avgK_train = np.mean(np.sum(Y_train, axis=1))
    avgK_test = np.mean(np.sum(Y_test, axis=1))
    print('%-45s %.3f (%.2f%%)' % ('Average number of positive labels (train):', avgK_train, 100 * avgK_train / K))
    print('%-45s %.3f (%.2f%%)' % ('Average number of positive labels (test):', avgK_test, 100 * avgK_test / K))
    #print('%-45s %.4f%%' % ('Average label occurrence (train):', np.mean(np.sum(Y_train, axis=0)) / N_train))
    #print('%-45s %.4f%%' % ('Average label occurrence (test):', np.mean(np.sum(Y_test, axis=0)) / N_test))
    # fraction of entries in the label matrix that are positive
    print('%-45s %.3f%%' % ('Label density (percent) (train):', 100 * np.sum(Y_train) / np.prod(Y_train.shape)))
    print('%-45s %.3f%%' % ('Label density (percent) (test):', 100 * np.sum(Y_test) / np.prod(Y_test.shape)))
In [ ]:
print('%-45s %s' % ('Dataset:', dataset_name))
print_dataset_info(X_train, Y_train, X_test, Y_test)
In [ ]:
def avgF1(Y_true, Y_pred):
    """Sample-averaged F1, maximised over a grid of decision thresholds."""
    # thresholds
    #THs = [0, 0.05, 0.10, 0.15, 0.2, 0.25, 0.30, 0.35, 0.4, 0.45, 0.5, 0.55, 0.60, 0.65, 0.70, 0.75]  # SPEN
    THs = [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85]
    F1 = Parallel(n_jobs=-1)(delayed(f1_score_nowarn)(Y_true, Y_pred >= th, average='samples') for th in THs)
    bestix = np.argmax(F1)
    print('best threshold: %g, best F1: %g, #examples: %g' % (THs[bestix], F1[bestix], Y_true.shape[0]))
    return F1[bestix]
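To illustrate what avgF1 expects (a toy sketch with random data, not part of the experiment), both arguments are (n_examples, n_labels) arrays: a binary ground-truth matrix and real-valued scores in [0, 1].
In [ ]:
# Toy illustration of avgF1 on random data (shapes only; not used in the experiment)
rng = np.random.RandomState(0)
Y_toy = (rng.rand(50, 10) > 0.8).astype(int)  # binary ground truth, ~20% positives
P_toy = rng.rand(50, 10)                      # scores in [0, 1]
avgF1(Y_toy, P_toy)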
In [ ]:
C_set = [0.01, 0.1, 1, 10, 100, 1000] # bibtex, bookmarks level 1
p_set = [1, 2, 3, 4, 5, 6]
parameters = [{'C1': C_set, 'p': p_set}]
scorer = {'F1': make_scorer(avgF1)}
In [ ]:
clf = GridSearchCV(MLC(), parameters, scoring=scorer, cv=5, n_jobs=1, refit='F1')
clf.fit(X_train, Y_train, verbose=1)
#pkl.dump(clf, open(fmodel_f1, 'wb'))
In [ ]:
clf.cv_results_['mean_test_F1'].reshape(len(C_set), len(p_set))
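The grid of mean CV scores is easier to read as a heatmap (a sketch using the seaborn import above, and assuming the same C-ordered reshape as the cell above, i.e. C1 on rows and p on columns).
In [ ]:
# Visualise the CV grid of mean F1 scores: rows are C1 values, columns are p values
scores = clf.cv_results_['mean_test_F1'].reshape(len(C_set), len(p_set))
ax = sns.heatmap(pd.DataFrame(scores, index=C_set, columns=p_set), annot=True, fmt='.3f')
ax.set_xlabel('p')
ax.set_ylabel('C1')
ax.set_title('Mean CV F1 (samples average)')
plt.show()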
In [ ]:
# CV results for bibtex
best_C1 = 100
best_p = 3
best_threshold = 0.5
clf = MLC(C1=best_C1, p=best_p)
clf.fit(X_train, Y_train, verbose=0, njobs=1)
clf.best_threshold = best_threshold
In [ ]:
pkl.dump(clf, open('data/mlc_%s.pkl' % dataset_name, 'wb'))
In [ ]:
clf = pkl.load(open('data/mlc_%s.pkl' % dataset_name, 'rb'))
In [ ]:
f1_score_nowarn(Y_test, clf.decision_function(X_test) >= clf.best_threshold, average='samples')
In [ ]:
f1_score_nowarn(Y_test, clf.decision_function(X_test) >= clf.best_threshold, average='macro')
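It can also be useful to see how the test-set sample F1 varies with the decision threshold rather than relying on the single tuned value (a sketch; the threshold grid here simply mirrors the one used in avgF1).
In [ ]:
# Sweep decision thresholds on the test set and plot sample-averaged F1
scores_test = clf.decision_function(X_test)
ths = np.arange(0.0, 0.9, 0.05)
f1s = [f1_score_nowarn(Y_test, scores_test >= th, average='samples') for th in ths]
plt.plot(ths, f1s, marker='o')
plt.axvline(clf.best_threshold, color='r', linestyle='--', label='tuned threshold')
plt.xlabel('threshold')
plt.ylabel('F1 (samples)')
plt.legend()
plt.show()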
In [ ]:
sys.path.append('src/tools')
from evaluate import calc_RPrecision, calc_RPrecision_HitRate
In [ ]:
rps, ind = calc_RPrecision(Y_test, clf.decision_function(X_test), axis=0)
print(np.mean(rps), len(ind), Y_test.shape[0])
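For reference, example-wise R-Precision takes, for each test example with R positive labels, the R highest-scoring labels and measures the fraction that are truly positive. Below is a minimal numpy sketch of that definition; it is an assumption about what calc_RPrecision computes, not its actual implementation.
In [ ]:
# Minimal sketch of example-wise R-Precision (assumed definition, not the evaluate.calc_RPrecision code)
def rprecision_per_example(y_true, y_score):
    R = int(np.sum(y_true))          # number of positive labels for this example
    if R == 0:
        return None                  # undefined when there are no positives
    topR = np.argsort(-y_score)[:R]  # indices of the R highest-scoring labels
    return np.sum(y_true[topR]) / R  # fraction of the top-R labels that are true positives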
In [ ]:
def calc_RP(Y_true, Y_pred):
    """R-Precision computed column-wise: one score per column of Y_true/Y_pred."""
    assert Y_true.shape == Y_pred.shape
    rps = []
    for j in range(Y_true.shape[1]):
        y_true = Y_true[:, j]
        y_pred = Y_pred[:, j]
        rp, _ = calc_RPrecision_HitRate(y_true, y_pred)
        rps.append(rp)
    return rps
In [ ]:
rps = calc_RP(Y_test.T, clf.decision_function(X_test).T)
print(np.mean(rps), len(rps), Y_test.shape[0])
In [ ]:
rps, ind = calc_RPrecision(Y_test, clf.decision_function(X_test), axis=1)
print(np.mean(rps), len(ind), Y_test.shape[1])
In [ ]:
rps = calc_RP(Y_test, clf.decision_function(X_test))
print(np.mean(rps), len(rps), Y_test.shape[1])