In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os, sys, time
import pickle as pkl
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, f1_score, make_scorer, label_ranking_loss
from scipy.sparse import lil_matrix, issparse
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
sys.path.append('src')
from PClassificationMLC import PClassificationMLC
from BinaryRelevance import BinaryRelevance
from evaluate import evaluatePrecision, evalPred, avgPrecisionK, f1_score_nowarn
In [ ]:
data_dir = 'data/aotm-2011'
fxtrain = os.path.join(data_dir, 'X_train_audio.pkl')
fytrain = os.path.join(data_dir, 'Y_train_audio.pkl')
fxdev = os.path.join(data_dir, 'X_dev_audio.pkl')
fydev = os.path.join(data_dir, 'Y_dev_audio.pkl')
fxtest = os.path.join(data_dir, 'X_test_audio.pkl')
fytest = os.path.join(data_dir, 'Y_test_audio.pkl')
In [ ]:
X_train = pkl.load(open(fxtrain, 'rb'))
Y_train = pkl.load(open(fytrain, 'rb'))
X_dev = pkl.load(open(fxdev, 'rb'))
Y_dev = pkl.load(open(fydev, 'rb'))
X_test = pkl.load(open(fxtest, 'rb'))
Y_test = pkl.load(open(fytest, 'rb'))
In [ ]:
print('Train: %15s %15s' % (X_train.shape, Y_train.shape))
print('Dev : %15s %15s' % (X_dev.shape, Y_dev.shape))
print('Test : %15s %15s' % (X_test.shape, Y_test.shape))
Columns with all zeros.
In [ ]:
#Ks = np.dot(Y_train, np.ones(Y_train.shape[1]))
nullcols = []
K = Y_train.shape[1]
for k in np.arange(K):
    kpos = np.sum(Y_train[:, k])
    if kpos == 0:
        nullcols.append(k)
    if (k+1) % 100 == 0:
        sys.stdout.write('\r%d / %d' % (k+1, K))
        sys.stdout.flush()
In [ ]:
len(nullcols)
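Labels that never occur in the training set cannot be learned; the sketch below (an assumed step, not part of the original pipeline) shows one way to drop these all-zero columns, applying the same column subset to the dev and test label matrices.
In [ ]:
# sketch (assumed step): drop all-zero label columns consistently across the splits
# the *_nz variables are hypothetical and not used by the cells below
keepcols = np.setdiff1d(np.arange(K), nullcols)
Y_train_nz = Y_train[:, keepcols]
Y_dev_nz = Y_dev[:, keepcols]
Y_test_nz = Y_test[:, keepcols]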
Train.
In [ ]:
clf = PClassificationMLC(C=100, p=2, weighting=True)
clf.fit_SGD(X_train, Y_train, batch_size=1000, n_epochs=20, learning_rate=0.05)
In [ ]:
# w0 is not defined in this notebook; it is assumed to be a warm-start weight vector (e.g. from a previous run)
clf1 = PClassificationMLC(C=100, p=2, weighting=True)
clf1.fit_SGD(X_train, Y_train, w=w0, batch_size=200, n_epochs=10, learning_rate=0.05)
In [ ]:
def evaluate_SGD(clf, eval_func, X_test, Y_test, threshold=None, batch_size=100):
    assert X_test.shape[0] == Y_test.shape[0]
    N = X_test.shape[0]
    metrics_all = []
    n_batches = int((N-1) / batch_size) + 1
    indices = np.arange(N)
    for nb in range(n_batches):
        sys.stdout.write('\r %d / %d' % (nb+1, n_batches))
        sys.stdout.flush()
        ix_start = nb * batch_size
        ix_end = min((nb+1) * batch_size, N)
        ix = indices[ix_start:ix_end]
        X = X_test[ix]
        Y_true = Y_test[ix]
        if issparse(Y_true):
            Y_true = Y_true.toarray()
        Y_pred = clf.decision_function(X)
        if issparse(Y_pred):
            Y_pred = Y_pred.toarray()
        if threshold is not None:
            Y_pred = Y_pred >= threshold
        metrics = eval_func(Y_true, Y_pred)
        metrics_all = np.concatenate((metrics_all, metrics), axis=-1)
    return metrics_all
In [ ]:
def calc_F1(Y_true, Y_pred):
    """
    Compute F1 scores for multilabel prediction, one score for each example.
    precision = true_positive / n_positive
    recall = true_positive / n_true
    f1 = (2 * precision * recall) / (precision + recall) = 2 * true_positive / (n_true + n_positive)
    """
    assert Y_true.shape == Y_pred.shape
    N, K = Y_true.shape
    OneK = np.ones(K)
    n_true = np.dot(Y_true, OneK)          # number of ground-truth positives per example
    n_positive = np.dot(Y_pred, OneK)      # number of predicted positives per example
    true_positive = np.dot(np.multiply(Y_true, Y_pred), OneK)
    numerator = 2 * true_positive
    denominator = n_true + n_positive
    nonzero_ix = np.nonzero(denominator)[0]
    f1 = np.zeros(N)
    f1[nonzero_ix] = np.divide(numerator[nonzero_ix], denominator[nonzero_ix])
    return f1
In [17]:
from evaluate import calc_F1
# test `calc_F1`
n = 50
k = 100000
for i in range(100):
    sys.stdout.write('\r%d / %d' % (i+1, 100))
    sys.stdout.flush()
    y0 = np.random.rand(n, k) >= 0.9996
    #print('\navg #positives:', np.mean(np.sum(y0, axis=1)))
    y1 = np.random.randn(n, k) >= 0.99
    #print('avg #positives:', np.mean(np.sum(y1, axis=1)))
    pk1 = f1_score_nowarn(y0, y1, average='samples')
    pk2 = np.mean(calc_F1(y0, y1))
    #print(pk1)
    #print(pk2)
    assert np.isclose(pk1, pk2)
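A quick usage sketch (not part of the original run): mean per-example F1 on the dev set, computed in batches at an arbitrary threshold of 0.5.
In [ ]:
# usage sketch: batched dev-set F1 (the 0.5 threshold is an arbitrary choice)
f1_dev = evaluate_SGD(clf, calc_F1, X_dev, Y_dev, threshold=0.5, batch_size=500)
print('\nMean sample F1 on dev: %g' % np.mean(f1_dev))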
In [ ]:
def calc_precisionK(Y_true, Y_pred):
    """
    Compute Precision@K, one score for each example.
    - predictions are thresholded at the K-th largest predicted score, where K is the number of
      positives in the ground truth of that example
    - Precision@K = #true_positives / #positives_in_ground_truth; by construction the number of
      predicted positives equals the number of positives in the ground truth
    """
    assert Y_true.shape == Y_pred.shape
    N, K = Y_true.shape
    OneK = np.ones(K)
    KPosAll = np.dot(Y_true, OneK).astype(int)  # per-example K (np.int is removed in recent NumPy)
    assert np.all(KPosAll > 0)
    rows = np.arange(N)
    sortedIx = np.argsort(-Y_pred, axis=1)
    cols = sortedIx[rows, KPosAll-1]            # index of thresholds (the K-th largest scores, NOTE index starts at 0)
    thresholds = Y_pred[rows, cols]             # the K-th largest scores
    Y_pred_bin = Y_pred >= thresholds[:, None]  # convert scores to binary predictions
    true_positives = np.multiply(Y_true, Y_pred_bin)
    return np.dot(true_positives, OneK) / KPosAll
In [18]:
from evaluate import calc_precisionK
# test `calc_precisionK`
n = 50
k = 100000
for i in range(100):
    sys.stdout.write('\r%d / %d' % (i+1, 100))
    sys.stdout.flush()
    y0 = np.random.rand(n, k) >= 0.9996
    #print('\navg K:', np.mean(np.sum(y0, axis=1)))
    y1 = np.random.randn(n, k)
    pk1 = avgPrecisionK(y0, y1)
    pk2 = np.mean(calc_precisionK(y0, y1))
    assert np.isclose(pk1, pk2)
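Precision@K needs raw scores rather than thresholded predictions, so leave threshold=None; a usage sketch (not part of the original run), assuming every dev example has at least one positive label (calc_precisionK asserts this).
In [ ]:
# usage sketch: mean Precision@K on the dev set; threshold=None passes raw decision values through
pak_dev = evaluate_SGD(clf, calc_precisionK, X_dev, Y_dev, threshold=None, batch_size=500)
print('\nMean Precision@K on dev: %g' % np.mean(pak_dev))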
In [ ]:
THs = [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85]
In [ ]:
for th in THs:
    metrics = evaluate_SGD(clf=clf, eval_func=calc_F1, X_test=X_test, Y_test=Y_test, threshold=th, batch_size=500)
    print(' Threshold: %.2g, F1: %g' % (th, np.mean(metrics)))
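matplotlib is imported above but unused; a plotting sketch (which re-runs the sweep above) to visualise how F1 varies with the threshold.
In [ ]:
# plotting sketch: F1 vs. threshold on the test set (re-computes the sweep above)
F1s = [np.mean(evaluate_SGD(clf=clf, eval_func=calc_F1, X_test=X_test, Y_test=Y_test,
                            threshold=th, batch_size=500)) for th in THs]
plt.plot(THs, F1s, marker='o')
plt.xlabel('Threshold')
plt.ylabel('Mean sample F1')
plt.show()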
In [ ]:
#import inspect
#inspect.signature(evaluate_SGD).parameters.keys()
#'threshold' in inspect.signature(evaluate_SGD).parameters.keys()
In [ ]:
# NOT needed, we can train in parallel
class GridSearch():
    """Validation by grid search, supported params: C, p"""
    def __init__(self, estimator, param_grid, scorer):
        self.estimator = estimator
        if self.check_grid(param_grid):
            self.param_grid = param_grid
        self.scorer = scorer

    def check_grid(self, grid):
        """Currently only support a few specific parameters"""
        if len(grid) > 1:
            print('Please put all parameter configurations in the first dictionary!')
            return False
        supported = ['C', 'p']
        #for d in grid:
        #    for key in d.keys():
        for key in grid[0].keys():
            if key not in supported:
                print('Parameter %s NOT supported!' % key)
                return False
        return True

    def fit(self, X_train, Y_train, X_dev, Y_dev):
        results = dict()
        params = self.param_grid[0]  # dict of parameter lists, e.g. {'C': [...], 'p': [...]}
        if len(params) > 1:
            for C in params['C']:
                for p in params['p']:
                    clf = self.estimator(X_train=X_train, Y_train=Y_train, C=C, p=p)
                    # (left unfinished: score clf on the dev set with self.scorer and store in results)
In [ ]:
#ranges = range(-6, 7)
#ranges = range(-6, 5)
#parameters = [{'C': sorted([10**(e) for e in ranges] + [3 * 10**(e) for e in ranges]),
#'r': [0.5, 1, 2, 4]}]
#scorer = {'Prec': make_scorer(avgPrecisionK)}
In [ ]:
#clf1 = GridSearchCV(PClassificationMLC(), parameters, scoring=scorer, cv=5, n_jobs=1, refit='Prec')
#clf1.fit(X1_train, Y1_train)
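Since GridSearchCV is left commented out, here is a minimal validation sketch (grid values are assumptions, not the original procedure): train one model per (C, p) on the training set, score Precision@K on the dev set, and keep the best pair; these independent fits are what can be run in parallel.
In [ ]:
# validation sketch (assumed grid and procedure): pick (C, p) by dev-set Precision@K
C_grid = [1, 10, 100]   # assumed values
p_grid = [1, 2, 3]      # assumed values
best_params, best_score = None, -np.inf
for C in C_grid:
    for p in p_grid:
        clf_cv = PClassificationMLC(C=C, p=p, weighting=True)
        clf_cv.fit_SGD(X_train, Y_train, batch_size=1000, n_epochs=20, learning_rate=0.05)
        score = np.mean(evaluate_SGD(clf_cv, calc_precisionK, X_dev, Y_dev,
                                     threshold=None, batch_size=500))
        print('\nC = %g, p = %g, dev Precision@K = %g' % (C, p, score))
        if score > best_score:
            best_params, best_score = (C, p), score
print('\nBest (C, p):', best_params)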