Playlist generation by multilabel learning (P-Classification)


In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import os, sys, time
import pickle as pkl
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, f1_score, make_scorer, label_ranking_loss
from scipy.sparse import lil_matrix, issparse

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
sys.path.append('src')
from PClassificationMLC import PClassificationMLC
from BinaryRelevance import BinaryRelevance
from evaluate import evaluatePrecision, evalPred, avgPrecisionK, f1_score_nowarn

In [ ]:
data_dir = 'data/aotm-2011'
fxtrain = os.path.join(data_dir, 'X_train_audio.pkl')
fytrain = os.path.join(data_dir, 'Y_train_audio.pkl')
fxdev   = os.path.join(data_dir, 'X_dev_audio.pkl')
fydev   = os.path.join(data_dir, 'Y_dev_audio.pkl')
fxtest  = os.path.join(data_dir, 'X_test_audio.pkl')
fytest  = os.path.join(data_dir, 'Y_test_audio.pkl')

Data loading


In [ ]:
X_train = pkl.load(open(fxtrain, 'rb'))
Y_train = pkl.load(open(fytrain, 'rb'))
X_dev   = pkl.load(open(fxdev,   'rb'))
Y_dev   = pkl.load(open(fydev,   'rb'))
X_test  = pkl.load(open(fxtest,  'rb'))
Y_test  = pkl.load(open(fytest,  'rb'))

In [ ]:
print('Train: %15s %15s' % (X_train.shape, Y_train.shape))
print('Dev  : %15s %15s' % (X_dev.shape,   Y_dev.shape))
print('Test : %15s %15s' % (X_test.shape,  Y_test.shape))

Check for columns of Y_train that are all zeros, i.e. labels with no positive example in the training set.


In [ ]:
#Ks = np.dot(Y_train, np.ones(Y_train.shape[1]))
nullcols = []
K = Y_train.shape[1]
for k in np.arange(K):
    kpos = np.sum(Y_train[:,k])
    if kpos == 0: 
        nullcols.append(k)
    if (k+1) % 100 == 0:
        sys.stdout.write('\r%d / %d' % (k+1, K))
        sys.stdout.flush()

In [ ]:
len(nullcols)
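
A vectorized check gives the same set of empty columns without the explicit loop (a minimal sketch, assuming Y_train supports .sum(axis=0), which both dense arrays and scipy sparse matrices do).


In [ ]:
# number of positives per column, flattened to a 1-D array (works for dense and sparse Y_train)
col_sums = np.asarray(Y_train.sum(axis=0)).ravel()
nullcols_vec = np.where(col_sums == 0)[0]
assert len(nullcols_vec) == len(nullcols)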

Train.


In [ ]:
clf = PClassificationMLC(C=100, p=2, weighting=True)
clf.fit_SGD(X_train, Y_train, batch_size=1000, n_epochs=20, learning_rate=0.05)

In [ ]:
# warm-start run: `w0` is assumed to be an initial weight vector (e.g. the weights
# learned by the previous run); it is not defined elsewhere in this notebook
clf1 = PClassificationMLC(C=100, p=2, weighting=True)
clf1.fit_SGD(X_train, Y_train, w=w0, batch_size=200, n_epochs=10, learning_rate=0.05)

In [ ]:
def evaluate_SGD(clf, eval_func, X_test, Y_test, threshold=None, batch_size=100):
    assert X_test.shape[0] == Y_test.shape[0]
    
    N = X_test.shape[0]
    metrics_all = []
    n_batches = int((N-1) / batch_size) + 1
    indices = np.arange(N)
    
    for nb in range(n_batches):
        sys.stdout.write('\r %d / %d' % (nb+1, n_batches))
        sys.stdout.flush()
        
        ix_start = nb * batch_size
        ix_end = min((nb+1) * batch_size, N)
        ix = indices[ix_start:ix_end]
        
        X = X_test[ix]
        Y_true = Y_test[ix]
        if issparse(Y_true):
            Y_true = Y_true.toarray()
        Y_pred = clf.decision_function(X)
        if issparse(Y_pred):
            Y_pred = Y_pred.toarray()
        if threshold is not None:
            Y_pred = Y_pred >= threshold
            
        metrics = eval_func(Y_true, Y_pred)
        metrics_all = np.concatenate((metrics_all, metrics), axis=-1)
        
    return metrics_all

In [ ]:
def calc_F1(Y_true, Y_pred):
    """
    Compute F1 scores for multilabel prediction, one score for each example.
    precision = true_positive / n_positive
    recall = true_positive / n_true
    f1 = (2 * precision * recall) / (precision + recall) = 2 * true_positive / (n_true + n_positive)
    """
    assert Y_true.shape == Y_pred.shape
    N, K = Y_true.shape
    OneK = np.ones(K)
    
    n_true = np.dot(Y_true, OneK)
    n_positive = np.dot(Y_pred, OneK)
    true_positive = np.dot(np.multiply(Y_true, Y_pred), OneK)
    
    numerator = 2 * true_positive
    denominator = n_true + n_positive
    nonzero_ix = np.nonzero(denominator)[0]
    
    f1 = np.zeros(N)
    f1[nonzero_ix] = np.divide(numerator[nonzero_ix], denominator[nonzero_ix])
    
    return f1
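
A tiny worked example of the per-example F1 scores (toy matrices, not the playlist data).


In [ ]:
Y_true_toy = np.array([[1, 0, 1, 0],
                       [0, 1, 0, 0]])
Y_pred_toy = np.array([[1, 1, 0, 0],
                       [0, 1, 0, 1]])
# example 1: TP=1, n_true=2, n_positive=2 -> F1 = 2*1/(2+2) = 0.5
# example 2: TP=1, n_true=1, n_positive=2 -> F1 = 2*1/(1+2) ~ 0.667
print(calc_F1(Y_true_toy, Y_pred_toy))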

In [17]:
from evaluate import calc_F1
# test `calc_F1`
n = 50
k = 100000
for i in range(100):
    sys.stdout.write('\r%d / %d' % (i+1, 100))
    sys.stdout.flush()
    
    y0 = np.random.rand(n, k) >= 0.9996
    #print('\navg #positives:', np.mean(np.sum(y0, axis=1)))
    y1 = np.random.randn(n, k) >= 0.99
    #print('avg #positives:', np.mean(np.sum(y1, axis=1)))
    pk1 = f1_score_nowarn(y0, y1, average='samples')
    pk2 = np.mean(calc_F1(y0, y1))
    #print(pk1)
    #print(pk2)
    assert np.isclose(pk1, pk2)


100 / 100

In [ ]:
def calc_precisionK(Y_true, Y_pred):
    """
    Compute Precision@K, one score for each example.
    - predictions are thresholded at the K-th largest predicted score, where K is the number of positives in the ground truth
    - Precision@K = #true_positives / K
      (by construction the number of predicted positives equals K, up to ties at the threshold)
    """
    assert Y_true.shape == Y_pred.shape
    N, K = Y_true.shape
    OneK = np.ones(K)
    KPosAll = np.dot(Y_true, OneK).astype(int)
    assert np.all(KPosAll > 0)
    
    rows = np.arange(N)
    sortedIx = np.argsort(-Y_pred, axis=1)
    cols = sortedIx[rows, KPosAll-1]  # index of thresholds (the K-th largest scores, NOTE index starts at 0)
    thresholds = Y_pred[rows, cols]   # the K-th largest scores
    Y_pred_bin = Y_pred >= thresholds[:, None]  # convert scores to binary predictions
    
    true_positives = np.multiply(Y_true, Y_pred_bin)
    return np.dot(true_positives, OneK) / KPosAll
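
A small worked example of the thresholding step (toy data).


In [ ]:
Y_true_toy = np.array([[1, 0, 1, 0]])          # K = 2 positives
scores_toy = np.array([[0.9, 0.8, 0.1, 0.3]])  # the 2nd largest score is 0.8
# thresholding at 0.8 keeps labels {0, 1}; only label 0 is a true positive
print(calc_precisionK(Y_true_toy, scores_toy))  # -> [0.5]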

In [18]:
from evaluate import calc_precisionK
# test `calc_precisionK`
n = 50
k = 100000
for i in range(100):
    sys.stdout.write('\r%d / %d' % (i+1, 100))
    sys.stdout.flush()
    
    y0 = np.random.rand(n, k) >= 0.9996
    #print('\navg K:', np.mean(np.sum(y0, axis=1)))
    y1 = np.random.randn(n, k)
    pk1 = avgPrecisionK(y0, y1)
    pk2 = np.mean(calc_precisionK(y0, y1))
    assert np.isclose(pk1, pk2)


100 / 100

In [ ]:
THs = [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85]

In [ ]:
for th in THs:
    metrics = evaluate_SGD(clf=clf, eval_func=calc_F1, X_test=X_test, Y_test=Y_test, threshold=th, batch_size=500)
    print(' Threshold: %.2g, F1: %g' % (th, np.mean(metrics)))
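
The same sweep can be run on the dev set to pick a threshold before scoring the test set once (a sketch reusing evaluate_SGD and calc_F1 from above; the batch size is arbitrary).


In [ ]:
dev_f1 = [np.mean(evaluate_SGD(clf, calc_F1, X_dev, Y_dev, threshold=th, batch_size=500)) for th in THs]
best_th = THs[int(np.argmax(dev_f1))]
test_f1 = np.mean(evaluate_SGD(clf, calc_F1, X_test, Y_test, threshold=best_th, batch_size=500))
print('\nBest threshold (dev): %.2g, Test F1: %g' % (best_th, test_f1))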

In [ ]:
#import inspect
#inspect.signature(evaluate).parameters.keys()
#'threshold' in inspect.signature(evaluate).parameters.keys()

In [ ]:
# NOT needed, we can train in parallel
class GridSearch():
    """Validation by grid search, supported params: C, p"""
    
    def __init__(self, estimator, param_grid, scorer):
        self.estimator = estimator
        if self.check_grid(param_grid):
            self.param_grid = param_grid
        self.scorer = scorer
        
    def check_grid(self, grid):
        """Currently only support a few specific parameters"""
        if len(grid) > 1: 
            print('Please put all parameter configurations in the first dictionary!')
            return False
        
        supported = ['C', 'p']
        #for d in grid:
        #for key in d.keys():
        for key in grid[0].keys():
            if key not in supported:
                print('Parameter %s NOT supported!' % key)
                return False
        return True
            
    def fit(self, X_train, Y_train, X_dev, Y_dev):
        """Train one model per (C, p) pair and score it on the dev set."""
        grid = self.param_grid[0]
        results = dict()
        for C in grid.get('C', [1]):
            for p in grid.get('p', [1]):
                # assumes the estimator follows the PClassificationMLC interface used above
                clf = self.estimator(C=C, p=p)
                clf.fit_SGD(X_train, Y_train, batch_size=1000, n_epochs=20, learning_rate=0.05)
                results[(C, p)] = self.scorer(clf, X_dev, Y_dev)
        self.results_ = results
        self.best_params_ = max(results, key=results.get)
        return self
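
A hypothetical usage sketch of the helper above: the grid values and the scorer are assumptions, and any callable taking (estimator, X, Y) and returning a score to maximise fits the interface.


In [ ]:
param_grid = [{'C': [1, 10, 100], 'p': [1, 2, 4]}]

def f1_scorer(estimator, X, Y):
    return np.mean(evaluate_SGD(estimator, calc_F1, X, Y, threshold=0.3))

gs = GridSearch(PClassificationMLC, param_grid, f1_scorer)
gs.fit(X_train, Y_train, X_dev, Y_dev)
print(gs.best_params_)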

In [ ]:
#ranges = range(-6, 7)
#ranges = range(-6, 5)
#parameters = [{'C': sorted([10**(e) for e in ranges] + [3 * 10**(e) for e in ranges]),
               #'r': [0.5, 1, 2, 4]}]
#scorer = {'Prec': make_scorer(avgPrecisionK)}

In [ ]:
#clf1 = GridSearchCV(PClassificationMLC(), parameters, scoring=scorer, cv=5, n_jobs=1, refit='Prec')
#clf1.fit(X1_train, Y1_train)