In [1]:
%matplotlib inline

In [2]:
import sys

import numpy as np
import matplotlib.pyplot as plt

import os
os.environ['THEANO_FLAGS']='floatX=float32,device=gpu,nvcc.fastmath=True'

import theano
import theano.tensor as T
tf = theano.config.floatX

from math import sqrt
from scipy import io
from sklearn.metrics import roc_auc_score, average_precision_score


Using gpu device 0: GeForce GRID K520

Load data


In [3]:
DATA_DIR = '/dataeast/dawen/'

In [4]:
K = 1024

In [5]:
X_train = np.load(os.path.join(DATA_DIR, 'X_train_K%d.npy' % K))
X_test = np.load(os.path.join(DATA_DIR, 'X_test_K%d.npy' % K))
y_train = np.load(os.path.join(DATA_DIR, 'y_train.npy'))
y_test = np.load(os.path.join(DATA_DIR, 'y_test.npy'))

y_train[y_train == 0] = -1
y_test[y_test == 0] = -1

In [6]:
#X_train = X_train / np.sum(X_train, axis=1, keepdims=True).astype(tf)
#X_test = X_test / np.sum(X_test, axis=1, keepdims=True).astype(tf)

# for numpy 1.6
X_train = X_train / (np.sum(X_train, axis=1)[:, None]).astype(tf)
X_test = X_test / (np.sum(X_test, axis=1)[:, None]).astype(tf)

Define the (single-layered) network


In [7]:
rng = np.random.RandomState(seed=98765)

In [8]:
n_samples, n_feats = X_train.shape
n_tags = y_train.shape[1]

batch_size = 100

X_batch = theano.shared(np.zeros((batch_size, n_feats), dtype=tf), 'X_batch', borrow=True)
y_batch = theano.shared(np.zeros((batch_size, n_tags), dtype=np.int32), 'y_batch', borrow=True)

In [9]:
class LogisticRegressionLayer:
    def __init__(self, rng, input, n_in, n_out):
        # input and params
        self.input = input
        
        self.W = theano.shared(0.01 * rng.randn(n_in, n_out).astype(tf), name='W', borrow=True)
        self.b = theano.shared(0.01 * rng.randn(n_out).astype(tf), name='b', borrow=True)
        
        self.params = [self.W, self.b]

        # AdaGrad 
        self.hist_grad_W = theano.shared(np.zeros((n_in, n_out), dtype=tf), name='hist_grad_W', borrow=True)
        self.hist_grad_b = theano.shared(np.zeros(n_out, dtype=tf), name='hist_grad_b', borrow=True)
        
        self.hist_grads = [self.hist_grad_W, self.hist_grad_b]
        
        # output
        self.linout = T.dot(self.input, self.W) + self.b
        
        self.L2_sq = T.sum(T.sqr(self.W)) + T.sum(T.sqr(self.b))
        
    def neg_log_likeli(self, y):
        return T.mean(T.nnet.softplus(T.neg(y * self.linout)))
    
    def set_params(self, W, b):
        self.W.set_value(W.astype(tf))
        self.b.set_value(b.astype(tf))

In [10]:
lam = T.scalar(name='lambda')

logistic_layer = LogisticRegressionLayer(rng, X_batch, n_feats, n_tags)
params = logistic_layer.params
hist_grads = logistic_layer.hist_grads

In [11]:
cost = logistic_layer.neg_log_likeli(y_batch) + lam * logistic_layer.L2_sq
grads = T.grad(cost, params)

In [12]:
eta = T.scalar(name='eta')
  
eps = 1e-10

updates = []
for param_i, grad_i, hist_grad_i in zip(params, grads, hist_grads):
    new_hist_grad_i = hist_grad_i + T.sqr(grad_i)
    updates.append((hist_grad_i, new_hist_grad_i))
    updates.append((param_i, param_i - eta / (eps + T.sqrt(new_hist_grad_i)) * grad_i))
    
update_f = theano.function(inputs=[eta, lam], outputs=[logistic_layer.linout, cost], updates=updates)

In [13]:
def predict(X, W, b, y):
    h = X.dot(W.get_value()) + b.get_value()
    cost = lam * (np.sum(W.get_value()**2) + np.sum(b.get_value()**2))
    cost += np.mean(np.log(1 + np.exp(-y * h)))
    return cost, h

In [14]:
cost_train = []
cost_test = []
acc_train = []
acc_test = []

logistic_layer.set_params(0.01 * rng.randn(n_feats, n_tags), 0.01 * rng.randn(n_tags))
if np.any(hist_grads[0].get_value()):
    hist_grads[0].set_value(np.zeros((n_feats, n_tags)).astype(tf))
    hist_grads[1].set_value(np.zeros(n_tags).astype(tf))

In [15]:
eta = 1.
lam = 0

In [16]:
nepochs = 50
print 'epoch\tcost.train\tcost.test\ttrain.acc\ttest.acc'
sys.stdout.flush()

for epoch in xrange(nepochs):
    # Execute stochastic gradient updates
    for i in xrange(0, n_samples, batch_size):
        inds = np.arange(i, min(i + batch_size, n_samples), dtype=int)
        X_batch.set_value(X_train[inds].astype(tf))
        y_batch.set_value(y_train[inds].astype(np.int32))
        update_f(eta, lam)
    cost, pred_train = predict(X_train, logistic_layer.W, logistic_layer.b, y_train)
    cost_train.append(cost)
    acc_train.append(1 - np.mean(np.logical_xor(pred_train > 0, y_train > 0)))
    
    cost, pred_test = predict(X_test, logistic_layer.W, logistic_layer.b, y_test)
    cost_test.append(cost)
    acc_test.append(1 - np.mean(np.logical_xor(pred_test > 0, y_test > 0)))
    
    pred_test = 1. / (1 + np.exp(-pred_test))
            
    print('%d\t%.3e\t%.3e\t%.3e\t%.3e' % (epoch, cost_train[-1], cost_test[-1], acc_train[-1], acc_test[-1]))
    sys.stdout.flush()


epoch	cost.train	cost.test	train.acc	test.acc
0	3.931e-02	1.679e-01	9.887e-01	9.544e-01
15	3.801e-02	1.655e-01	9.887e-01	9.546e-01
16	3.799e-02	1.655e-01	9.887e-01	9.546e-01
17	3.798e-02	1.655e-01	9.887e-01	9.546e-01
18	3.796e-02	1.655e-01	9.887e-01	9.546e-01
19	3.795e-02	1.655e-01	9.887e-01	9.546e-01
20	3.793e-02	1.655e-01	9.887e-01	9.546e-01
21	3.792e-02	1.655e-01	9.887e-01	9.546e-01
22	3.791e-02	1.655e-01	9.887e-01	9.546e-01
23	3.790e-02	1.655e-01	9.887e-01	9.546e-01
24	3.788e-02	1.655e-01	9.887e-01	9.546e-01
25	3.787e-02	1.655e-01	9.887e-01	9.546e-01
26	3.786e-02	1.655e-01	9.887e-01	9.546e-01
27	3.786e-02	1.655e-01	9.887e-01	9.546e-01
28	3.785e-02	1.655e-01	9.887e-01	9.546e-01
29	3.784e-02	1.655e-01	9.887e-01	9.546e-01
30	3.783e-02	1.655e-01	9.887e-01	9.546e-01
31	3.782e-02	1.656e-01	9.887e-01	9.546e-01
32	3.782e-02	1.656e-01	9.887e-01	9.546e-01
33	3.781e-02	1.656e-01	9.887e-01	9.546e-01
34	3.780e-02	1.656e-01	9.887e-01	9.546e-01
35	3.780e-02	1.656e-01	9.887e-01	9.546e-01
36	3.779e-02	1.656e-01	9.887e-01	9.546e-01
37	3.779e-02	1.656e-01	9.887e-01	9.546e-01
38	3.778e-02	1.656e-01	9.887e-01	9.546e-01
39	3.777e-02	1.656e-01	9.887e-01	9.546e-01
40	3.777e-02	1.656e-01	9.887e-01	9.546e-01
41	3.776e-02	1.657e-01	9.887e-01	9.546e-01
42	3.776e-02	1.657e-01	9.887e-01	9.546e-01
43	3.775e-02	1.657e-01	9.887e-01	9.546e-01
44	3.775e-02	1.657e-01	9.887e-01	9.546e-01
45	3.775e-02	1.657e-01	9.887e-01	9.546e-01
46	3.774e-02	1.657e-01	9.887e-01	9.546e-01
47	3.774e-02	1.657e-01	9.887e-01	9.546e-01
48	3.773e-02	1.657e-01	9.887e-01	9.546e-01
49	3.773e-02	1.657e-01	9.887e-01	9.546e-01

In [17]:
plt.plot(acc_train)
plt.plot(acc_test)


Out[17]:
[<matplotlib.lines.Line2D at 0x987ad50>]

Evaluation


In [18]:
# compute evaluation metrics
def construct_pred_mask(tags_predicted, predictat):
    n_samples, n_tags = tags_predicted.shape
    rankings = np.argsort(-tags_predicted, axis=1)[:, :predictat]
    tags_predicted_binary = np.zeros_like(tags_predicted, dtype=bool)
    for i in xrange(n_samples):
        tags_predicted_binary[i, rankings[i]] = 1
    return tags_predicted_binary

def per_tag_prec_recall(tags_predicted_binary, tags_true_binary):
    mask = np.logical_and(tags_predicted_binary, tags_true_binary)
    prec = mask.sum(axis=0) / (tags_predicted_binary.sum(axis=0) + np.spacing(1))
    tags_true_count = tags_true_binary.sum(axis=0).astype(float)
    idx = (tags_true_count > 0)
    recall = mask.sum(axis=0)[idx] / tags_true_count[idx]
    return prec, recall


def aroc_ap(tags_true_binary, tags_predicted):
    n_tags = tags_true_binary.shape[1]
    
    auc = list()
    aprec = list()
    for i in xrange(n_tags):
        if np.sum(tags_true_binary[:, i]) != 0:
            auc.append(roc_auc_score(tags_true_binary[:, i], tags_predicted[:, i]))
            aprec.append(average_precision_score(tags_true_binary[:, i], tags_predicted[:, i]))
    return auc, aprec


def print_out_metrics(tags_true_binary, tags_predicted, predictat):
    tags_predicted_binary = construct_pred_mask(tags_predicted, predictat)
    prec, recall = per_tag_prec_recall(tags_predicted_binary, tags_true_binary)
    mprec, mrecall = np.mean(prec), np.mean(recall)
    
    print 'Precision = %.3f (%.3f)' % (mprec, np.std(prec) / sqrt(prec.size))
    print 'Recall = %.3f (%.3f)' % (mrecall, np.std(recall) / sqrt(recall.size))
    print 'F-score = %.3f' % (2 * mprec * mrecall / (mprec + mrecall))

    auc, aprec = aroc_ap(tags_true_binary, tags_predicted)
    print 'AROC = %.3f (%.3f)' % (np.mean(auc), np.std(auc) / sqrt(len(auc)))
    print 'AP = %.3f (%.3f)' % (np.mean(aprec), np.std(aprec) / sqrt(len(aprec)))

In [19]:
tags_predicted = pred_test
print tags_predicted.min(), tags_predicted.max()

div_factor = 1.25
tags_predicted = tags_predicted - div_factor * np.mean(tags_predicted, axis=0)


3.31147e-20 0.943542

In [20]:
predictat = 20
tags_true_binary = (y_test > 0)

print_out_metrics(tags_true_binary, tags_predicted, predictat)


Precision = 0.182 (0.008)
Recall = 0.184 (0.008)
F-score = 0.183
AROC = 0.756 (0.005)
AP = 0.155 (0.007)

In [ ]: