In [1]:
%matplotlib inline

In [2]:
import sys

import numpy as np
import matplotlib.pyplot as plt

import os
os.environ['THEANO_FLAGS']='floatX=float32,device=gpu,nvcc.fastmath=True'

import theano
import theano.tensor as T
tf = theano.config.floatX

from math import sqrt
from scipy import io
from sklearn.metrics import roc_auc_score, average_precision_score


Using gpu device 0: GeForce GRID K520

Load data


In [3]:
K = 1024

In [4]:
DATA_DIR = '/dataeast/dawen/'

In [5]:
X_train = np.load(os.path.join(DATA_DIR, 'X_train_K%d.npy' % K))
X_test = np.load(os.path.join(DATA_DIR, 'X_test_K%d.npy' % K))
y_train = np.load(os.path.join(DATA_DIR, 'y_train.npy'))
y_test = np.load(os.path.join(DATA_DIR, 'y_test.npy'))

y_train[y_train == 0] = -1
y_test[y_test == 0] = -1

In [6]:
#X_train = X_train / np.sum(X_train, axis=1, keepdims=True).astype(tf)
#X_test = X_test / np.sum(X_test, axis=1, keepdims=True).astype(tf)

# for numpy 1.6
X_train = X_train / (np.sum(X_train, axis=1)[:, None]).astype(tf)
X_test = X_test / (np.sum(X_test, axis=1)[:, None]).astype(tf)

Define the network


In [7]:
rng = np.random.RandomState(seed=98765)

rng_CUDA = theano.sandbox.cuda.rng_curand.CURAND_RandomStreams(98765)

In [8]:
n_samples, n_feats = X_train.shape
n_tags = y_train.shape[1]

batch_size = 100

X_batch = theano.shared(np.zeros((batch_size, n_feats), dtype=tf), 'X_batch', borrow=True)
y_batch = theano.shared(np.zeros((batch_size, n_tags), dtype=np.int32), 'y_batch', borrow=True)

In [9]:
class LogisticRegressionLayer:
    def __init__(self, rng, input, n_in, n_out):
        # input and params
        self.input = input
        
        self.W = theano.shared(0.01 * rng.randn(n_in, n_out).astype(tf), name='W', borrow=True)
        self.b = theano.shared(0.01 * rng.randn(n_out).astype(tf), name='b', borrow=True)
        
        self.params = [self.W, self.b]

        # AdaGrad 
        self.hist_grad_W = theano.shared(np.zeros((n_in, n_out), dtype=tf), name='hist_grad_W', borrow=True)
        self.hist_grad_b = theano.shared(np.zeros(n_out, dtype=tf), name='hist_grad_b', borrow=True)
        
        self.hist_grads = [self.hist_grad_W, self.hist_grad_b]
        
        # output
        self.linout = T.dot(self.input, self.W) + self.b
        
        self.L2_sq = T.sum(T.sqr(self.W)) + T.sum(T.sqr(self.b))  
        
    def neg_log_likeli(self, y):
        return T.mean(T.nnet.softplus(T.neg(y * self.linout)))
           
        
class HiddenReluLayer(object):
    def __init__(self, rng, rng_CUDA, input, n_in, n_out, dropout):
        # input and params
        self.input = input
        self.dropout = dropout
        
        self.W = theano.shared(value=0.01 * rng.randn(n_in, n_out).astype(tf), name='W', borrow=True)
        self.b = theano.shared(value=0.01 * rng.randn(n_out).astype(tf), name='b', borrow=True)
        
        self.params = [self.W, self.b]
        
        # AdaGrad
        self.hist_grad_W = theano.shared(np.zeros((n_in, n_out), dtype=tf), name='hist_grad_W', borrow=True)
        self.hist_grad_b = theano.shared(np.zeros(n_out, dtype=tf), name='hist_grad_b', borrow=True)
        
        self.hist_grads = [self.hist_grad_W, self.hist_grad_b]
        
        # output
        lin_out = T.dot(self.input, self.W) + self.b
        if T.gt(self.dropout, 0):
            mask = 1.0 / (1.0 - self.dropout) * T.cast(rng_CUDA.uniform(size=(n_out, )) > self.dropout, dtype=tf)
            lin_out = lin_out * mask
        self.output = T.maximum(lin_out, 0)
        self.L2_sq = T.sum(T.sqr(self.W)) + T.sum(T.sqr(self.b))
        
    def activation(self, X):
        output = T.maximum(T.dot(X, self.W) + self.b, 0)
        return output

In [10]:
n_units = 1200

lam = T.scalar(name='lambda')
delta = T.scalar(name='delta')

layer0 = HiddenReluLayer(rng, rng_CUDA, input=X_batch, n_in=n_feats, n_out=n_units, dropout=delta)

layer1 = HiddenReluLayer(rng, rng_CUDA, input=layer0.output, n_in=n_units, n_out=n_units, dropout=delta)

layer2 = HiddenReluLayer(rng, rng_CUDA, input=layer1.output, n_in=n_units, n_out=n_units, dropout=delta)

layer3 = LogisticRegressionLayer(rng, input=layer2.output, n_in=n_units, n_out=n_tags)

layers = [layer0, layer1, layer2, layer3]

In [11]:
params = []
hist_grads = []

for layer in layers:
    params += layer.params
    hist_grads += layer.hist_grads

In [12]:
cost = layers[-1].neg_log_likeli(y_batch)

for layer in layers:
    cost = cost + lam * layer.L2_sq

grads = T.grad(cost, params)

In [13]:
eta = T.scalar(name='eta')
  
eps = 1e-8

updates = []

for param_i, grad_i, hist_grad_i in zip(params, grads, hist_grads):
    new_hist_grad_i = hist_grad_i + T.sqr(grad_i)
    updates.append((hist_grad_i, new_hist_grad_i))
    updates.append((param_i, param_i - eta / (eps + T.sqrt(new_hist_grad_i)) * grad_i))
    
update_f = theano.function(inputs=[eta, lam, delta], outputs=[layers[-1].linout, cost], updates=updates)

In [14]:
def _predict(X, y):
    cost = 0
    h = X
    for layer in layers[:-1]:
        h = h.dot(layer.W.get_value()) + layer.b.get_value()
        h = np.maximum(h, 0)
        cost += lam * (np.sum(layer.W.get_value()**2) + np.sum(layer.b.get_value()**2))
    h = h.dot(layers[-1].W.get_value()) + layers[-1].b.get_value()
    cost += np.mean(np.log(1 + np.exp(-y * h)))
    return h, cost

In [15]:
cost_train = []
cost_test = []
acc_train = []
acc_test = []

In [16]:
eta = .05
lam = 0
delta = 0.25

In [24]:
nepochs = 50
print 'epoch\tcost.train\tcost.test\ttrain.acc\ttest.acc'
sys.stdout.flush()

for epoch in xrange(nepochs):
    cost_batch = []
    acc_batch = []
    # Execute stochastic gradient updates
    for i in xrange(0, n_samples, batch_size):
        inds = np.arange(i, min(i + batch_size, n_samples), dtype=int)
        X_batch.set_value(X_train[inds].astype(tf))
        y_batch.set_value(y_train[inds].astype(np.int32))

        pred, cost = update_f(eta, lam, delta)
        cost_batch.append(cost)
        acc_batch.append(1 - np.mean(np.logical_xor(pred > 0, y_train[inds] > 0)))

    cost_train.append(np.mean(cost_batch))
    acc_train.append(np.mean(acc_batch))
    
    pred_test, cost = _predict(X_test, y_test)
    cost_test.append(cost)
    acc_test.append(1 - np.mean(np.logical_xor(pred_test > 0, y_test > 0)))
    
    pred_test = 1. / (1 + np.exp(-pred_test))
            
    print('%d\t%.3e\t%.3e\t%.3e\t%.3e' % (epoch, cost_train[-1], cost_test[-1], acc_train[-1], acc_test[-1]))
    sys.stdout.flush()


epoch	cost.train	cost.test	train.acc	test.acc
0	4.196e-02	1.654e-01	9.890e-01	9.550e-01
1	4.189e-02	1.663e-01	9.890e-01	9.550e-01
2	4.180e-02	1.668e-01	9.890e-01	9.550e-01
3	4.176e-02	1.658e-01	9.890e-01	9.551e-01
4	4.167e-02	1.669e-01	9.890e-01	9.550e-01
5	4.162e-02	1.677e-01	9.890e-01	9.550e-01
6	4.160e-02	1.673e-01	9.890e-01	9.551e-01
7	4.158e-02	1.672e-01	9.890e-01	9.550e-01
8	4.149e-02	1.670e-01	9.890e-01	9.550e-01
9	4.145e-02	1.680e-01	9.890e-01	9.551e-01
10	4.142e-02	1.684e-01	9.890e-01	9.551e-01
11	4.137e-02	1.696e-01	9.890e-01	9.550e-01
12	4.132e-02	1.669e-01	9.890e-01	9.551e-01
13	4.132e-02	1.713e-01	9.890e-01	9.550e-01
14	4.126e-02	1.693e-01	9.890e-01	9.550e-01
15	4.116e-02	1.694e-01	9.890e-01	9.551e-01
16	4.117e-02	1.692e-01	9.890e-01	9.551e-01
17	4.113e-02	1.701e-01	9.891e-01	9.550e-01
18	4.110e-02	1.702e-01	9.890e-01	9.550e-01
19	4.103e-02	1.690e-01	9.891e-01	9.551e-01
20	4.102e-02	1.711e-01	9.891e-01	9.550e-01
21	4.102e-02	1.691e-01	9.891e-01	9.551e-01
22	4.093e-02	1.694e-01	9.891e-01	9.551e-01
23	4.092e-02	1.687e-01	9.891e-01	9.551e-01
24	4.089e-02	1.690e-01	9.891e-01	9.551e-01
25	4.087e-02	1.702e-01	9.891e-01	9.551e-01
26	4.077e-02	1.691e-01	9.891e-01	9.551e-01
27	4.075e-02	1.702e-01	9.891e-01	9.551e-01
28	4.074e-02	1.700e-01	9.891e-01	9.551e-01
29	4.073e-02	1.702e-01	9.891e-01	9.552e-01
30	4.073e-02	1.699e-01	9.891e-01	9.551e-01
31	4.065e-02	1.694e-01	9.891e-01	9.552e-01
32	4.061e-02	1.697e-01	9.891e-01	9.552e-01
33	4.056e-02	1.699e-01	9.891e-01	9.551e-01
34	4.055e-02	1.709e-01	9.891e-01	9.551e-01
35	4.058e-02	1.720e-01	9.891e-01	9.551e-01
36	4.052e-02	1.718e-01	9.891e-01	9.551e-01
37	4.050e-02	1.705e-01	9.891e-01	9.551e-01
38	4.047e-02	1.688e-01	9.891e-01	9.552e-01
39	4.045e-02	1.703e-01	9.891e-01	9.552e-01
40	4.042e-02	1.728e-01	9.891e-01	9.551e-01
41	4.037e-02	1.721e-01	9.891e-01	9.551e-01
42	4.039e-02	1.723e-01	9.891e-01	9.551e-01
43	4.034e-02	1.718e-01	9.891e-01	9.551e-01
44	4.028e-02	1.719e-01	9.891e-01	9.551e-01
45	4.029e-02	1.706e-01	9.891e-01	9.552e-01
46	4.023e-02	1.727e-01	9.892e-01	9.551e-01
47	4.024e-02	1.702e-01	9.892e-01	9.552e-01
48	4.018e-02	1.718e-01	9.892e-01	9.552e-01
49	4.020e-02	1.709e-01	9.892e-01	9.552e-01

In [25]:
plt.plot(acc_train)
plt.plot(acc_test)


Out[25]:
[<matplotlib.lines.Line2D at 0xab95f10>]

Evaluation


In [26]:
# compute evaluation metrics
def construct_pred_mask(tags_predicted, predictat):
    n_samples, n_tags = tags_predicted.shape
    rankings = np.argsort(-tags_predicted, axis=1)[:, :predictat]
    tags_predicted_binary = np.zeros_like(tags_predicted, dtype=bool)
    for i in xrange(n_samples):
        tags_predicted_binary[i, rankings[i]] = 1
    return tags_predicted_binary

def per_tag_prec_recall(tags_predicted_binary, tags_true_binary):
    mask = np.logical_and(tags_predicted_binary, tags_true_binary)
    prec = mask.sum(axis=0) / (tags_predicted_binary.sum(axis=0) + np.spacing(1))
    tags_true_count = tags_true_binary.sum(axis=0).astype(float)
    idx = (tags_true_count > 0)
    recall = mask.sum(axis=0)[idx] / tags_true_count[idx]
    return prec, recall


def aroc_ap(tags_true_binary, tags_predicted):
    n_tags = tags_true_binary.shape[1]
    
    auc = list()
    aprec = list()
    for i in xrange(n_tags):
        if np.sum(tags_true_binary[:, i]) != 0:
            auc.append(roc_auc_score(tags_true_binary[:, i], tags_predicted[:, i]))
            aprec.append(average_precision_score(tags_true_binary[:, i], tags_predicted[:, i]))
    return auc, aprec


def print_out_metrics(tags_true_binary, tags_predicted, predictat):
    tags_predicted_binary = construct_pred_mask(tags_predicted, predictat)
    prec, recall = per_tag_prec_recall(tags_predicted_binary, tags_true_binary)
    mprec, mrecall = np.mean(prec), np.mean(recall)
    
    print 'Precision = %.3f (%.3f)' % (mprec, np.std(prec) / sqrt(prec.size))
    print 'Recall = %.3f (%.3f)' % (mrecall, np.std(recall) / sqrt(recall.size))
    print 'F-score = %.3f' % (2 * mprec * mrecall / (mprec + mrecall))

    auc, aprec = aroc_ap(tags_true_binary, tags_predicted)
    print 'AROC = %.3f (%.3f)' % (np.mean(auc), np.std(auc) / sqrt(len(auc)))
    print 'AP = %.3f (%.3f)' % (np.mean(aprec), np.std(aprec) / sqrt(len(aprec)))

In [27]:
tags_predicted = pred_test
print tags_predicted.min(), tags_predicted.max()

div_factor = 1.25
tags_predicted = tags_predicted - div_factor * np.mean(tags_predicted, axis=0)


7.20792e-14 0.911772

In [26]:
# results with dropout = 0.5 (50 epochs)
predictat = 20
tags_true_binary = (y_test > 0)

print_out_metrics(tags_true_binary, tags_predicted, predictat)


Precision = 0.184 (0.009)
Recall = 0.207 (0.009)
F-score = 0.195
AROC = 0.781 (0.005)
AP = 0.178 (0.007)

In [21]:
# results with dropout = 0.25 (50 epochs)
predictat = 20
tags_true_binary = (y_test > 0)

print_out_metrics(tags_true_binary, tags_predicted, predictat)


Precision = 0.192 (0.009)
Recall = 0.212 (0.009)
F-score = 0.202
AROC = 0.777 (0.005)
AP = 0.177 (0.007)

In [29]:
# results with dropout = 0.25 (100 epochs)
predictat = 20
tags_true_binary = (y_test > 0)

print_out_metrics(tags_true_binary, tags_predicted, predictat)


Precision = 0.193 (0.008)
Recall = 0.215 (0.009)
F-score = 0.203
AROC = 0.775 (0.005)
AP = 0.172 (0.007)

In [ ]: