In [1]:
%matplotlib inline
In [2]:
import sys
import numpy as np
import matplotlib.pyplot as plt
import os
os.environ['THEANO_FLAGS']='floatX=float32,device=gpu,nvcc.fastmath=True'
import theano
import theano.tensor as T
tf = theano.config.floatX
from math import sqrt
from scipy import io
from sklearn.metrics import roc_auc_score, average_precision_score
In [3]:
K = 1024
In [4]:
DATA_DIR = '/dataeast/dawen/'
In [5]:
X_train = np.load(os.path.join(DATA_DIR, 'X_train_K%d.npy' % K))
X_test = np.load(os.path.join(DATA_DIR, 'X_test_K%d.npy' % K))
y_train = np.load(os.path.join(DATA_DIR, 'y_train.npy'))
y_test = np.load(os.path.join(DATA_DIR, 'y_test.npy'))
# recode binary tags as +/-1 for the logistic loss below
y_train[y_train == 0] = -1
y_test[y_test == 0] = -1
In [6]:
# normalize each feature vector to sum to one (keepdims version, numpy >= 1.7):
#X_train = X_train / np.sum(X_train, axis=1, keepdims=True).astype(tf)
#X_test = X_test / np.sum(X_test, axis=1, keepdims=True).astype(tf)
# numpy 1.6 lacks keepdims, so broadcast manually
X_train = X_train / (np.sum(X_train, axis=1)[:, None]).astype(tf)
X_test = X_test / (np.sum(X_test, axis=1)[:, None]).astype(tf)
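A quick sanity check, not part of the original run: after normalization every feature vector should sum to one (up to float32 precision).
In [ ]:
# hypothetical check: rows were normalized to sum to one above
assert np.allclose(X_train.sum(axis=1), 1.0, atol=1e-4)
assert np.allclose(X_test.sum(axis=1), 1.0, atol=1e-4)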
In [7]:
from theano.sandbox.cuda.rng_curand import CURAND_RandomStreams

rng = np.random.RandomState(seed=98765)
rng_CUDA = CURAND_RandomStreams(98765)
In [8]:
n_samples, n_feats = X_train.shape
n_tags = y_train.shape[1]
batch_size = 100
# stage minibatches through shared variables so each batch lives on the GPU
X_batch = theano.shared(np.zeros((batch_size, n_feats), dtype=tf), 'X_batch', borrow=True)
y_batch = theano.shared(np.zeros((batch_size, n_tags), dtype=np.int32), 'y_batch', borrow=True)
In [9]:
class LogisticRegressionLayer(object):
    def __init__(self, rng, input, n_in, n_out):
        # input and params
        self.input = input
        self.W = theano.shared(0.01 * rng.randn(n_in, n_out).astype(tf), name='W', borrow=True)
        self.b = theano.shared(0.01 * rng.randn(n_out).astype(tf), name='b', borrow=True)
        self.params = [self.W, self.b]
        # AdaGrad accumulators (running sums of squared gradients)
        self.hist_grad_W = theano.shared(np.zeros((n_in, n_out), dtype=tf), name='hist_grad_W', borrow=True)
        self.hist_grad_b = theano.shared(np.zeros(n_out, dtype=tf), name='hist_grad_b', borrow=True)
        self.hist_grads = [self.hist_grad_W, self.hist_grad_b]
        # output
        self.linout = T.dot(self.input, self.W) + self.b
        self.L2_sq = T.sum(T.sqr(self.W)) + T.sum(T.sqr(self.b))

    def neg_log_likeli(self, y):
        # logistic loss for +/-1 labels: softplus(-y * f) = log(1 + exp(-y * f))
        return T.mean(T.nnet.softplus(T.neg(y * self.linout)))


class HiddenReluLayer(object):
    def __init__(self, rng, rng_CUDA, input, n_in, n_out, dropout):
        # input and params
        self.input = input
        self.dropout = dropout
        self.W = theano.shared(value=0.01 * rng.randn(n_in, n_out).astype(tf), name='W', borrow=True)
        self.b = theano.shared(value=0.01 * rng.randn(n_out).astype(tf), name='b', borrow=True)
        self.params = [self.W, self.b]
        # AdaGrad accumulators (running sums of squared gradients)
        self.hist_grad_W = theano.shared(np.zeros((n_in, n_out), dtype=tf), name='hist_grad_W', borrow=True)
        self.hist_grad_b = theano.shared(np.zeros(n_out, dtype=tf), name='hist_grad_b', borrow=True)
        self.hist_grads = [self.hist_grad_W, self.hist_grad_b]
        # output: inverted dropout -- scale by 1 / (1 - p) at training time so no
        # rescaling is needed at test time. The mask is applied unconditionally:
        # a Python-level `if T.gt(self.dropout, 0)` would always be truthy on a
        # symbolic scalar, and with dropout == 0 the mask is all ones (almost
        # surely) anyway. Note the same mask is shared across the minibatch.
        lin_out = T.dot(self.input, self.W) + self.b
        mask = 1.0 / (1.0 - self.dropout) * T.cast(rng_CUDA.uniform(size=(n_out,)) > self.dropout, dtype=tf)
        lin_out = lin_out * mask
        self.output = T.maximum(lin_out, 0)
        self.L2_sq = T.sum(T.sqr(self.W)) + T.sum(T.sqr(self.b))

    def activation(self, X):
        # deterministic (no-dropout) forward pass for evaluation
        return T.maximum(T.dot(X, self.W) + self.b, 0)
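Two things above are easy to misread. First, the dropout mask uses the inverted-dropout convention, so `activation` needs no rescaling at test time. Second, `neg_log_likeli` is the standard logistic loss for +/-1 labels: softplus(-y * f) = -log sigmoid(y * f). A minimal numpy check of that identity (hypothetical cell, made-up numbers):
In [ ]:
# hypothetical check: softplus(-y * f) equals -log(sigmoid(y * f))
f = np.array([-2., 0.3, 1.5])
y = np.array([1., -1., 1.])
softplus_loss = np.logaddexp(0, -y * f)       # log(1 + exp(-y * f))
nll = -np.log(1. / (1. + np.exp(-y * f)))     # -log sigmoid(y * f)
print np.allclose(softplus_loss, nll)         # True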
In [10]:
n_units = 1200
lam = T.scalar(name='lambda')
delta = T.scalar(name='delta')
# three dropout-ReLU hidden layers topped by independent per-tag logistic regressions
layer0 = HiddenReluLayer(rng, rng_CUDA, input=X_batch, n_in=n_feats, n_out=n_units, dropout=delta)
layer1 = HiddenReluLayer(rng, rng_CUDA, input=layer0.output, n_in=n_units, n_out=n_units, dropout=delta)
layer2 = HiddenReluLayer(rng, rng_CUDA, input=layer1.output, n_in=n_units, n_out=n_units, dropout=delta)
layer3 = LogisticRegressionLayer(rng, input=layer2.output, n_in=n_units, n_out=n_tags)
layers = [layer0, layer1, layer2, layer3]
In [11]:
params = []
hist_grads = []
for layer in layers:
    params += layer.params
    hist_grads += layer.hist_grads
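As a rough sense of scale (hypothetical cell): with K = 1024 inputs, three 1200-unit hidden layers, and n_tags outputs, nearly all of the parameters sit in the four weight matrices.
In [ ]:
# hypothetical: count trainable parameters
print sum(np.prod(p.get_value().shape) for p in params)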
In [12]:
# total cost: logistic NLL plus an L2 penalty on every layer's parameters
cost = layers[-1].neg_log_likeli(y_batch)
for layer in layers:
    cost = cost + lam * layer.L2_sq
grads = T.grad(cost, params)
In [13]:
eta = T.scalar(name='eta')
eps = 1e-8
updates = []
# AdaGrad: scale each parameter's step by the root of its accumulated squared gradients
for param_i, grad_i, hist_grad_i in zip(params, grads, hist_grads):
    new_hist_grad_i = hist_grad_i + T.sqr(grad_i)
    updates.append((hist_grad_i, new_hist_grad_i))
    updates.append((param_i, param_i - eta / (eps + T.sqrt(new_hist_grad_i)) * grad_i))
update_f = theano.function(inputs=[eta, lam, delta], outputs=[layers[-1].linout, cost], updates=updates)
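The updates above are plain AdaGrad: each coordinate's step is eta / (eps + sqrt(accumulated squared gradient)), so coordinates with persistently large gradients take proportionally smaller steps. A minimal numpy sketch of one step, with made-up numbers:
In [ ]:
# hypothetical one-step AdaGrad illustration mirroring the symbolic updates
w = np.zeros(3, dtype=tf)
hist = np.zeros(3, dtype=tf)
g = np.array([0.1, -2.0, 0.0], dtype=tf)    # a made-up gradient
hist += g ** 2
w -= 0.05 / (1e-8 + np.sqrt(hist)) * g
print w    # both nonzero coordinates move by the same magnitude, 0.05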
In [14]:
def _predict(X, y):
    # deterministic forward pass in numpy; inverted dropout at training time
    # means no rescaling is needed here
    cost = 0
    h = X
    for layer in layers[:-1]:
        h = np.maximum(h.dot(layer.W.get_value()) + layer.b.get_value(), 0)
        cost += lam * (np.sum(layer.W.get_value()**2) + np.sum(layer.b.get_value()**2))
    h = h.dot(layers[-1].W.get_value()) + layers[-1].b.get_value()
    # include the output layer's L2 term to match the training objective
    cost += lam * (np.sum(layers[-1].W.get_value()**2) + np.sum(layers[-1].b.get_value()**2))
    # logaddexp(0, x) computes log(1 + exp(x)) without overflowing for large x
    cost += np.mean(np.logaddexp(0, -y * h))
    return h, cost
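The `np.logaddexp(0, x)` form above is deliberate: the naive `np.log(1 + np.exp(x))` overflows to inf once x exceeds roughly 700, which can happen as the linear outputs grow during training. A quick illustration:
In [ ]:
# naive softplus overflows for large inputs; logaddexp does not
x = 800.
print np.log(1 + np.exp(x))    # inf (with an overflow warning)
print np.logaddexp(0, x)       # 800.0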
In [15]:
cost_train = []
cost_test = []
acc_train = []
acc_test = []
In [16]:
# concrete hyperparameter values; these names shadow the symbolic scalars above,
# but the compiled function still takes them as its inputs
eta = .05
lam = 0
delta = 0.25
In [24]:
nepochs = 50
print 'epoch\tcost.train\tcost.test\ttrain.acc\ttest.acc'
sys.stdout.flush()
for epoch in xrange(nepochs):
    cost_batch = []
    acc_batch = []
    # one pass of stochastic gradient updates over the training set
    for i in xrange(0, n_samples, batch_size):
        inds = np.arange(i, min(i + batch_size, n_samples), dtype=int)
        X_batch.set_value(X_train[inds].astype(tf))
        y_batch.set_value(y_train[inds].astype(np.int32))
        pred, cost = update_f(eta, lam, delta)
        cost_batch.append(cost)
        # accuracy = sign agreement between the linear output and the +/-1 labels
        acc_batch.append(1 - np.mean(np.logical_xor(pred > 0, y_train[inds] > 0)))
    cost_train.append(np.mean(cost_batch))
    acc_train.append(np.mean(acc_batch))
    pred_test, cost = _predict(X_test, y_test)
    cost_test.append(cost)
    acc_test.append(1 - np.mean(np.logical_xor(pred_test > 0, y_test > 0)))
    # squash the test-set linear outputs through a sigmoid for the ranking metrics below
    pred_test = 1. / (1 + np.exp(-pred_test))
    print('%d\t%.3e\t%.3e\t%.3e\t%.3e' % (epoch, cost_train[-1], cost_test[-1], acc_train[-1], acc_test[-1]))
    sys.stdout.flush()
In [25]:
plt.plot(acc_train, label='train')
plt.plot(acc_test, label='test')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(loc='best')
Out[25]:
<matplotlib figure: training and test accuracy per epoch>
In [26]:
# compute evaluation metrics
def construct_pred_mask(tags_predicted, predictat):
    # binary mask marking each sample's top-`predictat` scoring tags
    n_samples, n_tags = tags_predicted.shape
    rankings = np.argsort(-tags_predicted, axis=1)[:, :predictat]
    tags_predicted_binary = np.zeros_like(tags_predicted, dtype=bool)
    for i in xrange(n_samples):
        tags_predicted_binary[i, rankings[i]] = True
    return tags_predicted_binary


def per_tag_prec_recall(tags_predicted_binary, tags_true_binary):
    mask = np.logical_and(tags_predicted_binary, tags_true_binary)
    prec = mask.sum(axis=0) / (tags_predicted_binary.sum(axis=0) + np.spacing(1))
    tags_true_count = tags_true_binary.sum(axis=0).astype(float)
    # recall is only defined for tags that actually occur in the test set
    idx = (tags_true_count > 0)
    recall = mask.sum(axis=0)[idx] / tags_true_count[idx]
    return prec, recall


def aroc_ap(tags_true_binary, tags_predicted):
    # per-tag area under the ROC curve and average precision,
    # skipping tags with no positive examples
    n_tags = tags_true_binary.shape[1]
    auc = list()
    aprec = list()
    for i in xrange(n_tags):
        if np.sum(tags_true_binary[:, i]) != 0:
            auc.append(roc_auc_score(tags_true_binary[:, i], tags_predicted[:, i]))
            aprec.append(average_precision_score(tags_true_binary[:, i], tags_predicted[:, i]))
    return auc, aprec


def print_out_metrics(tags_true_binary, tags_predicted, predictat):
    tags_predicted_binary = construct_pred_mask(tags_predicted, predictat)
    prec, recall = per_tag_prec_recall(tags_predicted_binary, tags_true_binary)
    mprec, mrecall = np.mean(prec), np.mean(recall)
    # report means with standard errors in parentheses
    print 'Precision = %.3f (%.3f)' % (mprec, np.std(prec) / sqrt(prec.size))
    print 'Recall = %.3f (%.3f)' % (mrecall, np.std(recall) / sqrt(recall.size))
    print 'F-score = %.3f' % (2 * mprec * mrecall / (mprec + mrecall))
    auc, aprec = aroc_ap(tags_true_binary, tags_predicted)
    print 'AROC = %.3f (%.3f)' % (np.mean(auc), np.std(auc) / sqrt(len(auc)))
    print 'AP = %.3f (%.3f)' % (np.mean(aprec), np.std(aprec) / sqrt(len(aprec)))
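A toy run of the helpers above on hypothetical data: two samples, three tags, predicting each sample's single top tag.
In [ ]:
# hypothetical toy example for the metric helpers
scores = np.array([[0.9, 0.1, 0.4],
                   [0.2, 0.8, 0.3]])
truth = np.array([[1, 0, 1],
                  [0, 1, 0]], dtype=bool)
mask = construct_pred_mask(scores, 1)
prec, recall = per_tag_prec_recall(mask, truth)
print prec      # per-tag precision: [ 1.  1.  0.]
print recall    # per-tag recall:    [ 1.  1.  0.]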
In [27]:
tags_predicted = pred_test
print tags_predicted.min(), tags_predicted.max()
# shift each tag's scores by a multiple of its column mean so that tags which
# score uniformly high do not dominate every top-k list
div_factor = 1.25
tags_predicted = tags_predicted - div_factor * np.mean(tags_predicted, axis=0)
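Why the shift helps (hypothetical illustration): a tag that scores high for every sample would otherwise crowd into every top-k list, and subtracting a multiple of its column mean rebalances the comparison across tags.
In [ ]:
# hypothetical: tag 0 scores high for both samples, tag 1 only for sample 1
scores = np.array([[0.9, 0.2],
                   [0.9, 0.8]])
shifted = scores - 1.25 * scores.mean(axis=0)
print np.argsort(-scores, axis=1)[:, 0]     # [0 0] -- tag 0 always ranks first
print np.argsort(-shifted, axis=1)[:, 0]    # [0 1] -- sample 1 now prefers tag 1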
In [26]:
# results with dropout = 0.5 (50 epochs)
predictat = 20
tags_true_binary = (y_test > 0)
print_out_metrics(tags_true_binary, tags_predicted, predictat)
In [21]:
# results with dropout = 0.25 (50 epochs)
predictat = 20
tags_true_binary = (y_test > 0)
print_out_metrics(tags_true_binary, tags_predicted, predictat)
In [29]:
# results with dropout = 0.25 (100 epochs)
predictat = 20
tags_true_binary = (y_test > 0)
print_out_metrics(tags_true_binary, tags_predicted, predictat)
In [ ]: