Author: Justin Tan
Bidirectional RNN model for rare decay identification, implemented in TensorFlow.
June update: Multi-GPU support
In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import time, os, json
class config(object):
# Set network parameters
    # Empirically, network depth matters more than layer width (output dimension)
mode = 'kst'
channel = 'rho0'
n_particles = 5
n_features = 100
    seq_length = n_features // n_particles
    rnn_cell = 'gru'  # one of 'lstm', 'gru', 'layer-norm'
hidden_units = 256 # Number of neurons per RNN Cell
keep_prob = 1.0
input_keep_prob = 0.9
recurrent_keep_prob = 0.9
num_epochs = 64
batch_size = 512
n_layers = 3 # Note: 3 layers is considered 'deep'
learning_rate = 1e-3
lr_epoch_decay = 0.999
ema_decay = 0.999
n_classes = 2
n_gpus = 4
class directories(object):
data = 'data'
tensorboard = 'tensorboard'
checkpoints = 'checkpoints'
samples = 'samples'
architecture = '{} - {} | Base cell: {} | Hidden units: {} | Layers: {} | Batch: {} | Epochs: {}'.format(
config.channel, config.mode, config.rnn_cell, config.hidden_units, config.n_layers, config.batch_size, config.num_epochs)
class reader():
def __init__(self, df):
self.df = df
self.batch_size = config.batch_size
self.steps_per_epoch = len(df) // config.batch_size
self.epochs = 0
self.proceed = True
self.shuffle()
def shuffle(self):
self.df = self.df.sample(frac=1).reset_index(drop=True)
self.df_X = self.df.drop('Labels', axis = 1)
self.df_y = self.df['Labels']
self.pointer = 0
    def next_batch(self, batch_size):
        if self.pointer + 1 >= self.steps_per_epoch:
            # Final step of the epoch: return the remaining rows, then reshuffle
            inputs = self.df_X.iloc[self.pointer*batch_size:]
            targets = self.df_y.iloc[self.pointer*batch_size:]
            self.epochs += 1
            self.shuffle()
            self.proceed = False
            return inputs, targets
        inputs = self.df_X.iloc[self.pointer*batch_size:(self.pointer+1)*batch_size]
        targets = self.df_y.iloc[self.pointer*batch_size:(self.pointer+1)*batch_size]
        self.pointer += 1
        return inputs, targets
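# Minimal sketch of the reader epoch protocol on a toy frame (illustrative
# only; temporarily overrides config.batch_size purely for this demo):
_saved_bs, config.batch_size = config.batch_size, 2
_toy = pd.DataFrame(np.random.randn(6, 3), columns=['f1', 'f2', 'Labels'])
_rdr = reader(_toy)
while _rdr.proceed:
    xb, yb = _rdr.next_batch(config.batch_size)
    print(xb.shape, yb.shape)  # two full batches, then the remaining tail
config.batch_size = _saved_bs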
def save_summary(config, delta_t, train_acc, test_acc = None):
import json
summary = {
'Timestamp': time.strftime('%c'),
'Base cell': config.rnn_cell,
'Hidden units': config.hidden_units,
'Layers': config.n_layers,
'Batch_size': config.batch_size,
'Seq_length': config.seq_length,
'Dropout': config.keep_prob,
'Epochs': config.num_epochs,
'Time': delta_t,
'Final train acc': train_acc,
'Final test acc': test_acc
}
# Writing JSON data
    if os.path.isfile('rnn_summary.json'):
        with open('rnn_summary.json', 'r+') as f:
            summaries = json.load(f)
        summaries.append(summary)
        with open('rnn_summary.json', 'w') as f:
            json.dump(summaries, f, indent = 4)
    else:
        with open('rnn_summary.json', 'w') as f:
            json.dump([summary], f, indent = 4)
def p_ordering(df):
    # Drop measurement-error columns, then order particle blocks by mean momentum
df = df.drop([column for column in df.columns if column.endswith('Err')], axis = 1)
labels = df['Labels']
blocks = np.split(df.drop('Labels', axis = 1), config.n_particles, axis = 1)
cols_p = [column for column in df.columns if column.endswith('cms_p')]
p_mean = [df[column].mean() for column in cols_p]
p_ordered_frames = [blocks[i] for i in np.argsort(p_mean)]
p_ordered_frames.append(labels)
df_p_ordered = pd.concat(p_ordered_frames, axis = 1)
return df_p_ordered
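# Quick check of p_ordering on a toy frame (hypothetical column names; one
# momentum column per particle, whereas the real data carries more features
# per particle block):
_cols = ['K_cms_p', 'pi_cms_p', 'gam_cms_p', 'e_cms_p', 'mu_cms_p']
_toy_p = pd.DataFrame(np.random.rand(4, config.n_particles), columns=_cols)
_toy_p['Labels'] = [0, 1, 0, 1]
print(p_ordering(_toy_p).columns.tolist())  # blocks sorted by mean momentum, Labels last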
def load_data(file_name, test_size = 0.05):
from sklearn.model_selection import train_test_split
df = pd.read_hdf(file_name, 'df')
df = p_ordering(df)
df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(df.drop('Labels', axis = 1),
df['Labels'], test_size = test_size, random_state=42)
return df_X_train, df_X_test, df_y_train, df_y_test
def plot_ROC_curve(network_output, y_true, meta = ''):
# import matplotlib as mpl
# mpl.use('pgf')
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc
y_score = network_output[:,1]
# Compute ROC curve, integrate
fpr, tpr, thresholds = roc_curve(y_true, y_score)
roc_auc = auc(fpr, tpr)
plt.figure()
plt.axes([.1,.1,.8,.7])
plt.figtext(.5,.9, r'$\mathrm{Receiver \;Operating \;Characteristic}$', fontsize=15, ha='center')
plt.figtext(.5,.85, meta, fontsize=10,ha='center')
plt.plot(fpr, tpr, color='darkorange',
lw=2, label='ROC (area = %0.3f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=1.0, linestyle='--')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel(r'$\mathrm{False \;Positive \;Rate}$')
plt.ylabel(r'$\mathrm{True \;Positive \;Rate}$')
plt.legend(loc="lower right")
plt.savefig(os.path.join('graphs', '{}_{}_ROC.pdf'.format(config.channel, config.mode)), format='pdf', dpi=1000)
#plt.savefig(os.path.join('graphs', '{}_{}_ROC.pgf'.format(config.channel, config.mode)), format='pgf', dpi=1000)
print('AUC: {:.4f}'.format(roc_auc))
plt.show()
plt.gcf().clear()
In [2]:
test_file = '/data/projects/punim0011/jtan/data/rnn/rnn_B02rho0gamma_kst.h5'
assert config.batch_size % config.n_gpus == 0, 'Batch size must be divisible by number of GPUs'
df_X_train, df_X_test, df_y_train, df_y_test = load_data(test_file)
df_train = pd.concat([df_X_train, df_y_train], axis = 1)
df_test = pd.concat([df_X_test, df_y_test], axis = 1)
config.n_features = df_train.shape[1] - 1
config.seq_length = config.n_features//config.n_particles
config.steps_per_epoch = len(df_X_train) // config.batch_size
assert config.seq_length == config.n_features/config.n_particles, 'Discrepancy in input feature dimension'
readerTrain = reader(df_train)
readerTest = reader(df_test)
In [3]:
def layer_weights(shape, name = 'weights'):
# Return weight tensor of given shape using Xavier initialization
W = tf.get_variable(name, shape = shape, initializer=tf.contrib.layers.xavier_initializer())
return W
def layer_biases(shape, name = 'biases'):
# Return bias tensor of given shape with small initialized constant value
b = tf.get_variable(name, shape = shape, initializer = tf.constant_initializer(0.01))
return b
def BN_layer_ops(x, shape, name, keep_prob, phase, activation=tf.nn.relu):
# High-level implementation of BN
with tf.variable_scope(name) as scope:
# scope.reuse_variables() # otherwise tf.get_variable() checks that already existing vars are not shared by accident
weights = layer_weights(shape = shape)
biases = layer_biases(shape = [shape[1]])
z_BN = tf.matmul(x, weights) + biases
        # Place the BN transform before the non-linearity (TODO: update to TF 1.2)
        theta_BN = tf.contrib.layers.batch_norm(z_BN, center=True, scale=True, is_training=phase,
                                                decay=0.99, zero_debias_moving_mean=True, scope='bn', fused=True)
BN_actv = activation(theta_BN)
BN_layer_output = tf.nn.dropout(BN_actv, keep_prob)
return BN_layer_output
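# `readout_ops` is called by build_network below but never defined in this
# notebook. A minimal affine readout layer is sketched here, under the
# assumption that the readout is a plain linear map to class logits:
def readout_ops(x, shape, name):
    # Affine map to unnormalized class logits (no non-linearity)
    with tf.variable_scope(name):
        weights = layer_weights(shape = shape)
        biases = layer_biases(shape = [shape[1]])
        return tf.matmul(x, weights) + biases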
def build_network(x, n_layers, hidden_layer_nodes, keep_prob, training_phase):
assert n_layers == len(hidden_layer_nodes), 'Specified layer nodes and number of layers do not correspond.'
layers = [x]
with tf.variable_scope('BN_layers') as scope:
hidden_1 = BN_layer_ops(x, shape = [config.n_features, hidden_layer_nodes[0]], name = 'BNhidden0',
keep_prob = keep_prob, phase = training_phase)
layers.append(hidden_1)
for n in range(0,n_layers-1):
hidden_n = BN_layer_ops(layers[-1], shape = [hidden_layer_nodes[n], hidden_layer_nodes[n+1]], name = 'BNhidden{}'.format(n+1),
keep_prob = keep_prob, phase = training_phase)
layers.append(hidden_n)
readout = readout_ops(layers[-1], shape = [hidden_layer_nodes[-1], config.n_classes], name = 'readout')
return readout
def average_gradients(tower_grads):
""" Calculate the average gradient for each shared variable across all towers.
Args:
tower_grads: Nested list of (gradient, variable) tuples. The outer list
is over individual gradients. The inner list is over the gradient
calculation for each tower.
Returns:
List of pairs of (gradient, variable) where the gradient has been averaged
across all towers.
"""
average_grads = []
for grad_var_pair in zip(*tower_grads):
        # Each grad_var_pair looks like the following:
        # ((grad0_gpu0, var0_gpu0), ..., (grad0_gpuN, var0_gpuN))
grads = []
for g, _ in grad_var_pair:
# Add 0 dimension to the gradients to represent the tower.
expanded_g = tf.expand_dims(g, 0)
# Append on a 'tower' dimension which we will average over below.
grads.append(expanded_g)
# Average over the 'tower' dimension.
grad = tf.concat(axis=0, values=grads)
grad = tf.reduce_mean(grad, 0)
# Keep in mind that the Variables are redundant because they are shared
# across towers. So just return the first tower's pointer to
# the Variable.
v = grad_var_pair[0][1]
gv_pair = (grad, v)
average_grads.append(gv_pair)
return average_grads
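# Tiny sanity check of average_gradients (illustrative only; creates a
# throwaway, non-trainable variable in the default graph). Two towers report
# gradients 1.0 and 3.0 for the same shared variable, so the synchronized
# gradient should come out as 2.0:
_v = tf.get_variable('avg_grad_check', shape=[], trainable=False, initializer=tf.zeros_initializer())
_avg = average_gradients([[(tf.constant(1.0), _v)], [(tf.constant(3.0), _v)]])
with tf.Session() as _sess:
    print(_sess.run(_avg[0][0]))  # expect 2.0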
In [4]:
class BiRNN():
def __init__(self, config, training = True):
# Placeholders for feed_dict
self.inputs = tf.placeholder(tf.float32, shape = [None, config.n_features])
self.targets = tf.placeholder(tf.int32, shape = [None])
self.keep_prob = tf.placeholder(tf.float32) # Dropout on input connections
self.global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)
        learning_rate = tf.train.exponential_decay(config.learning_rate, self.global_step,
                                                   decay_steps = config.steps_per_epoch, decay_rate = config.lr_epoch_decay, staircase=True)
# Reshape input to batch_size x n_particles x seq_length tensor, split batches
# evenly across gpus
rnn_inputs = tf.reshape(self.inputs, [-1, config.n_particles, config.seq_length])
rnn_input_batches = tf.split(rnn_inputs, config.n_gpus, axis = 0)
label_batches = tf.split(self.targets, config.n_gpus, axis = 0)
        opt = tf.train.AdamOptimizer(learning_rate)
# Choose rnn cell type
if config.rnn_cell == 'lstm':
args = {'num_units': config.hidden_units, 'forget_bias': 1.0, 'state_is_tuple': True}
base_cell = tf.nn.rnn_cell.LSTMCell
elif config.rnn_cell == 'gru':
args = {'num_units': config.hidden_units}
base_cell = tf.nn.rnn_cell.GRUCell
elif config.rnn_cell == 'layer-norm':
args = {'num_units': config.hidden_units, 'forget_bias': 1.0, 'dropout_keep_prob': config.recurrent_keep_prob}
base_cell = tf.contrib.rnn.LayerNormBasicLSTMCell
        else:  # Fall back to the layer-norm LSTM cell for unrecognized cell types
args = {'num_units': config.hidden_units, 'forget_bias': 1.0, 'dropout_keep_prob': config.recurrent_keep_prob}
base_cell = tf.contrib.rnn.LayerNormBasicLSTMCell
self.cell = base_cell
def tower_computation(scope, inputs, labels, n_gpu):
if training and config.input_keep_prob < 1:
rnn_inputs = tf.nn.dropout(inputs, self.keep_prob)
fwd_cells = [tf.nn.rnn_cell.DropoutWrapper(
self.cell(**args), input_keep_prob = config.input_keep_prob) for _ in range(config.n_layers)]
bwd_cells = [tf.nn.rnn_cell.DropoutWrapper(
self.cell(**args), input_keep_prob = config.input_keep_prob) for _ in range(config.n_layers)]
            else:
                rnn_inputs = inputs  # no input dropout at inference time
                fwd_cells = [self.cell(**args) for _ in range(config.n_layers)]
                bwd_cells = [self.cell(**args) for _ in range(config.n_layers)]
            fwd_init = [fwd_cell.zero_state(config.batch_size // config.n_gpus, tf.float32) for fwd_cell in fwd_cells]
            bwd_init = [bwd_cell.zero_state(config.batch_size // config.n_gpus, tf.float32) for bwd_cell in bwd_cells]
birnn_output, _, _ = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
cells_fw = fwd_cells,
cells_bw = bwd_cells,
inputs = rnn_inputs,
initial_states_fw = fwd_init,
initial_states_bw = bwd_init,
                sequence_length = np.full(config.batch_size // config.n_gpus, config.n_particles, dtype=np.int32),
parallel_iterations = 64)
            # Extract the final output of each direction: the forward pass ends at the
            # last time step, while the backward outputs are re-aligned to input time
            # order, so the backward pass ends at time step 0
            outputs = tf.split(birnn_output, 2, axis = 2)
            output_fwd = outputs[0][:,-1,:]
            output_bwd = outputs[1][:,0,:]
with tf.variable_scope('softmax'):
W_f = layer_weights(shape = [config.hidden_units, config.n_classes], name = 'smx_W_fwd')
W_b = layer_weights(shape = [config.hidden_units, config.n_classes], name = 'smx_W_bwd')
softmax_b = layer_biases(shape = [config.n_classes], name = 'smx_b')
                logits_RNN = tf.matmul(output_fwd, W_f) + tf.matmul(output_bwd, W_b) + softmax_b # Unnormalized class log-probabilities
cross_entropy = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits = logits_RNN, labels = labels))
tf.add_to_collection('losses_collection', cross_entropy)
# Assemble all of the losses for the current tower only.
losses = tf.get_collection('losses_collection', scope)
for l in losses:
tf.summary.scalar('xentropy_{}-raw'.format(n_gpu), l)
return cross_entropy, logits_RNN
# Calculate gradients for each model tower
tower_grads, tower_readouts, tower_losses, tower_summaries = [], [], [], []
for gpu in range(config.n_gpus):
with tf.device('/gpu:{}'.format(gpu)):
with tf.variable_scope('vDNN', reuse=(gpu > 0)):
with tf.name_scope('tower_{}'.format(gpu)) as scope:
# Load one batch per GPU
input_batch, label_batch = rnn_input_batches[gpu], label_batches[gpu]
# Calculate loss for one tower of the model. Construct the entire model,
# but share the variable across all towers
loss, readout = tower_computation(scope, input_batch, label_batch, gpu)
# Reuse variables for the next tower, retain the summaries from the final tower.
#tf.get_variable_scope().reuse_variables()
summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
# Retain batch norm update operations only from the final tower.
# batchnorm_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope)
# Calculate the gradients for given batch on this tower
grads = opt.compute_gradients(loss)
tower_grads.append(grads)
tower_readouts.append(readout)
tower_summaries.append(summaries)
tower_losses.append(loss)
# Synchronize all towers
mean_grads = average_gradients(tower_grads)
self.readout = tf.concat(tower_readouts, axis = 0)
# Evaluation metrics
self.cross_entropy = tf.reduce_mean(tower_losses)
self.prediction = tf.nn.softmax(self.readout)
correct_prediction = tf.equal(tf.cast(tf.argmax(self.readout, 1), tf.int32), self.targets)
self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        _, self.auc_op = tf.metrics.auc(predictions = self.prediction[:,1], labels = self.targets, num_thresholds = 512)
# Track moving average of trainable variables
self.ema = tf.train.ExponentialMovingAverage(decay = config.ema_decay, num_updates = self.global_step)
maintain_averages_op = self.ema.apply(tf.trainable_variables())
# Apply the gradients to adjust the shared variables.
apply_gradient_op = opt.apply_gradients(mean_grads, global_step=self.global_step)
# Group all updates to into a single train op.
#batchnorm_updates_op = tf.group(*batchnorm_updates)
self.train_op = tf.group(apply_gradient_op, maintain_averages_op)#, batchnorm_updates_op)
saver = tf.train.Saver(tf.global_variables())
        # Build the summary operation from the per-tower summaries plus the
        # aggregate metrics (tf.summary.merge expects a flat list of summary ops)
        summary_ops = [s for tower in tower_summaries for s in tower]
        summary_ops.append(tf.summary.scalar('cross_entropy', self.cross_entropy))
        summary_ops.append(tf.summary.scalar('accuracy', self.accuracy))
        summary_ops.append(tf.summary.scalar('auc', self.auc_op))
        summary_ops.append(tf.summary.scalar('global_step', self.global_step))
        summary_ops.append(tf.summary.scalar('learning_rate', learning_rate))
        self.merge_op = tf.summary.merge(summary_ops)
def predict(self, ckpt, metaGraph = None):
pin_cpu = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True, device_count = {'GPU':0})
# Restore the moving average version of the learned variables for eval.
#variable_averages = tf.train.ExponentialMovingAverage(config.ema_decay)
variables_to_restore = self.ema.variables_to_restore()
#variables_to_restore = variable_averages.variables_to_restore()
saver = tf.train.Saver(variables_to_restore)
with tf.Session(config=pin_cpu) as sess:
# Initialize variables
init_op = tf.global_variables_initializer()
sess.run(init_op)
sess.run(tf.local_variables_initializer())
start_time = time.time()
assert (ckpt.model_checkpoint_path or metaGraph), 'Missing checkpoint file!'
if metaGraph:
saver = tf.train.import_meta_graph(metaGraph)
saver.restore(sess, os.path.splitext(metaGraph)[0])
print('{} restored.'.format(metaGraph))
else:
saver.restore(sess, ckpt.model_checkpoint_path)
print('{} restored.'.format(ckpt.model_checkpoint_path))
# Make predictions using the trained model
feed_dict_test = {self.inputs: df_X_test.values, self.targets: df_y_test.values, self.keep_prob: 1.0}#, self.training_phase: False}
network_output_test, final_v_acc, final_v_auc = sess.run(
[self.prediction, self.accuracy, self.auc_op], feed_dict = feed_dict_test)
print("Validation accuracy: {:g}\nValidation AUC: {:g}".format(final_v_acc, final_v_auc))
plot_ROC_curve(network_output = network_output_test, y_true = df_y_test.values,
meta = architecture + ' | Test accuracy: {}'.format(final_v_acc))
delta_t = time.time() - start_time
print("Inference complete. Duration: %g s" %(delta_t))
return network_output_test
In [5]:
def train(config, restore = False):
biRNN = BiRNN(config, training = True)
start_time = time.time()
v_acc_best = 0.
global_step = 0
global_epoch = 0
saver = tf.train.Saver()
train_writer = tf.summary.FileWriter(
os.path.join(directories.tensorboard, 'train_{}'.format(time.strftime('%d-%m_%I:%M'))), graph = tf.get_default_graph())
test_writer = tf.summary.FileWriter(os.path.join(directories.tensorboard, 'test_{}'.format(time.strftime('%d-%m_%I:%M'))))
ckpt = tf.train.get_checkpoint_state(directories.checkpoints)
with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)) as sess:
# Initialize variables
init_op = tf.global_variables_initializer()
sess.run(init_op)
if restore and ckpt.model_checkpoint_path:
saver.restore(sess, ckpt.model_checkpoint_path)
            global_epoch = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            print('{} restored at epoch {}.'.format(ckpt.model_checkpoint_path, global_epoch))
for epoch in range(global_epoch,config.num_epochs):
readerTrain.proceed = True
step = 0
# Save every 8 epochs
if epoch % 8 == 0:
save_path = saver.save(sess,
os.path.join(directories.checkpoints,'biRNN_{}_{}_epoch{}.ckpt'.format(config.mode, config.channel, epoch)),
global_step = epoch)
print('Graph saved to file: {}'.format(save_path))
print('(*) Entering Epoch {} ({:.3f} s)'.format(epoch, time.time() - start_time))
while(readerTrain.proceed):
# Iterate through entire corpus
x_train, y_train = readerTrain.next_batch(config.batch_size)
feed_dict_train = {biRNN.inputs: x_train.values, biRNN.targets: y_train.values, biRNN.keep_prob: config.keep_prob}
t_op = sess.run(biRNN.train_op, feed_dict = feed_dict_train)
step += 1
if step % (config.steps_per_epoch // 8) == 0:
# Evaluate model
improved = ''
sess.run(tf.local_variables_initializer())
x_test, y_test = readerTest.next_batch(config.batch_size)
feed_dict_test = {biRNN.inputs: x_test.values, biRNN.targets: y_test.values, biRNN.keep_prob: 1.0}
t_acc, t_summary = sess.run([biRNN.accuracy, biRNN.merge_op],
feed_dict = feed_dict_train)
v_acc, v_loss, v_auc, v_summary, = sess.run([biRNN.accuracy, biRNN.cross_entropy, biRNN.auc_op, biRNN.merge_op],
feed_dict = feed_dict_test)
train_writer.add_summary(t_summary, step)
test_writer.add_summary(v_summary, step)
if epoch > 8 and v_acc > v_acc_best:
v_acc_best = v_acc
improved = '*'
save_path = saver.save(sess, os.path.join(directories.checkpoints, 'best.ckpt'), global_step = epoch)
print('Epoch {}, Step {} | Training Acc: {:.3f} | Test Acc: {:.3f} | Test Loss: {:.3f} | Test AUC {:.3f} ({:.2f} s) {}'
.format(epoch, step, t_acc, v_acc, v_loss, v_auc, time.time() - start_time, improved))
save_path = saver.save(sess, os.path.join(directories.checkpoints, 'biRNN_end'),
global_step = epoch)
        print('Metagraph saved to file: {}'.format(save_path))
print('Architecture: {}'.format(architecture))
# final_train_accuracy = biRNN.accuracy.eval(feed_dict = {biRNN.inputs: df_X_train.values,
# biRNN.targets: df_y_train.values, biRNN.keep_prob: 1.0})
# final_test_accuracy = biRNN.accuracy.eval(feed_dict = {biRNN.inputs: df_X_test.values,
# biRNN.targets: df_y_test.values, biRNN.keep_prob: 1.0})
# delta_t = time.time() - start_time
# print("Training Complete. Time elapsed: {:.3f} s".format(delta_t))
# print("Train accuracy: %g\nValidation accuracy: %g" %(final_train_accuracy, final_test_accuracy))
# save_summary(config, delta_t, final_train_accuracy, final_test_accuracy)
In [6]:
train(config)#, restore = True)
In [4]:
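# NB: the char-RNN cells below appear to come from a separate character-level
# experiment. They expect a character-level `reader` object exposing
# next_batch(batch_size, seq_length), steps_per_epoch, proceed and char2ix --
# not the DataFrame reader class defined above.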
def cell_dropout(base_cell, keep_prob):
# Apply dropout between RNN layers - only on the output
cell_dropout = tf.contrib.rnn.DropoutWrapper(base_cell, output_keep_prob=keep_prob)
return cell_dropout
def layer_weights(shape, name = 'weights'):
# Return weight tensor of given shape using Xavier initialization
W = tf.get_variable(name, shape = shape, initializer=tf.contrib.layers.xavier_initializer())
return W
def layer_biases(shape, name = 'biases'):
# Return bias tensor of given shape with small initialized constant value
b = tf.get_variable(name, shape = shape, initializer = tf.constant_initializer(0.01))
return b
class CharRNN():
def __init__(self, config, training = True, sample = False):
self.config = config
self.scope = 'train'
if sample:
# Configure graph to generate characters
self.config.batch_size = 1
self.config.seq_length = 1
# Placeholders for feed_dict
self.inputs = tf.placeholder(tf.int32, shape = [None, self.config.seq_length])
self.targets = tf.placeholder(tf.int32, shape = [None, self.config.seq_length])
self.keep_prob = tf.placeholder(tf.float32) # Dropout on input connections
        # Initialize the embedding matrix uniformly in [-1, 1];
        # embeds char IDs into a dense representation of the RNN state size
embeddings = tf.Variable(
tf.random_uniform([config.vocab_size, config.hidden_units], -1.0, 1.0))
rnn_inputs = tf.nn.embedding_lookup(embeddings, self.inputs)
        # Place operations necessary to perform inference onto graph.
        # Each layer needs its own cell instance (reusing a single instance
        # across layers shares state), so wrap construction in a closure.
        def make_cell():
            if config.rnn_cell == 'lstm':
                cell = tf.contrib.rnn.LSTMCell(num_units = config.hidden_units, forget_bias = 1.0, state_is_tuple = True)
            elif config.rnn_cell == 'gru':
                cell = tf.contrib.rnn.GRUCell(num_units = config.hidden_units)
            elif config.rnn_cell == 'layer-norm':
                cell = tf.contrib.rnn.LayerNormBasicLSTMCell(num_units = config.hidden_units,
                                                             forget_bias = 1.0, dropout_keep_prob = config.recurrent_keep_prob)
            else:
                cell = tf.contrib.rnn.BasicRNNCell(num_units = config.hidden_units)
            # Apply dropout to the non-recurrent (input) connections during training
            if training and config.input_keep_prob < 1:
                cell = tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob = config.input_keep_prob)
            return cell
        if training and config.input_keep_prob < 1:
            rnn_inputs = tf.nn.dropout(rnn_inputs, config.keep_prob)
        # Wrap stacked cells into a single cell, one fresh instance per layer
        self.multicell = tf.contrib.rnn.MultiRNNCell(
            [make_cell() for _ in range(config.n_layers)], state_is_tuple=True)
# Accept previous hidden state as input
self.zero_state = self.multicell.zero_state(self.config.batch_size, tf.float32)
self.init_state = self.zero_state
# Outputs shaped [batch_size, max_time, cell.output_size]
rnn_outputs, self.final_state = tf.nn.dynamic_rnn(
cell = self.multicell, inputs = rnn_inputs, initial_state = self.init_state, scope = self.scope)
# Flatten outputs across batch_size, sequence length dimensions
flat_rnn_outputs = tf.reshape(rnn_outputs, [-1, config.hidden_units])
flat_targets = tf.reshape(self.targets, [-1])
with tf.variable_scope('softmax_{}'.format(self.scope)):
softmax_W = layer_weights(shape = [config.hidden_units, config.vocab_size], name = 'smx_W')
softmax_b = layer_biases(shape = [config.vocab_size], name = 'smx_b')
            self.logits_RNN = tf.matmul(flat_rnn_outputs, softmax_W) + softmax_b # Unnormalized log probabilities for the next character
self.predictions = tf.nn.softmax(self.logits_RNN)
self.cross_entropy = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits = self.logits_RNN, labels = flat_targets))
tf.summary.scalar('cross_entropy', self.cross_entropy)
# Anneal learning rate
global_step = tf.Variable(0, trainable=False)
learning_rate = tf.train.exponential_decay(config.learning_rate, global_step,
decay_steps = reader.steps_per_epoch, decay_rate = config.lr_epoch_decay, staircase=True)
        self.train_op = tf.train.AdamOptimizer(learning_rate).minimize(self.cross_entropy, name = 'optimizer',
                                                                       global_step = global_step)
    def sample(self, ckpt, char2ix, seed = 'The ', sample_length = 1000, simple = True, use_temperature = False, temperature = 5, test = False):
''' Samples a sequence of characters from a saved model with given seed
'''
chars = []
ix2char = dict(zip(char2ix.values(), char2ix.keys()))
with tf.Session() as sess:
saver = tf.train.Saver()
init_op = tf.global_variables_initializer()
sess.run(init_op)
if ckpt.model_checkpoint_path:
# saver = tf.train.import_meta_graph('checkpoints/fields/char-RNN_fields_epoch49.ckpt-49.meta')
# saver.restore(sess, 'checkpoints/fields/char-RNN_fields_epoch49.ckpt-49')
saver.restore(sess, ckpt.model_checkpoint_path)
print('{} restored.'.format(ckpt.model_checkpoint_path))
            # Start from the zero (blank) state, then condition on the given seed
state = sess.run(self.zero_state)
for char in seed:
ix = char2ix[char]
feed_dict_sample = {self.inputs: np.array([[ix]]), self.init_state: state}
state = sess.run(self.final_state, feed_dict = feed_dict_sample)
chars.append(ix)
current_char = chars[-1]
def weighted_pick(weights):
t = np.cumsum(weights)
s = np.sum(weights)
return(int(np.searchsorted(t, np.random.rand(1)*s)))
# Get predictions
            from scipy.special import logsumexp
def log_softmax(vec):
return vec - logsumexp(vec)
def softmax(vec):
return np.exp(log_softmax(vec))
for n in range(sample_length):
feed_dict_sample = {self.inputs: np.array([[current_char]]), self.init_state: state}
self.logits, preds, state = sess.run([self.logits_RNN, self.predictions, self.final_state], feed_dict = feed_dict_sample)
if use_temperature:
logits = np.squeeze(self.logits)
logits = np.asarray(logits, np.float64)
logits /= temperature
x = logits - np.max(logits)
#boltzmann_factor = np.exp(x)
#preds = boltzmann_factor/np.sum(boltzmann_factor)
preds = np.exp(x)/np.sum(np.exp(x))
if simple:
current_char = np.random.choice(config.vocab_size, 1, p = np.squeeze(preds))[0]
else:
dist = np.random.multinomial(n = 100, pvals = np.squeeze(preds))
current_char = np.argmax(dist)
if test:
current_char = weighted_pick(preds[0])
chars.append(current_char)
chars = [ix2char[ix] for ix in chars]
sample = ''.join(chars)
print(sample)
with open(os.path.join(directories.samples, 'sample_{}'.format(config.name)), 'w') as f:
json.dump(sample, f)
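# Worked example of the temperature scaling used in sample() above (NumPy
# only, illustrative): higher temperature flattens the sampling distribution,
# lower temperature sharpens it around the most likely character.
_logits = np.array([2.0, 1.0, 0.1])
for _T in (0.5, 1.0, 2.0):
    _z = _logits / _T
    _p = np.exp(_z - _z.max())
    print(_T, np.round(_p / _p.sum(), 3))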
def train(config, restore = False):
charRNN = CharRNN(config, training = True)
saver = tf.train.Saver()
merge_op = tf.summary.merge_all()
train_writer = tf.summary.FileWriter(
os.path.join(directories.tensorboard, 'train_{}'.format(time.strftime('%d-%m_%I:%M'))), graph = tf.get_default_graph())
test_writer = tf.summary.FileWriter(os.path.join(directories.tensorboard, 'test_{}'.format(time.strftime('%d-%m_%I:%M'))))
ckpt = tf.train.get_checkpoint_state(directories.checkpoints)
with tf.Session() as sess:
# Initialize variables
init_op = tf.global_variables_initializer()
sess.run(init_op)
        if restore and ckpt.model_checkpoint_path:
            # saver = tf.train.import_meta_graph('checkpoints/char-RNN__epoch49.ckpt-49.meta')
            # saver.restore(sess, 'checkpoints/char-RNN__epoch49.ckpt-49')
            saver.restore(sess, ckpt.model_checkpoint_path)
            print('{} restored.'.format(ckpt.model_checkpoint_path))
start_time = time.time()
for epoch in range(config.num_epochs):
# Reset RNN memory
state = sess.run(charRNN.zero_state)
reader.proceed = True
step = 0
total_loss = 0.0
epoch_mean_loss = []
# Save every epoch
save_path = saver.save(sess,
os.path.join(directories.checkpoints,'char-RNN_{}_epoch{}.ckpt'.format(config.name, epoch)),
global_step = epoch)
print('(*) Entering Epoch {} ({:.3f} s)'.format(epoch, time.time() - start_time))
print('Metagraph saved to file: {}'.format(save_path))
while(reader.proceed):
# Iterate through entire corpus
batch_inputs, batch_targets = reader.next_batch(config.batch_size, config.seq_length)
feed_dict_train = {charRNN.inputs: batch_inputs, charRNN.targets: batch_targets, charRNN.init_state: state}
t_loss, state, t_op = sess.run([charRNN.cross_entropy, charRNN.final_state, charRNN.train_op], feed_dict = feed_dict_train)
step += 1
if step % (reader.steps_per_epoch // 4) == 0:
# Evaluate model
val_inputs, val_targets = reader.next_batch(config.batch_size, config.seq_length)
feed_dict_val = {charRNN.inputs: val_inputs, charRNN.targets: val_targets, charRNN.init_state: state}
train_summary = sess.run(merge_op, feed_dict = feed_dict_train)
v_loss, v_summary, = sess.run([charRNN.cross_entropy, merge_op], feed_dict = feed_dict_val)
train_writer.add_summary(train_summary, step)
test_writer.add_summary(v_summary, step)
print('Epoch {}, Step {} | Training Loss (mean): {:.3f} ({:.3f}) | Validation Loss {:.3f}'
.format(epoch, step, t_loss, total_loss/step, v_loss))
total_loss += t_loss
epoch_mean_loss.append(total_loss/step)
save_path = saver.save(sess, os.path.join(directories.checkpoints, 'char-RNN_end'),
global_step = epoch)
print('Metagraph saved to file: {}'.format(save_path))
delta_t = time.time() - start_time
print("Training Complete. Time elapsed: %g s\n" %(delta_t))
print("Average train accuracy on final epoch: {:.3f}".format(epoch_mean_loss[-1]))
print('Architecture: {}\n'.format(architecture))
save_summary(config, delta_t, epoch_mean_loss[-1])
In [ ]:
train(config)#, restore = True)
In [5]:
with open(os.path.join(directories.checkpoints, 'char2ix_{}.json'.format(os.path.splitext(os.path.basename(file_name))[0])), 'r') as f:
char2ix = json.load(f)
char2ix = {key:int(char2ix[key]) for key in char2ix}
assert char2ix == reader.char2ix, 'Discrepancy in char-index map!'
ckpt = tf.train.get_checkpoint_state(directories.checkpoints)
model = CharRNN(config, training = False, sample = True)
In [15]:
model.sample(ckpt, char2ix = reader.char2ix, seed = 'The ', sample_length = 3000, simple = False, use_temperature = True, temperature = 2.3)