In [12]:
import tensorflow as tf
import numpy as np
import math
import matplotlib.pyplot as plt
from preprocessing import directory_to_data_files, sample_data
import time
# ---- Run configuration ----
# Board geometry: a square board flattened into D cells
board_size = 19
D = board_size * board_size
# CSV file(s) the input pipeline reads from
data_filenames = ['./Data/data_19x19.csv']
# Training schedule
n_batches = 100000  # Number of batches
# Number of training examples randomly sampled from the data file per batch
# (note: repeated sampling could give repeat examples)
batch_size = 128
# Default value for a board column in the CSV: D zeros separated by single spaces
empty_board_string = " ".join(["0"] * D)
# Graph inputs that persist across all versions of the network.
# The CNN takes the board in matrix form. Single-plane variant, kept for reference:
# X = tf.placeholder(tf.float32, [None, board_size, board_size, 1])
# Active variant: the [19x19x2] input type (two planes per board)
X = tf.placeholder(dtype=tf.float32, shape=[None, board_size, board_size, 2])
# Target vectors, one entry per board cell
Y_ = tf.placeholder(dtype=tf.float32, shape=[None, D])
# Learning rate is fed at run time so that it can be scheduled if we want
learning_rate = tf.placeholder(dtype=tf.float32)
# Dropout keep probability (p that we keep a node), fed at run time so it stays modifiable
dropout = tf.placeholder(dtype=tf.float32)
In [13]:
def read_my_csv(filename_queue):
    """Read one CSV line from the queue and decode it into (features, labels).

    A record has three string columns. The first is decoded but not used.
    The second and third each hold space-separated numbers, which are
    converted to float32 and reshaped to a vector of length D.

    Args:
        filename_queue: queue of file names handed to tf.TextLineReader.read.

    Returns:
        A (features, labels) pair of float32 tensors, each of shape [D].
    """
    def _board_vector(field):
        # Split the space-separated field, then convert the tokens to a float32 vector of length D
        tokens = tf.string_split(tf.expand_dims(field, axis=0), delimiter=" ")
        return tf.reshape(tf.string_to_number(tokens.values, out_type=tf.float32), [D])

    # The reader yields one text line per read
    line_reader = tf.TextLineReader()
    _, line = line_reader.read(filename_queue)
    # Column defaults: "0" for the first column, an all-zero board for the other two
    column_defaults = [["0"], [empty_board_string], [empty_board_string]]
    _, feature_field, label_field = tf.decode_csv(line, record_defaults=column_defaults)
    # Preprocessing: turn both board strings into numeric vectors
    features = _board_vector(feature_field)
    labels = _board_vector(label_field)
    return features, labels
In [14]:
def input_pipeline(filenames, batch_size):
    """Build a randomly shuffled batch pipeline over the given CSV files.

    Args:
        filenames: list of CSV paths passed to tf.train.string_input_producer.
        batch_size: number of examples in each produced batch.

    Returns:
        An (example_batch, label_batch) pair of tensors from tf.train.shuffle_batch.
    """
    file_queue = tf.train.string_input_producer(filenames, shuffle=True)
    single_example, single_label = read_my_csv(file_queue)
    # Minimum number of elements left in the shuffle queue after a dequeue
    shuffle_floor = 100
    # Queue capacity leaves room for three batches on top of that minimum
    queue_capacity = shuffle_floor + 3 * batch_size
    # shuffle_batch performs the random shuffling while assembling batches
    batched = tf.train.shuffle_batch(
        [single_example, single_label],
        batch_size=batch_size,
        capacity=queue_capacity,
        min_after_dequeue=shuffle_floor)
    example_batch, label_batch = batched
    return example_batch, label_batch
In [15]:
# Training batches of `batch_size` examples each
example_batch, label_batch = input_pipeline(filenames=data_filenames, batch_size=batch_size)
# Evaluation batches of 10000 examples each.
# NOTE(review): these are read from the same files as the training batches - confirm this is intended.
test_batch, test_label_batch = input_pipeline(filenames=data_filenames, batch_size=10000)
In this section we will set up the variables to be used, and then define a structure built around them. As input, the CNN will expect a matrix representation of the game board.
Weights are initialized with random values drawn from a normal distribution centered on zero. A truncated normal is used so that every value lies within 2 standard deviations of zero.
Biases are initialized to 0.1 for now. If any other approaches surface, we can easily change.
The shape convention for each group of weights is explained in the comments of the code.
In [16]:
# Output depth (number of filters) of each convolutional layer.
# With a stride of 1 a layer's output is a [19 x 19 x Depth] box.
L1, L2, L3 = 8, 16, 32
# Number of neurons in the fully connected layer
neurons = 2000


def _weights(shape):
    """Create a weight variable drawn from a truncated normal with stddev 0.01."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.01))


def _biases(size):
    """Create a float32 bias variable of the given length, initialised to 0.1."""
    return tf.Variable(tf.constant(0.1, tf.float32, [size]))


######################
# Weights and biases #
######################
# Convolutional layers
# A weight tensor [x, x, y, z] holds z filters of size x by x with depth y,
# so the layer's output has depth z.
# Single-plane input variant, kept for reference:
# W1 = tf.Variable(tf.truncated_normal([7, 7, 1, L1], stddev=0.01))
# Active variant: weights for the [19x19x2] input type
W1 = _weights([3, 3, 2, L1])
B1 = _biases(L1)
W2 = _weights([3, 3, L1, L2])
B2 = _biases(L2)
W3 = _weights([3, 3, L2, L3])
B3 = _biases(L3)
# Fully connected layer
# The output of the third conv layer is unwrapped into a 1-D vector.
# Be careful here! In the shape [x * x * L3, neurons], x is the side length of the
# board after the third layer, so it must change whenever the strides or padding
# used when building the graph change.
L3_size = 4
W4 = _weights([L3_size * L3_size * L3, neurons])
B4 = _biases(neurons)
# Output layer
# A layer of size D that tells us which move to use
W5 = _weights([neurons, D])
B5 = _biases(D)
Next we will define the graph of the network.
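For the stride parameter with 'SAME' padding, stride=1 leaves the dimensions of the board unchanged, while stride=2 halves them (rounding up): ceil(19/2) = 10.
The code below, however, uses 'VALID' padding with 3x3 filters in the second and third layers, so each of those layers outputs floor((n - 3)/2) + 1 values per side: the 19x19 board becomes 9x9 and then 4x4.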
Each of the layers uses relu as the non-linear function. The first three layers use conv2d to pass the filters across the board. The first layer passes padding='SAME', which adds zeroes around the outside of the board so that the filters can cover the whole board without decreasing its size; the second and third layers pass padding='VALID', which adds no padding.
Dropout is only used in the fully connected layer. This is tunable (the probability of keeping a neuron)
We keep the 'logits' separate from the softmax output Y so that we can use the softmax_cross_entropy_with_logits function. This function lets us avoid taking log(0), which would result in NaN.
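The cross entropy is averaged over the batch and then multiplied by batch_size before being minimized, so the reported loss is on the scale of a sum over one training batch.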
In [17]:
# Construct the graph of the network.
def _conv_relu(inputs, filters, bias, step, padding):
    """Convolve `inputs` with `filters` at the given stride and padding, add the bias, apply ReLU."""
    return tf.nn.relu(tf.nn.conv2d(inputs, filters, strides=[1, step, step, 1], padding=padding) + bias)


# Layer 1: stride 1 with 'SAME' padding keeps the 19x19 board size
stride = 1
Y1 = _conv_relu(X, W1, B1, stride, 'SAME')
# Layers 2 and 3: stride 2 with 'VALID' padding and 3x3 filters shrinks 19x19 -> 9x9 -> 4x4
stride = 2
Y2 = _conv_relu(Y1, W2, B2, stride, 'VALID')
Y3 = _conv_relu(Y2, W3, B3, stride, 'VALID')
# Unroll the final conv output into a 1-D vector per example:
# L3_size is the side length of Y3 and L3 is its depth
YY3 = tf.reshape(Y3, shape=[-1, L3_size * L3_size * L3])
# Fully connected layer, with dropout applied to its output
Y4 = tf.nn.relu(tf.matmul(YY3, W4) + B4)
YY4 = tf.nn.dropout(Y4, dropout)
# Keep the logits separate from the softmax output
Ylogits = tf.matmul(YY4, W5) + B5
Y = tf.nn.softmax(Ylogits)
# Computing the loss from the logits avoids taking log(0) and getting NaNs
# cross_entropy = -tf.reduce_sum(Y_ * tf.log(Y))
per_example_ce = tf.nn.softmax_cross_entropy_with_logits(logits=Ylogits, labels=Y_)
# Mean over the batch, scaled by the (training) batch size
cross_entropy = tf.reduce_mean(per_example_ce) * batch_size
# Top-1 accuracy
correct = tf.equal(tf.argmax(Y, 1), tf.argmax(Y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
# Top-5 accuracy
labels = tf.argmax(Y_, 1)
top5 = tf.nn.in_top_k(Y, labels, 5)
top5_acc = tf.reduce_mean(tf.cast(top5, tf.float32))
# RMSProp is the optimizer in use. Adam (the one that keeps showing up in example
# code and on stackoverflow) and plain gradient descent are kept as alternatives.
train_step = tf.train.RMSPropOptimizer(learning_rate).minimize(cross_entropy)
# train_step = tf.train.AdamOptimizer(learning_rate).minimize(cross_entropy)
# train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(cross_entropy)
In [19]:
def _to_two_planes(flat_boards):
    """Convert a batch of flat board vectors into the [19x19x2] network input.

    Args:
        flat_boards: array-like of shape [N, board_size * board_size].

    Returns:
        Array of shape [N, board_size, board_size, 2]. Plane 0 keeps the
        positive entries (zero elsewhere); plane 1 holds the negative entries
        negated (zero elsewhere). The original notebook describes these as
        "1s for self, 1s for other".
    """
    boards = np.reshape(flat_boards, [-1, board_size, board_size, 1])
    positive_plane = np.where(boards > 0, boards, 0)
    negative_plane = np.where(boards < 0, boards, 0) * -1
    return np.concatenate((positive_plane, negative_plane), axis=3)


# Metrics recorded every 10th batch and on the final batch
train_ces = []
train_accs = []
test_accs = []
test_ces = []
test_top5_accs = []
with tf.Session() as sess:
    start_time = time.time()
    sess.run(tf.global_variables_initializer())
    coordinator = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coordinator)
    try:
        # Draw one fixed evaluation batch up front and reuse it for every evaluation
        test_x_batch, test_y_batch = sess.run([test_batch, test_label_batch])
        test_x_batch = _to_two_planes(test_x_batch)
        # Evaluation keeps every neuron (keep probability 1.0); feeding the training
        # keep probability here would randomly drop units and distort the metrics
        test_data = {X: test_x_batch, Y_: test_y_batch, dropout: 1.0}
        for i in range(n_batches):
            train_x_batch, train_y_batch = sess.run([example_batch, label_batch])
            train_x_batch = _to_two_planes(train_x_batch)
            train_data = {X: train_x_batch, Y_: train_y_batch, learning_rate: 0.005, dropout: 0.6}
            sess.run(train_step, feed_dict=train_data)
            if i == 0:
                print('Test Test Test Train Train')
                print('Accuracy Top-5 Accuracy Cross_entropy Accuracy Cross_entropy Batch')
                print('-------- -------------- ------------- -------- ------------- -----')
            if i % 10 == 0 or i == n_batches - 1:
                # Same training batch as the step above, but evaluated without dropout
                train_eval_data = {X: train_x_batch, Y_: train_y_batch, dropout: 1.0}
                test_accuracy, test_top5_acc, test_cross_entropy = sess.run(
                    [accuracy, top5_acc, cross_entropy], feed_dict=test_data)
                train_accuracy, train_cross_entropy = sess.run(
                    [accuracy, cross_entropy], feed_dict=train_eval_data)
                print('%.4f %.4f %.0f %.3f %.0f %d' %
                      (test_accuracy, test_top5_acc, test_cross_entropy,
                       train_accuracy, train_cross_entropy, i))
                test_accs.append(test_accuracy)
                test_ces.append(test_cross_entropy)
                test_top5_accs.append(test_top5_acc)
                train_accs.append(train_accuracy)
                train_ces.append(train_cross_entropy)
    finally:
        # Always stop and join the queue-runner threads, even if training fails or is
        # interrupted; the `with` block then closes the session
        coordinator.request_stop()
        coordinator.join(threads)
In [ ]:
# Plot the accuracies recorded by the training loop (test_accs, test_top5_accs).
# Metrics were recorded every 10 batches, so the k-th entry is plotted at batch 10*k.
# The final entry is recorded at the last batch, so its x position is approximate.
batches = [i * 10 for i in range(len(test_accs))]
# The network uses ReLU activations throughout
plt.title('CNN Model w/ ReLU')
plt.plot(batches, test_accs, label='Test Accuracy')
plt.plot(batches, test_top5_accs, label='Top-5 Accuracy')
# plt.plot(batches, train_accs, label='Training Accuracy')
plt.legend(loc='best')
plt.xlabel('Batch')
plt.ylabel('Accuracy')
plt.show()
In [ ]: