In [ ]:
#
# Practice problem! yay
#
# Our mission, should we choose to accept it, is to see if we can use supervised machine learning to train
# a deep neural network to count the number of vowels in words it has never seen before, without ever telling
# the neural network which letters are vowels!
#
# This is best done algorithmically with a few lines of code and no neural networks,
# but that doesn't matter - it's a practice problem and anything goes :)
# Also, we only count a, e, i, o, u as vowels (sorry, y, nothing personal).
#
# The data set consists of just over 10k words in random order. Each word has a maximum of 10 characters.
# The first row is the header, and there are four columns (the first is the row index):
#
# 1st column: (unnamed) - the row index
# 2nd column: a2i       - the word, in English
# 3rd column: vowels    - the number of vowels in the word
# 4th column: binary    - the word, encoded into 70 binary digits (7 bits per character), left-padded with 0.
# For example, the word 'vital' is
# encoded as 0000000000000000000000000000000000011101101101001111010011000011101100
#
# Special thanks to the UCI Machine Learning Repository, which provided the word list this data set is based on.
# For similar datasets, please see https://archive.ics.uci.edu/ml/datasets/bag+of+words
# Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.
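In [ ]:
# For reference, here is the algorithmic baseline the network is competing with,
# plus a sanity check of the 70-bit encoding described above. This is a minimal
# sketch; it assumes lowercase ASCII words, so each character fits in exactly 7 bits.
def count_vowels_directly(word):
    return sum(word.count(v) for v in 'aeiou')
def encode_word(word):
    return ''.join(format(ord(c), '07b') for c in word).rjust(70, '0')
print(count_vowels_directly('vital'))  # 2
print(encode_word('vital'))
# -> 0000000000000000000000000000000000011101101101001111010011000011101100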
In [ ]:
# This cell is used to generate the dataset
import pandas as pd
df = pd.read_csv('vocab.nips.txt')
df = df.sample(frac=1).reset_index(drop=True)
def countVowels(row):
    s = row['a2i']
    return s.count('a') + s.count('e') + s.count('i') + s.count('o') + s.count('u')
df['vowels'] = df.apply(countVowels, axis=1)
print(df.shape)
print(df.columns.values)
df = df[df.a2i.str.len() <= 10]
print(df.shape)
# Encode each character as its 7-bit code ('07b' zero-pads every character to 7 bits), then left-pad to 70 digits
df['binary'] = df.apply(lambda row: ''.join(format(ord(x), '07b') for x in row['a2i']).rjust(70, '0'), axis=1)
df.to_csv('vocab.vowels.txt')
In [ ]:
# This cell loads the data set and splits it into numpy arrays of labels and examples,
# for both training and evaluation.
# train_data and train_labels are used for training the neural net; train_words is just there for convenience.
import pandas as pd
import numpy as np
df = pd.read_csv('vocab.vowels.txt', names=['id', 'word', 'vowels', 'binary'], index_col='id', skiprows=1)
rows = df.shape[0]
train_rows = int(rows * 0.7)
print("Total rows: " + str(rows))
print("Training rows: " + str(train_rows))
df_train, df_eval = df.iloc[:train_rows, :], df.iloc[train_rows:, :]
train_list = []
eval_list = []
for i in df_train['binary']:
    train_list.append([int(c) for c in i])
for i in df_eval['binary']:
    eval_list.append([int(c) for c in i])
train_data = np.array(train_list).astype(np.float32)
eval_data = np.array(eval_list).astype(np.float32)
train_words = df_train.iloc[:,0].values
eval_words = df_eval.iloc[:,0].values
train_labels = df_train.iloc[:,1].values.astype(np.float32)
eval_labels = df_eval.iloc[:,1].values.astype(np.float32)
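In [ ]:
# Quick sanity check on the 70/30 split we just made
print(train_data.shape, train_labels.shape)
print(eval_data.shape, eval_labels.shape)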
In [ ]:
# Let's just print some example data
i = 5432
print(train_words[i])
print(train_labels[i])
print(train_data[i])
In [ ]:
import tensorflow as tf
import numpy as np
tf.reset_default_graph()
tf.logging.set_verbosity(tf.logging.INFO)
idx = 0 # The layer index
dropout_rate = 0.5
model_dir = "results"
def normalize(mode, input_layer):
    # Use batch statistics only while training; at eval/predict time use the moving averages
    return tf.layers.batch_normalization(input_layer, training=(mode == tf.estimator.ModeKeys.TRAIN))
# (convolution and pool are helpers kept around for experimentation; the model below only uses deep)
def convolution(mode, input_layer, filters, kernel_size, padding="VALID"):
    global idx
    idx = idx + 1
    print("Layer: conv" + str(idx))
    return normalize(mode, tf.layers.separable_conv2d(
        name="conv" + str(idx) + "_",
        inputs=input_layer,
        filters=filters,
        kernel_size=kernel_size,
        padding=padding,
        activation=tf.nn.relu))
def pool(mode, input_layer, pool_size=[2, 2], strides=[2, 2]):
    global idx
    idx = idx + 1
    print("Layer: pool" + str(idx))
    return tf.layers.max_pooling2d(inputs=input_layer, pool_size=pool_size, strides=strides, name="pool" + str(idx))
def deep(mode, layer, units, reshape=None):
    global idx
    idx = idx + 1
    print("Layer: deep" + str(idx))
    if reshape is not None:
        layer = tf.reshape(layer, reshape)
    # layer = tf.layers.dropout(inputs=layer, rate=dropout_rate, training=mode == tf.estimator.ModeKeys.TRAIN)
    layer = tf.layers.dense(inputs=layer, units=units, activation=tf.nn.relu)
    layer = normalize(mode, layer)
    return layer
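In [ ]:
# Before wiring the model into an Estimator, here is a minimal standalone sketch
# of the intended shape flow (it assumes the 70-bit inputs built above and mirrors
# the layer sizes used by model_fn in the next cell):
sketch_graph = tf.Graph()
with sketch_graph.as_default():
    x = tf.placeholder(tf.float32, [None, 70], name="x")
    h = x
    for units in [64, 64, 64, 64]:
        h = tf.layers.dense(inputs=h, units=units, activation=tf.nn.relu)
    logits = tf.layers.dense(inputs=h, units=11)  # one class per vowel count, 0..10
    print(x.shape, "->", h.shape, "->", logits.shape)  # (?, 70) -> (?, 64) -> (?, 11)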
In [ ]:
def model_fn(features, labels, mode):
    """Neural Network Model."""
    with tf.device("/gpu:0"):
        # Input Layer
        initial = tf.reshape(features["x"], [-1, 70])
        num_outputs = 11
        layer = initial
        k = [64, 64, 64, 64]
        # Fully connected layers
        layer = deep(mode, layer, k[0])
        layer = deep(mode, layer, k[1])
        layer = deep(mode, layer, k[2])
        layer = deep(mode, layer, k[3])
        # Logits Layer (vowel counts 0 through 10, so 11 possible outputs)
        logits = tf.layers.dense(inputs=layer, units=num_outputs, name="last_layer")
        predictions = {
            # Generate predictions (for PREDICT and EVAL mode)
            "classes": tf.argmax(input=logits, axis=1),
            # Add `softmax_tensor` to the graph. It is used for PREDICT and by the
            # `logging_hook`.
            "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
        }
        if mode == tf.estimator.ModeKeys.PREDICT:
            return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
        # Calculate Loss (for both TRAIN and EVAL modes)
        onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=num_outputs)
        loss = tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels, logits=logits)
        tf.summary.scalar('loss', loss)
        tf.summary.merge_all()
        # Configure the Training Op (for TRAIN mode)
        if mode == tf.estimator.ModeKeys.TRAIN:
            optimizer = tf.train.AdamOptimizer()
            # batch_normalization keeps its moving averages in UPDATE_OPS;
            # those updates must run alongside each training step
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                train_op = optimizer.minimize(
                    loss=loss,
                    global_step=tf.train.get_global_step())
            return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
        # Add evaluation metrics (for EVAL mode)
        eval_metric_ops = {
            "accuracy": tf.metrics.accuracy(
                labels=labels, predictions=predictions["classes"])}
        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=loss,
            eval_metric_ops=eval_metric_ops
        )
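In [ ]:
# Side note on the loss above: the labels are already integer class ids, so the
# explicit one-hot step could be skipped using the sparse variant of the same loss.
# A drop-in alternative for the two loss lines in model_fn would be:
#
#   loss = tf.losses.sparse_softmax_cross_entropy(
#       labels=tf.cast(labels, tf.int32), logits=logits)
#
# Both compute the same cross-entropy; the sparse form just avoids materializing
# the one-hot matrix.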
In [ ]:
# We'll use this to perform the training
def trainTheModel(train_data, train_labels, eval_data, eval_labels):
    global idx
    # Create the Estimator
    session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
    session_config.gpu_options.per_process_gpu_memory_fraction = 0.75
    run_config = tf.estimator.RunConfig()
    run_config = run_config.replace(
        save_checkpoints_steps=100,
        session_config=session_config,
        keep_checkpoint_max=100)
    estimator = tf.estimator.Estimator(
        model_fn=model_fn, model_dir=model_dir, config=run_config)
    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data},
        y=train_labels,
        batch_size=100,
        num_epochs=None,
        shuffle=True)
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data},
        y=eval_labels,
        batch_size=100,
        num_epochs=None,
        shuffle=True)
    summary_hook = tf.train.SummarySaverHook(
        100,
        output_dir=model_dir,
        scaffold=tf.train.Scaffold())
    # Alternate between 500 training steps and a 10-batch evaluation
    for epoch in range(10):
        # train
        idx = 0
        estimator.train(
            input_fn=train_input_fn,
            steps=500, hooks=[summary_hook])
        tf.reset_default_graph()
        idx = 0
        estimator.evaluate(input_fn=eval_input_fn, steps=10)
In [ ]:
# Start the training
trainTheModel(train_data, train_labels, eval_data, eval_labels)
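In [ ]:
# Once training has run, we can ask the network to count vowels in words it has
# never seen. A minimal sketch; it assumes the checkpoints written to model_dir above.
predict_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": eval_data[:5]}, num_epochs=1, shuffle=False)
predictor = tf.estimator.Estimator(model_fn=model_fn, model_dir=model_dir)
for word, label, pred in zip(eval_words[:5], eval_labels[:5],
                             predictor.predict(input_fn=predict_input_fn)):
    print(word, "actual:", int(label), "predicted:", pred["classes"])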
In [ ]: