In [ ]:
#
# Practice problem! yay
#
# Our mission, should we choose to accept it, is to see if we can use supervised machine learning to train
# a deep neural network to count the number of vowels in words it has never seen before, without ever telling
# the neural network which letters are vowels!
#
# This is best done algorithmically with a few lines of code and no neural networks,
# but that doesn't matter - it's a practice problem and anything goes :)
# Also, we only count a,e,i,o,u as vowels (sorry about that, y - nothing personal).
#
# The data set consists of just over 10k words in random order. Each word has a maximum of 10 characters.
# The first row is the header, and there are four columns (the first is the row index):
#
# 1st column: (unnamed) - the row index
# 2nd column: a2i - the word, in English
# 3rd column: vowels - the number of vowels in the word
# 4th column: binary - the word, encoded into 70 binary digits, left-padded with 0, with 7 bits per character.
# For example, the word 'vital' is
# encoded as 0000000000000000000000000000000000011101101101001111010011000011101100
#
# Special thanks to the UCI Machine Learning Repository, which provided the data set that this one is based on.
# For similar datasets, please see https://archive.ics.uci.edu/ml/datasets/bag+of+words
# Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.
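In [ ]:
# A quick illustration of the encoding described above (my own sketch, not part of
# the pipeline). Every lowercase ASCII letter has a 7-bit code, so a word is just the
# concatenation of those codes, left-padded with '0' to 70 digits - and counting the
# vowels algorithmically really does take only one line.
word = 'vital'
bits = ''.join(format(ord(c), '07b') for c in word).rjust(70, '0')
print(bits)                             # matches the example encoding above
print(sum(c in 'aeiou' for c in word))  # 2 vowels: 'i' and 'a'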
In [ ]:
# Generate the dataset
import pandas as pd
# Read in the original collection of words. The file has one word per line and no
# header row, so read_csv treats the first word in the file, 'a2i', as the column name.
df = pd.read_csv('vocab.nips.txt')
# Randomize the list
df = df.sample(frac=1).reset_index(drop=True)
# Keep the words that have 10 or fewer characters
df = df[df.a2i.str.len() <= 10]
# Count the number of vowels in each word and store them in the 'vowels' column.
def countVowels(row):
    s = row['a2i']
    return s.count('a') + s.count('e') + s.count('i') + s.count('o') + s.count('u')
df['vowels'] = df.apply(countVowels, axis=1)
# Create a column holding the binary representation of each word: the 7-bit ASCII code of each
# character, left-padded with '0' to 70 digits (the '07b' format guarantees 7 bits per character)
df['binary'] = df.apply(lambda row: ''.join(format(ord(x), '07b') for x in row['a2i']).rjust(70, '0'), axis=1)
# Save the new dataset as a file
df.to_csv('vocab.vowels.csv')
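In [ ]:
# Optional sanity check (my own sketch, not part of the generation script): decode the
# 70-bit strings back into words by reading 7 bits at a time and dropping the all-zero
# padding chunks, then confirm that every row round-trips to its original word.
def decode(bits):
    chars = [chr(int(bits[i:i+7], 2)) for i in range(0, 70, 7)]
    return ''.join(c for c in chars if c != '\x00')
assert all(decode(row.binary) == row.a2i for row in df.itertuples())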
In [ ]:
# This cell loads the data set and splits it into numpy arrays of labels and examples,
# for both training and evaluation.
# train_data and train_labels are used for training the neural net; train_words is just there for convenience.
import pandas as pd
import numpy as np
df = pd.read_csv('vocab.vowels.csv', names=['id', 'word', 'vowels', 'binary'], index_col='id', skiprows=1)
rows = df.shape[0]
train_rows = int(rows * 0.7)
df_train, df_eval = df.iloc[:train_rows, :], df.iloc[train_rows:, :]
train_list = [[int(c) for c in i] for i in df_train['binary']]
eval_list = [[int(c) for c in i] for i in df_eval['binary'] ]
train_data = np.array(train_list).astype(np.float32)
eval_data = np.array(eval_list).astype(np.float32)
train_words = df_train['word'].values
eval_words = df_eval['word'].values
train_labels = df_train['vowels'].values.astype(np.float32)
eval_labels = df_eval['vowels'].values.astype(np.float32)
print("Total rows: " + str(rows))
print("Training rows: " + str(train_rows))
print("Evaluation rows: " + str(rows - train_rows))
In [ ]:
# Let's just print some example data
i = 1131
print(train_words[i])
print(train_labels[i])
print(train_data[i])
In [ ]:
import os
import tensorflow as tf
import numpy as np
from tensorflow.contrib.tensorboard.plugins import projector
tf.logging.set_verbosity(tf.logging.INFO)
dropout_rate = 0.25
def convolution(mode, input_layer, filters, kernel_size, strides=1, padding="VALID", normalize=False):
    """
    Creates a convolutional layer with the given number of
    filters and kernel size, with optional batch normalization
    """
    layer = tf.layers.conv1d(
        inputs=input_layer,
        kernel_size=kernel_size,
        filters=filters,
        strides=strides,
        padding=padding,
        activation=tf.nn.relu)
    if normalize:
        # Only use batch statistics while training; use the moving averages otherwise
        layer = tf.layers.batch_normalization(layer, training=(mode == tf.estimator.ModeKeys.TRAIN))
    return layer

def deep(mode, layer, units, reshape=None):
    """
    Creates a dense (fully connected) layer with dropout and batch normalization
    """
    if reshape is not None:
        layer = tf.reshape(layer, reshape)
    layer = tf.layers.dropout(inputs=layer, rate=dropout_rate, training=(mode == tf.estimator.ModeKeys.TRAIN))
    layer = tf.layers.dense(inputs=layer, units=units, activation=tf.nn.relu)
    layer = tf.layers.batch_normalization(layer, training=(mode == tf.estimator.ModeKeys.TRAIN))
    return layer
In [ ]:
def model_fn(features, labels, mode):
    """Creates the neural network model"""
    with tf.device("/gpu:0"):
        # Input layer: each example is a flat vector of 70 bits
        input_layer = tf.reshape(features["x"], [-1, 70])
        num_outputs = 11  # a word of up to 10 characters can contain 0 to 10 vowels
        # Send the input through fully connected layers (FC layers) #deeplearning
        layer1 = deep(mode, input_layer, 128)
        layer2 = deep(mode, layer1, 64)
        layer3 = deep(mode, layer2, 32)
        # Classification layer (there are 11 possible outputs)
        logits = tf.layers.dense(inputs=layer3, units=num_outputs, name="output_layer")
        output = {
            "classes": tf.argmax(input=logits, axis=1),
            "probabilities": tf.nn.softmax(logits, name="softmax_tensor"),
            "layer1": layer1,
            "layer2": layer2,
            "layer3": layer3
        }
        loss = None
        train_op = None
        eval_metric_ops = None
        # i.e. for both TRAIN and EVAL modes
        if mode != tf.estimator.ModeKeys.PREDICT:
            onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=num_outputs)
            # Calculate the loss (using cross-entropy)
            loss = tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels, logits=logits)
            # Minimize the loss. Batch normalization keeps its moving-average updates
            # in UPDATE_OPS, so run those together with each training step.
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                train_op = tf.train.GradientDescentOptimizer(0.01).minimize(
                    loss=loss, global_step=tf.train.get_global_step())
            # Gather some metrics
            tf.summary.scalar('loss', loss)
            eval_metric_ops = {
                "accuracy": tf.metrics.accuracy(labels=labels, predictions=output["classes"])
            }
        tf.summary.merge_all()
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op,
                                          eval_metric_ops=eval_metric_ops,
                                          predictions=output)
In [ ]:
model_dir = "demo10"
input_fn = {
    # Feeds shuffled training batches indefinitely
    "training": tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data},
        y=train_labels,
        batch_size=64,
        num_epochs=None,
        shuffle=True),
    # Feeds batches of held-out data for evaluation
    "evaluation": tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data},
        y=eval_labels,
        batch_size=1024,
        num_epochs=None,
        shuffle=True),
    # Feeds the whole evaluation set once, in order, for predictions
    "prediction": tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data},
        y=eval_labels,
        batch_size=len(eval_data),
        num_epochs=1,
        shuffle=False)
}
def trainTheModel(input_fn):
    """
    Expects an input_fn dict containing tensorflow input functions with the names
    "training", "evaluation", "prediction"
    """
    # Create the Estimator
    session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
    session_config.gpu_options.per_process_gpu_memory_fraction = 0.75
    run_config = tf.estimator.RunConfig()
    run_config = run_config.replace(
        save_checkpoints_steps=200,
        session_config=session_config,
        keep_checkpoint_max=100)
    estimator = tf.estimator.Estimator(
        model_fn=model_fn, model_dir=model_dir, config=run_config)
    # Alternate between training and evaluation so eval metrics show up periodically
    for _ in range(5):
        estimator.train(input_fn=input_fn["training"], steps=1000)
        estimator.evaluate(input_fn=input_fn["evaluation"], steps=1)
trainTheModel(input_fn)
print("Done")
In [ ]:
# Strategy:
# 1. Use the estimator to run predictions on the first `samples` elements of the evaluation set.
# 2. Get back an iterator of the predictions.
# 3. Extract the prediction tensors (the embeddings from the various layers) as a bunch of numpy arrays.
# 4. Create a new session (and new graph).
# 5. Create tf.Variable objects in the new session, initialized to the numpy arrays.
# 6. Add the variables to a projector config.
# 7. Run the session (try tf.global_variables_initializer() or a variant).
# 8. Save the graph and a checkpoint.
# 9. View them in TensorBoard.
samples = 1500
# Write the labels.tsv metadata file that TensorBoard's projector displays alongside the embeddings
path = os.path.join(model_dir, "outputs")
if not os.path.exists(path):
    os.makedirs(path)
with open(os.path.join(path, 'labels.tsv'), 'w') as f:
    f.write("Index\tLabel\tVowels\tLetters\n")
    for index, label in enumerate(eval_labels):
        if index >= samples:
            break
        f.write("%d\t%s\t%d\t%d\n" % (index, eval_words[index] + str(int(label)), int(label), len(eval_words[index])))
def save_outputs(input_fn, model_fn, model_dir, layer_names, samples):
    # Create the estimator
    estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir=model_dir)
    # Run a prediction
    prediction = estimator.predict(input_fn=input_fn)
    # Loop through the results and create a dict of the example outputs from each desired layer
    i = 0
    layers = {}
    for name in layer_names:
        layers[name] = []
    for p in prediction:
        i = i + 1
        if i > samples:
            break
        for name in layer_names:
            layers[name].append(p[name])
    # Convert each layer's outputs to a numpy array and save them as variables in a fresh graph
    with tf.Graph().as_default():
        with tf.Session() as sess:
            tfvars = []
            for name in layer_names:
                v = tf.get_variable(name, np.shape(layers[name]), initializer=tf.zeros_initializer)
                va = tf.assign(v, np.array(layers[name]), name=name)
                tfvars.append(va)
            print(tfvars[0])
            sess.run(tfvars)
            # `path` comes from the cell above
            tf.train.Saver().save(sess, os.path.join(path, "outputs.ckpt"))
save_outputs(input_fn["prediction"], model_fn, model_dir, ["layer1", "layer2", "layer3"], samples)
print("Done")
In [ ]:
#
# # Convolutional layers: with kernel size 7 and stride 7, each filter position lines
# # up with exactly one 7-bit character, so conv0 learns per-character features.
# # conv1d expects rank-3 input, so the 70-bit vector needs a channel dimension first.
# conv_in = tf.reshape(input_layer, [-1, 70, 1])
# conv0 = convolution(mode, conv_in, 52, 7, strides=7, padding="VALID")
# flat = tf.reshape(conv0, [-1, 520])  # 10 positions x 52 filters, reshaped into a flat array