In [ ]:
#
# Practice problem!  yay
#
# Our mission, should we choose to accept it, is to see if we can use supervised machine learning to train
# a deep neural network to count the number of vowels in words it has never seen before, without ever telling
# the neural network which letters are vowels!
#
# This is best done algorithmically with a few lines of code and no neural networks
# but that doesn't matter - it's a practice problem and anything goes :)  
# Also, we only count a,e,i,o,u as vowels (sorry about that y, nothing personal).
#
# The data set consists of just over 10k words in random order.  Each word has a maximum of 10 characters.
# The first row is the header, and there are five columns (the first is the row index):
#
#   1st column:         - the row index
#   2nd column:  a2i    - the word, in english
#   3rd column:  vowels - the number of vowels in the word
#   4th column:  binary - the word, encoded into 70 binary digits, left-padded with 0, with 7 bits per character.
#                         For example, the word 'vital' is 
#                         encoded as 0000000000000000000000000000000000011101101101001111010011000011101100
#


# Special thanks to the UCI Machine Learning Repository who provided the data set that this one was based on
# For similar datasets, please see https://archive.ics.uci.edu/ml/datasets/bag+of+words
# Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.

In [ ]:
# This cell is used to generate the dataset

import pandas as pd

# Load the raw vocabulary and shuffle all rows up front so the later
# positional train/eval split is effectively random.
df = pd.read_csv('vocab.nips.txt')
df = df.sample(frac=1).reset_index(drop=True)

def countVowels(row):
    """Return how many vowels (a, e, i, o, u) appear in the word in row['a2i'].

    Only lowercase a/e/i/o/u are counted ('y' is deliberately excluded,
    matching the problem statement above).
    """
    word = row['a2i']
    return sum(1 for ch in word if ch in 'aeiou')


# Label every word with its vowel count, then drop words longer than 10
# characters so each word fits the fixed-width encoding (10 chars x 7 bits = 70).
df['vowels'] = df.apply(countVowels, axis=1)
print(df.shape)
print(df.columns.values)
df = df[df.a2i.str.len() <= 10]
print(df.shape)

# Encode each word as 7 bits per character, left-padded with '0' to 70 digits.
# NOTE: '07b' zero-pads every character to exactly 7 bits.  The original plain
# 'b' format only happens to yield 7 bits for lowercase a-z (ord 97-122) and
# would silently misalign the encoding for any character with ord < 64.
# Also read the word via its column name ('a2i'), consistent with countVowels,
# rather than the positional row[0].
df['binary'] = df.apply(lambda row: ''.join(format(ord(x), '07b') for x in row['a2i']).rjust(70, '0'), axis=1)
df.to_csv('vocab.vowels.txt')

In [ ]:
# This cell loads the data set and splits it into numpy arrays of labels and examples, 
# for both training and evaluation.

# The train_data, and train_labels are used for training the neural net.  The train_words is just there for convenience.

import pandas as pd
import numpy as np

# Re-read the encoded dataset; 'id' (the index written by to_csv) becomes the
# frame index, leaving columns: word, vowels, binary.
df = pd.read_csv('vocab.vowels.txt', names=['id', 'word', 'vowels', 'binary'], index_col='id', skiprows=1)
rows = df.shape[0]
train_rows = int(rows * 0.7)  # 70/30 train/eval split (rows were pre-shuffled)
print("Total rows:    " + str(rows))
print("Training rows: " + str(train_rows))

df_train, df_eval = df.iloc[:train_rows, :], df.iloc[train_rows:, :]

# Expand each 70-character '0'/'1' string into a list of 70 ints (one per bit).
# List comprehensions replace the original append-loops -- same result.
train_list = [[int(c) for c in bits] for bits in df_train['binary']]
eval_list = [[int(c) for c in bits] for bits in df_eval['binary']]

train_data = np.array(train_list).astype(np.float32)
eval_data  = np.array(eval_list).astype(np.float32)

# Select columns by name rather than position ('word' was .iloc[:, 0] and
# 'vowels' was .iloc[:, 1] once 'id' became the index) -- same data, less
# fragile if the column order ever changes.
train_words = df_train['word'].values
eval_words = df_eval['word'].values
train_labels = df_train['vowels'].values.astype(np.float32)
eval_labels = df_eval['vowels'].values.astype(np.float32)

In [ ]:
# Let's just print some example data
# (an arbitrary training row: the word, its vowel-count label, and the
# 70-element float vector of its 7-bit-per-character encoding)
i = 5432
print(train_words[i])
print(train_labels[i])
print(train_data[i])

In [ ]:
import tensorflow as tf
import numpy as np


# Start from a clean default graph and enable INFO-level logging (TF 1.x APIs).
tf.reset_default_graph()
tf.logging.set_verbosity(tf.logging.INFO)

idx = 0 # The layer index; incremented by each layer-builder below to produce unique layer names
dropout_rate = 0.5  # NOTE(review): only referenced by the commented-out dropout in deep()
model_dir = "results"  # checkpoints and summaries are written here

def normalize(mode, input):
    """Batch-normalize `input`, honoring the estimator mode.

    During training, batch statistics are used (and the moving averages are
    updated via ops added to tf.GraphKeys.UPDATE_OPS).  During EVAL/PREDICT
    the learned moving averages are used instead.  The original hard-coded
    training=True, which made evaluation depend on eval-batch statistics.
    """
    return tf.layers.batch_normalization(
        input, training=(mode == tf.estimator.ModeKeys.TRAIN))

def convolution(mode, input_layer, filters, kernel_size, padding="VALID"):
    """Add a depthwise-separable 2-D conv layer (ReLU) followed by batch norm.

    Bumps the global layer counter so every layer gets a unique name.
    """
    global idx
    idx += 1
    print("Layer: conv" + str(idx))

    conv = tf.layers.separable_conv2d(
        name="conv" + str(idx) + "_",
        inputs=input_layer,
        filters=filters,
        kernel_size=kernel_size,
        padding=padding,
        activation=tf.nn.relu)
    return normalize(mode, conv)
                     
def pool(mode, input_layer, pool_size=[2,2], strides=[2,2]):
    """Add a 2-D max-pooling layer.

    `mode` is accepted for API symmetry with the other layer builders but is
    not used here.  Bumps the global layer counter for unique layer names.
    """
    global idx
    idx += 1
    print("Layer: pool" + str(idx) + "_")

    return tf.layers.max_pooling2d(
        inputs=input_layer,
        pool_size=pool_size,
        strides=strides,
        name="pool" + str(idx))

def deep(mode, layer, units, reshape=None):
    """Add a fully-connected (dense) ReLU layer followed by batch norm.

    Args:
        mode: a tf.estimator.ModeKeys value; controls batch-norm behaviour.
        layer: the input tensor.
        units: number of units in the dense layer.
        reshape: optional target shape; when given, `layer` is reshaped first.
    """
    global idx
    idx = idx+1
    print("Layer: deep" + str(idx) + "_")

    # 'is not None' instead of '!= None': equality on tensors/array-likes can
    # be overloaded, so identity comparison is the safe idiom.
    if reshape is not None:
        layer = tf.reshape(layer, reshape)
    # Optional regularisation, currently disabled:
    # layer = tf.layers.dropout(inputs=layer, rate=dropout_rate, training=mode == tf.estimator.ModeKeys.TRAIN)
    layer = tf.layers.dense(inputs=layer, units=units, activation=tf.nn.relu)
    # Use batch statistics only while training; the original hard-coded
    # training=True, which made evaluation use eval-batch statistics instead
    # of the learned moving averages.
    layer = tf.layers.batch_normalization(
        layer, training=(mode == tf.estimator.ModeKeys.TRAIN))
    return layer

In [ ]:
def model_fn(features, labels, mode):
    """Neural Network Model.

    A stack of four 64-unit dense+batch-norm layers over the 70-bit word
    encoding, with an 11-way softmax head (0..10 vowels possible in a
    10-character word).  Returns an EstimatorSpec appropriate for `mode`.
    """
    with tf.device("/gpu:0"):

        # Input Layer: flatten each example to a 70-element vector.
        initial = tf.reshape(features["x"], [-1, 70])
        num_outputs = 11

        layer = initial

        k  = [64, 64, 64, 64]

        # Hidden dense layers (the comment said "convolutional" but these are
        # fully-connected -- see deep() above).
        layer = deep(mode, layer, k[0])
        layer = deep(mode, layer, k[1])
        layer = deep(mode, layer, k[2])
        layer = deep(mode, layer, k[3])


        # Logits Layer (there are 11 possible outputs)
        logits = tf.layers.dense(inputs=layer, units=num_outputs, name="last_layer")

        predictions = {
          # Generate predictions (for PREDICT and EVAL mode)
          "classes": tf.argmax(input=logits, axis=1),
          # Add `softmax_tensor` to the graph. It is used for PREDICT and by the
          # `logging_hook`.
          "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
        }

        if mode == tf.estimator.ModeKeys.PREDICT:
            return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

        # Calculate Loss (for both TRAIN and EVAL modes)
        onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=num_outputs)
        loss = tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels, logits=logits)

        tf.summary.scalar('loss', loss)
        tf.summary.merge_all()

        # Configure the Training Op (for TRAIN mode)
        if mode == tf.estimator.ModeKeys.TRAIN:
            optimizer = tf.train.AdamOptimizer()
            # tf.layers.batch_normalization registers its moving mean/variance
            # updates in the UPDATE_OPS collection; they must run alongside
            # each training step, otherwise evaluation uses stale statistics.
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                train_op = optimizer.minimize(
                    loss=loss,
                    global_step=tf.train.get_global_step())
            return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

        # Add evaluation metrics (for EVAL mode)
        eval_metric_ops = {
          "accuracy": tf.metrics.accuracy(
              labels=labels, predictions=predictions["classes"])}
        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=loss,
            eval_metric_ops=eval_metric_ops
        )

In [ ]:
# We'll use this to perform the training

def trainTheModel(train_data, train_labels, eval_data, eval_labels):
    """Train the estimator in 10 rounds of 500 steps, evaluating after each.

    Args:
        train_data, train_labels: numpy arrays used for training.
        eval_data, eval_labels: numpy arrays used for evaluation.
    """
    global idx
    # Create the Estimator
    session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
    session_config.gpu_options.per_process_gpu_memory_fraction = 0.75

    run_config = tf.estimator.RunConfig()
    run_config = run_config.replace(
        save_checkpoints_steps=100,
        session_config=session_config,
        keep_checkpoint_max=100)

    estimator = tf.estimator.Estimator(
        model_fn=model_fn, model_dir=model_dir, config=run_config)

    # Training input: shuffle and repeat indefinitely; train() below bounds
    # each round with `steps`.
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data},
        y=train_labels,
        batch_size=100,
        num_epochs=None,
        shuffle=True)

    # Evaluation input: a single, unshuffled pass over the whole eval set.
    # (Originally num_epochs=None / shuffle=True with steps=10 scored a
    # different random 1000-example subset each round, so the reported
    # metrics were noisy and not comparable across rounds.)
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data},
        y=eval_labels,
        batch_size=100,
        num_epochs=1,
        shuffle=False)

    summary_hook = tf.train.SummarySaverHook(
        100,
        output_dir=model_dir,
        scaffold=tf.train.Scaffold())

    for epoch in range(10):
        # train
        idx=0  # reset the layer-name counter before the graph is (re)built
        estimator.train(
            input_fn=train_input_fn,
            steps=500, hooks=[summary_hook])
        # The Estimator builds its own graph internally; this only clears the
        # default graph between calls.
        tf.reset_default_graph()
        idx=0
        # Evaluate over the full eval set (the input_fn stops after one epoch).
        estimator.evaluate(input_fn=eval_input_fn)

In [ ]:
# Start the training (10 rounds of 500 steps each, with an evaluation pass
# after every round -- see trainTheModel above).
trainTheModel(train_data, train_labels, eval_data, eval_labels)

In [ ]: