In [ ]:
#
# Practice problem!  yay
#
# Our mission, should we choose to accept it, is to see if we can use supervised machine learning to train
# a deep neural network to count the number of vowels in words it has never seen before, without ever telling
# the neural network which letters are vowels!
#
# This is best done algorithmically with a few lines of code and no neural networks
# but that doesn't matter - it's a practice problem and anything goes :)  
# Also, we only count a,e,i,o,u as vowels (sorry about that y, nothing personal).
#
# The data set consists of just over 10k words in random order.  Each word has a maximum of 10 characters.
# The first row is the header, and there are five columns (the first is the row index):
#
#   1st column:         - the row index
#   2nd column:  a2i    - the word, in english
#   3rd column:  vowels - the number of vowels in the word
#   4th column:  binary - the word, encoded into 70 binary digits, left-padded with 0, with 7 bits per character.
#                         For example, the word 'vital' is 
#                         encoded as 0000000000000000000000000000000000011101101101001111010011000011101100
#


# Special thanks to the UCI Machine Learning Repository who provided the data set that this one was based on
# For similar datasets, please see https://archive.ics.uci.edu/ml/datasets/bag+of+words
# Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.

In [ ]:
# Generate the dataset
import pandas as pd

# Read in the original collection of words
df = pd.read_csv('vocab.nips.txt')

# Randomize the list
df = df.sample(frac=1).reset_index(drop=True)

# Keep the words that have 10 or fewer characters
df = df[df.a2i.str.len() <= 10]

# Count the number of vowels in each word and store them in the 'vowels' column.
def countVowels(row):
    """Return the number of a/e/i/o/u characters in the word of this row."""
    word = row['a2i']
    return sum(word.count(v) for v in 'aeiou')
df['vowels'] = df.apply(countVowels, axis=1)

# Create a column to hold the binary representation of each word: 7 bits per
# character, left-padded with '0' to 70 digits (10 chars max * 7 bits each).
# '07b' zero-pads every character to exactly 7 bits; a plain 'b' format would
# drop leading zeros for code points below 64 and misalign the encoding.
df['binary'] = df.apply(lambda row: ''.join(format(ord(c), '07b') for c in row['a2i']).rjust(70, '0'), axis=1)

# Save the new dataset as a file
df.to_csv('vocab.vowels.csv')

In [ ]:
# This cell loads the data set and splits it into numpy arrays of labels and examples,
# for both training and evaluation.

# The train_data and train_labels are used for training the neural net.  The train_words is just there for convenience.

import pandas as pd
import numpy as np

df = pd.read_csv('vocab.vowels.csv', names=['id', 'word', 'vowels', 'binary'], index_col='id', skiprows=1)
rows = df.shape[0]
train_rows = int(rows * 0.7)  # 70/30 train/eval split (rows were shuffled at generation time)

df_train, df_eval = df.iloc[:train_rows, :], df.iloc[train_rows:, :]

# Expand each 70-character '0'/'1' string into a list of 70 ints.
train_list = [[int(c) for c in bits] for bits in df_train['binary']]
eval_list  = [[int(c) for c in bits] for bits in df_eval['binary']]

train_data   = np.array(train_list).astype(np.float32)
eval_data    = np.array(eval_list).astype(np.float32)

# Select columns by name (not positional .iloc) so this keeps working
# even if the column order ever changes.
train_words  = df_train['word'].values
eval_words   = df_eval['word'].values

train_labels = df_train['vowels'].values.astype(np.float32)
eval_labels  = df_eval['vowels'].values.astype(np.float32)

print("Total rows:      " + str(rows))
print("Training rows:   " + str(train_rows))
print("Evaluation rows: " + str(rows - train_rows))

In [ ]:
# Let's just print some example data: the word, its vowel count, and its
# 70-element binary encoding.
sample_index = 1131
for value in (train_words[sample_index], train_labels[sample_index], train_data[sample_index]):
    print(value)

In [ ]:
import os
import tensorflow as tf
import numpy as np
from tensorflow.contrib.tensorboard.plugins import projector

# Log INFO-level messages (loss values, checkpoint saves, etc.) during training.
tf.logging.set_verbosity(tf.logging.INFO)

# Fraction of units randomly dropped in each deep() layer while training.
dropout_rate = 0.25



def convolution(mode, input_layer, filters, kernel_size, strides=1, padding="VALID", normalize=False):
    """
    Builds a 1-D convolutional layer (ReLU activation) over `input_layer`,
    optionally followed by batch normalization.

    Note: `mode` is accepted for signature consistency with deep() but is
    not currently used here.
    """
    conv = tf.layers.conv1d(
        inputs=input_layer,
        filters=filters,
        kernel_size=kernel_size,
        strides=strides,
        padding=padding,
        activation=tf.nn.relu)
    if not normalize:
        return conv
    # NOTE(review): training=True is hard-coded, so batch statistics are used
    # even at eval/predict time — confirm this is intentional.
    return tf.layers.batch_normalization(conv, training=True)


def deep(mode, layer, units, reshape=None):
    """
    Builds one fully-connected block: optional reshape, then dropout
    (active only in TRAIN mode), a ReLU dense layer, and batch normalization.

    Args:
        mode:    a tf.estimator.ModeKeys value; controls whether dropout is applied.
        layer:   input tensor.
        units:   number of units in the dense layer.
        reshape: optional shape list; if given, the input is reshaped first.

    Returns:
        The output tensor of the batch-normalization layer.
    """
    if reshape is not None:  # identity check per PEP 8 (was `reshape != None`)
        layer = tf.reshape(layer, reshape)
    layer = tf.layers.dropout(inputs=layer, rate=dropout_rate, training=mode == tf.estimator.ModeKeys.TRAIN)
    layer = tf.layers.dense(inputs=layer, units=units, activation=tf.nn.relu)
    # NOTE(review): training=True is hard-coded, so batch statistics are used
    # even during EVAL/PREDICT — confirm this is intentional.
    layer = tf.layers.batch_normalization(layer, training=True)
    return layer

In [ ]:
def model_fn(features, labels, mode):
    """Creates the neural network model.

    Estimator model_fn: a 3-layer fully-connected classifier mapping a
    70-bit word encoding to one of 11 classes (0-10 vowels).

    Args:
        features: dict with key "x" holding the batch of 70-element encodings.
        labels:   vowel counts; cast to int32 and one-hot encoded for the loss.
        mode:     tf.estimator.ModeKeys value (TRAIN / EVAL / PREDICT).

    Returns:
        tf.estimator.EstimatorSpec for the given mode.
    """

    with tf.device("/gpu:0"):

        # Input Layer: flatten each example to a [batch, 70] tensor
        input = tf.reshape(features["x"], [-1, 70])
        num_outputs = 11  # classes 0..10 (words have at most 10 characters)

        # Send the input through fully connected layers (FC layers)  #deeplearning
        layer1  = deep(mode, input, 128)
        layer2  = deep(mode, layer1, 64)
        layer3  = deep(mode, layer2, 32)
        
        
        # Classification Layer (there are 11 possible outputs)
        logits = tf.layers.dense(inputs=layer3, units=num_outputs, name="output_layer")
        
        # Predictions: class id + probabilities, plus the intermediate layer
        # activations so they can be exported for the TensorBoard projector.
        output = {
          "classes"      : tf.argmax(input=logits, axis=1),
          "probabilities": tf.nn.softmax(logits, name="softmax_tensor"),
          "layer1"       : layer1, 
          "layer2"       : layer2,
          "layer3"       : layer3
        }
        

        loss=None
        train_op = None
        eval_metric_ops = None
        
        # i.e. for both TRAIN and EVAL modes
        if mode != tf.estimator.ModeKeys.PREDICT:

            onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=num_outputs)

            # Calculate the loss (using cross-entropy)
            loss = tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels, logits=logits)

            # Minimize the loss
            # NOTE(review): the batch_normalization layers in deep() register
            # update ops under tf.GraphKeys.UPDATE_OPS that are NOT attached to
            # this train_op, so moving averages are never updated. This appears
            # harmless only because batch norm runs with training=True in every
            # mode here — confirm before changing either behavior.
            train_op = tf.train.GradientDescentOptimizer(0.01).minimize(
                loss=loss, global_step=tf.train.get_global_step())
            
            # Gather some metrics
            tf.summary.scalar('loss', loss)
            
            eval_metric_ops = {
              "accuracy": tf.metrics.accuracy(labels=labels, predictions=output["classes"])
            }

        tf.summary.merge_all()
        
        return tf.estimator.EstimatorSpec(mode=mode, 
                                          loss=loss,
                                          train_op=train_op,
                                          eval_metric_ops=eval_metric_ops,
                                          predictions=output)

In [ ]:
model_dir = "demo10"  # checkpoints and summaries are written here


# Three input functions feeding the estimator from the in-memory numpy arrays:
#  - "training":   shuffled, endless (num_epochs=None), batches of 64
#  - "evaluation": shuffled, endless; consumed for a fixed number of steps
#  - "prediction": one ordered pass over the whole eval set
input_fn = {
        # Train the model
    "training" : tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data},
        y=train_labels,
        batch_size=64,
        num_epochs=None,
        shuffle=True),
    
    "evaluation" : tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data},
        y=eval_labels,
        batch_size=1024,
        num_epochs=None,
        shuffle=True),

    "prediction" : tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data},
        y=eval_labels,
        batch_size=len(eval_data),
        num_epochs=1,
        shuffle=False)

}

def trainTheModel(input_fn):
    """
    Trains the model, alternating training with evaluation passes.

    Expects an input_fn dict containing tensorflow input functions with the names
    "training", "evaluation", "prediction"
    """
    # Create the Estimator
    # allow_soft_placement lets ops without a GPU kernel fall back to CPU,
    # since model_fn pins the graph to /gpu:0.
    session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
    session_config.gpu_options.per_process_gpu_memory_fraction = 0.75
    
    run_config = tf.estimator.RunConfig()
    run_config = run_config.replace(
        save_checkpoints_steps=200, 
        session_config=session_config,
        keep_checkpoint_max=100)

    estimator = tf.estimator.Estimator(
        model_fn=model_fn, model_dir=model_dir, config=run_config)

    
    # 5 rounds of (1000 training steps + one evaluation batch) = 5000 steps total
    for _ in range(5):
        estimator.train(input_fn=input_fn["training"], steps=1000)
        estimator.evaluate(input_fn=input_fn["evaluation"], steps=1)
        
    
trainTheModel(input_fn)
print("Done")

In [ ]:
# Strategy:
# Use the estimator to run predictions with `samples` elements from the set.
# Get back an iterator of the predictions
# Extract prediction tensors (the embeddings of various layers) as a bunch of numpy arrays
# Create a new session (and new graph).
# Create tf.Variable objects in the new session, initialized to the numpy arrays
# Add the variables to a projector config
# Run the session (try tf.global_variables_initializer() or a variant)
# Save the graph and a checkpoint
# View them in TensorBoard


# Number of evaluation examples to export for the projector.
samples = 1500

# Directory where the projector outputs (labels.tsv + checkpoint) are stored.
path = os.path.join(model_dir, "outputs")
if not os.path.exists(path):
    os.makedirs(path)
    
# Write the projector metadata file: one row per exported example.
with open(os.path.join(path, 'labels.tsv'),'w') as f:
    f.write("Index\tLabel\tVowels\tLetters\n")
    for index,label in enumerate(eval_labels):
        if index >= samples:
            break
        # The Label column is the word with its (float) vowel count appended,
        # e.g. "vital3.0", so each point is identifiable in the projector.
        f.write("%d\t%s\t%d\t%d\n" % (index,eval_words[index]+str(label),int(label),len(eval_words[index])))
        

def save_outputs(input_fn, model_fn, model_dir, layer_names, samples):
    """
    Runs prediction on the trained model and saves the activations of the
    requested layers as checkpointed variables for the TensorBoard projector.

    Args:
        input_fn:    prediction input function (one unshuffled pass).
        model_fn:    the model function used to build the estimator.
        model_dir:   directory holding the trained model's checkpoints.
        layer_names: keys of the prediction dict to export (e.g. "layer1").
        samples:     maximum number of examples to export.

    NOTE(review): the checkpoint location comes from the notebook-level
    `path` variable (not a parameter) — keep it in sync with labels.tsv.
    """

    # Create the estimator
    estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir=model_dir)

    # Run a prediction (lazy iterator; examples are produced as we consume it)
    prediction = estimator.predict(input_fn=input_fn)
    

    # Loop through the results and create a dict of the example outputs from each desired layer
    i = 0
    layers = {}
    for name in layer_names:
        layers[name] = []
    for p in prediction:
        i = i + 1
        if i > samples:
            break;
        for name in layer_names:
            layers[name].append(p[name])
    
    # Convert each layer's collected activations to a numpy array, store it in
    # a same-named variable inside a fresh graph, and checkpoint that graph.
    with tf.Graph().as_default() as g:
        with tf.Session() as sess:
            tfvars = []
            for name in layer_names:
                v = tf.get_variable(name, np.shape(layers[name]), initializer=tf.zeros_initializer)
                va = tf.assign(v,  np.array(layers[name]), name=name)
                tfvars.append(va)
            print(tfvars[0])
            # Running the assign ops materializes the activation values; the
            # Saver then writes them to outputs.ckpt for TensorBoard to load.
            sess.run(tfvars)
            tf.train.Saver().save(sess, os.path.join(path, "outputs.ckpt"))

    


# Export the three hidden-layer embeddings for the first `samples` eval examples.
save_outputs(input_fn["prediction"], model_fn, model_dir, ["layer1", "layer2", "layer3"], samples)
print("Done")

In [ ]:
#
#        # Convolutional layers
#        conv0 = convolution(mode, input, 52, 7, strides=7, padding="VALID")
#        flat  = tf.reshape(conv0, [-1, 520])   # Reshape into a flat array