In [1]:
# Import the required packages
################
### PREAMBLE ###
################

from __future__ import division
import tensorflow as tf
import numpy as np
import tarfile
import os
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
%matplotlib inline
import time

In [2]:
###################
### IMPORT DATA ###
###################

def csv_to_numpy_array(filePath, delimiter):
    return np.genfromtxt(filePath, delimiter=delimiter, dtype=None)

def import_data():
    if "data" not in os.listdir(os.getcwd()):
        # Untar directory of data if we haven't already
        tarObject = tarfile.open("data.tar.gz")
        tarObject.extractall()
        tarObject.close()
        print("Extracted tar to current directory")
    else:
        # we've already extracted the files
        pass

    print("loading training data")
    trainX = csv_to_numpy_array("data/trainX.csv", delimiter="\t")
    trainY = csv_to_numpy_array("data/trainY.csv", delimiter="\t")
    print("loading test data")
    testX = csv_to_numpy_array("data/testX.csv", delimiter="\t")
    testY = csv_to_numpy_array("data/testY.csv", delimiter="\t")
    return trainX,trainY,testX,testY

trainX,trainY,testX,testY = import_data()
print("data loaded")


loading training data
loading test data
data loaded

In [3]:
#########################
### GLOBAL PARAMETERS ###
#########################

## DATA SET PARAMETERS
# Get our dimensions for our different variables and placeholders:
# numFeatures = the number of words extracted from each email
numFeatures = trainX.shape[1]
# numLabels = number of classes we are predicting (here just 2: Ham or Spam)
numLabels = trainY.shape[1]

## TRAINING SESSION PARAMETERS
# number of times we iterate through training data
# tensorboard shows that accuracy plateaus at ~25k epochs
numEpochs = 26000
# a smarter learning rate for gradientOptimizer
learningRate = tf.train.exponential_decay(learning_rate=0.001,
                                          global_step= 1,
                                          decay_steps=trainX.shape[0],
                                          decay_rate= 0.95,
                                          staircase=True)

In [4]:
####################
### PLACEHOLDERS ###
####################

# X = X-matrix / feature-matrix / data-matrix... It's a tensor to hold our email
# data. 'None' here means that we can hold any number of emails
X = tf.placeholder(tf.float32, [None, numFeatures])
# yGold = Y-matrix / label-matrix / labels... This will be our correct answers
# matrix. Every row has either [1,0] for SPAM or [0,1] for HAM. 'None' here 
# means that we can hold any number of emails
yGold = tf.placeholder(tf.float32, [None, numLabels])

In [5]:
#################
### VARIABLES ###
#################

# Values are randomly sampled from a Gaussian with a standard deviation of:
#     sqrt(6 / (numInputNodes + numOutputNodes + 1))
numHidden = 3

weights = tf.Variable(tf.random_normal([numHidden,numLabels],
                                       mean=0,
                                       stddev=(np.sqrt(6/numFeatures+
                                                         numLabels+1)),
                                       name="weights"))

weights_h = tf.Variable(tf.random_normal([numFeatures,numHidden],
                                       mean=0,
                                       stddev=(np.sqrt(6/numFeatures+
                                                         numLabels+1)),
                                       name="weights_h"))

bias = tf.Variable(tf.random_normal([1,numLabels],
                                    mean=0,
                                    stddev=(np.sqrt(6/numFeatures+numLabels+1)),
                                    name="bias"))
bias_h = tf.Variable(tf.random_normal([1,numHidden],
                                    mean=0,
                                    stddev=(np.sqrt(6/numFeatures+numLabels+1)),
                                    name="bias_h"))

In [6]:
######################
### PREDICTION OPS ###
######################

# INITIALIZE our weights and biases
init_OP = tf.initialize_all_variables()

# PREDICTION ALGORITHM i.e. FEEDFORWARD ALGORITHM
# With hidden layer

apply_weights_OP_h = tf.matmul(X, weights_h, name="apply_weights_h")
add_bias_OP_h = tf.add(apply_weights_OP_h, bias_h, name="add_bias_h") 
activation_OP_h = tf.nn.sigmoid(add_bias_OP_h, name="activation_h")

apply_weights_OP = tf.matmul(activation_OP_h,weights, name="apply_weights")
add_bias_OP = tf.add(apply_weights_OP, bias, name="add_bias") 
activation_OP = tf.nn.sigmoid(add_bias_OP, name="activation")

################################################################

In [7]:
#####################
### EVALUATION OP ###
#####################

# COST FUNCTION i.e. MEAN SQUARED ERROR
cost_OP = tf.nn.l2_loss(activation_OP-yGold, name="squared_error_cost")

In [8]:
#######################
### OPTIMIZATION OP ###
#######################

# OPTIMIZATION ALGORITHM i.e. GRADIENT DESCENT
training_OP = tf.train.GradientDescentOptimizer(learningRate).minimize(cost_OP)

In [9]:
###########################
### GRAPH LIVE UPDATING ###
###########################

epoch_values=[]
accuracy_values=[]
cost_values=[]
# Turn on interactive plotting
plt.ion()
# Create the main, super plot
fig = plt.figure()
# Create two subplots on their own axes and give titles
ax1 = plt.subplot("211")
ax1.set_title("TRAINING ACCURACY", fontsize=18)
ax2 = plt.subplot("212")
ax2.set_title("TRAINING COST", fontsize=18)
plt.tight_layout()

#####################
### RUN THE GRAPH ###
#####################

# Create a tensorflow session
sess = tf.Session()

# Initialize all tensorflow variables
sess.run(init_OP)

## Ops for vizualization
# argmax(activation_OP, 1) gives the label our model thought was most likely
# argmax(yGold, 1) is the correct label
correct_predictions_OP = tf.equal(tf.argmax(activation_OP,1),tf.argmax(yGold,1))
# False is 0 and True is 1, what was our average?
accuracy_OP = tf.reduce_mean(tf.cast(correct_predictions_OP, "float"))
# Summary op for regression output
activation_summary_OP = tf.histogram_summary("output", activation_OP)
# Summary op for accuracy
accuracy_summary_OP = tf.scalar_summary("accuracy", accuracy_OP)
# Summary op for cost
cost_summary_OP = tf.scalar_summary("cost", cost_OP)
# Summary ops to check how variables (W, b) are updating after each iteration
weightSummary = tf.histogram_summary("weights", weights.eval(session=sess))
biasSummary = tf.histogram_summary("biases", bias.eval(session=sess))
# Merge all summaries
all_summary_OPS = tf.merge_all_summaries()
# Summary writer
writer = tf.train.SummaryWriter("summary_logs", sess.graph)

# Initialize reporting variables
cost = 0
diff = 1

# Training epochs
for i in range(numEpochs):
    if i > 1 and diff < .0001:
        print("change in cost %g; convergence."%diff)
        break
    else:
        # Run training step
        step = sess.run(training_OP, feed_dict={X: trainX, yGold: trainY})
        # Report occasional stats
        if i % 1000 == 0:
            # Add epoch to epoch_values
            epoch_values.append(i)
            # Generate accuracy stats on test data
            summary_results, train_accuracy, newCost = sess.run(
                [all_summary_OPS, accuracy_OP, cost_OP], 
                feed_dict={X: trainX, yGold: trainY}
            )
            # Add accuracy to live graphing variable
            accuracy_values.append(train_accuracy)
            # Add cost to live graphing variable
            cost_values.append(newCost)
            # Write summary stats to writer
            writer.add_summary(summary_results, i)
            # Re-assign values for variables
            diff = abs(newCost - cost)
            cost = newCost

            #generate print statements
            print("step %d, training accuracy %g"%(i, train_accuracy))
            print("step %d, cost %g"%(i, newCost))
            print("step %d, change in cost %g"%(i, diff))

            # Plot progress to our two subplots
            accuracyLine, = ax1.plot(epoch_values, accuracy_values)
            costLine, = ax2.plot(epoch_values, cost_values)
            fig.canvas.draw()
            time.sleep(1)


# How well do we perform on held-out test data?
print("final accuracy on test set: %s" %str(sess.run(accuracy_OP, 
                                                     feed_dict={X: testX, 
                                                                yGold: testY})))


step 0, training accuracy 0.533054
step 0, cost 444.446
step 0, change in cost 444.446
step 1000, training accuracy 0.593914
step 1000, cost 227.103
step 1000, change in cost 217.342
step 2000, training accuracy 0.775446
step 2000, cost 202.435
step 2000, change in cost 24.6689
step 3000, training accuracy 0.875131
step 3000, cost 157.509
step 3000, change in cost 44.9258
step 4000, training accuracy 0.9234
step 4000, cost 112.177
step 4000, change in cost 45.3318
step 5000, training accuracy 0.942288
step 5000, cost 80.4938
step 5000, change in cost 31.6831
step 6000, training accuracy 0.959077
step 6000, cost 60.8295
step 6000, change in cost 19.6644
step 7000, training accuracy 0.96852
step 7000, cost 48.5204
step 7000, change in cost 12.3091
step 8000, training accuracy 0.96957
step 8000, cost 40.3206
step 8000, change in cost 8.19979
step 9000, training accuracy 0.96957
step 9000, cost 34.4768
step 9000, change in cost 5.84379
step 10000, training accuracy 0.975866
step 10000, cost 30.0802
step 10000, change in cost 4.39662
step 11000, training accuracy 0.977964
step 11000, cost 26.6411
step 11000, change in cost 3.43908
step 12000, training accuracy 0.982162
step 12000, cost 23.8766
step 12000, change in cost 2.76451
step 13000, training accuracy 0.98531
step 13000, cost 21.6092
step 13000, change in cost 2.26739
step 14000, training accuracy 0.98531
step 14000, cost 19.719
step 14000, change in cost 1.89025
step 15000, training accuracy 0.98531
step 15000, cost 18.1203
step 15000, change in cost 1.59864
step 16000, training accuracy 0.98531
step 16000, cost 16.7504
step 16000, change in cost 1.3699
step 17000, training accuracy 0.987408
step 17000, cost 15.5624
step 17000, change in cost 1.18806
step 18000, training accuracy 0.991605
step 18000, cost 14.5207
step 18000, change in cost 1.04167
step 19000, training accuracy 0.991605
step 19000, cost 13.5985
step 19000, change in cost 0.922222
step 20000, training accuracy 0.991605
step 20000, cost 12.7749
step 20000, change in cost 0.823626
step 21000, training accuracy 0.991605
step 21000, cost 12.0337
step 21000, change in cost 0.741185
step 22000, training accuracy 0.991605
step 22000, cost 11.3621
step 22000, change in cost 0.671559
step 23000, training accuracy 0.993704
step 23000, cost 10.75
step 23000, change in cost 0.612108
step 24000, training accuracy 0.993704
step 24000, cost 10.1892
step 24000, change in cost 0.560822
step 25000, training accuracy 0.994753
step 25000, cost 9.67307
step 25000, change in cost 0.516106
final accuracy on test set: 0.980952

In [10]:
##############################
### SAVE TRAINED VARIABLES ###
##############################

# Create Saver
saver = tf.train.Saver()
# Save variables to .ckpt file
# saver.save(sess, "trained_variables.ckpt")

In [14]:
############################
### MAKE NEW PREDICTIONS ###
############################

# Close tensorflow session
sess.close()

Answers

Part A

Please see Part A code

Part B

With 3 hidden layers and a learning rate of 0.001, the algorithm is able to reach a test accuracy of 98% (Similar results with 1 & 2 layers). This is an improvement of 5% over Part A where no hidden layers were used.