In [1]:
# Import the required packages
################
### PREAMBLE ###
################

from __future__ import division
import tensorflow as tf
import numpy as np
import tarfile
import os
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
%matplotlib inline
import time

In [2]:
###################
### IMPORT DATA ###
###################

def csv_to_numpy_array(filePath, delimiter):
    return np.genfromtxt(filePath, delimiter=delimiter, dtype=None)

def import_data():
    if "data" not in os.listdir(os.getcwd()):
        # Untar directory of data if we haven't already
        tarObject = tarfile.open("data.tar.gz")
        tarObject.extractall()
        tarObject.close()
        print("Extracted tar to current directory")
    else:
        # we've already extracted the files
        pass

    print("loading training data")
    trainX = csv_to_numpy_array("data/trainX.csv", delimiter="\t")
    trainY = csv_to_numpy_array("data/trainY.csv", delimiter="\t")
    print("loading test data")
    testX = csv_to_numpy_array("data/testX.csv", delimiter="\t")
    testY = csv_to_numpy_array("data/testY.csv", delimiter="\t")
    return trainX,trainY,testX,testY

trainX,trainY,testX,testY = import_data()
print("data loaded")


loading training data
loading test data
data loaded

In [3]:
#########################
### GLOBAL PARAMETERS ###
#########################

## DATA SET PARAMETERS
# Get our dimensions for our different variables and placeholders:
# numFeatures = the number of words extracted from each email
numFeatures = trainX.shape[1]
# numLabels = number of classes we are predicting (here just 2: Ham or Spam)
numLabels = trainY.shape[1]

## TRAINING SESSION PARAMETERS
# number of times we iterate through training data
# tensorboard shows that accuracy plateaus at ~25k epochs
numEpochs = 26000
# a smarter learning rate for gradientOptimizer
learningRate = tf.train.exponential_decay(learning_rate=0.001,
                                          global_step= 1,
                                          decay_steps=trainX.shape[0],
                                          decay_rate= 0.95,
                                          staircase=True)

In [4]:
####################
### PLACEHOLDERS ###
####################

# X = X-matrix / feature-matrix / data-matrix... It's a tensor to hold our email
# data. 'None' here means that we can hold any number of emails
X = tf.placeholder(tf.float32, [None, numFeatures])
# yGold = Y-matrix / label-matrix / labels... This will be our correct answers
# matrix. Every row has either [1,0] for SPAM or [0,1] for HAM. 'None' here 
# means that we can hold any number of emails
yGold = tf.placeholder(tf.float32, [None, numLabels])

In [6]:
#################
### VARIABLES ###
#################

# Values are randomly sampled from a Gaussian with a standard deviation of:
#     sqrt(6 / (numInputNodes + numOutputNodes + 1))

weights = tf.Variable(tf.random_normal([numFeatures,numLabels],
                                       mean=0,
                                       stddev=(np.sqrt(6/numFeatures+
                                                         numLabels+1)),
                                       name="weights"))

bias = tf.Variable(tf.random_normal([1,numLabels],
                                    mean=0,
                                    stddev=(np.sqrt(6/numFeatures+numLabels+1)),
                                    name="bias"))

In [7]:
######################
### PREDICTION OPS ###
######################

# INITIALIZE our weights and biases
init_OP = tf.initialize_all_variables()

# PREDICTION ALGORITHM i.e. FEEDFORWARD ALGORITHM
apply_weights_OP = tf.matmul(X, weights, name="apply_weights")
add_bias_OP = tf.add(apply_weights_OP, bias, name="add_bias") 
activation_OP = tf.nn.sigmoid(add_bias_OP, name="activation")


################################################################
# With hidden layer

#apply_weights_OP_h = tf.matmul(X, weights_h, name="apply_weights_h")
#add_bias_OP_h = tf.add(apply_weights_OP_h, bias_h, name="add_bias_h") 
#activation_OP_h = tf.nn.sigmoid(add_bias_OP_h, name="activation_h")
##activation_OP_h = tf.transpose(activation_OP_h)
#weights = tf.transpose(weights)
#apply_weights_OP = tf.matmul(activation_OP_h,weights, name="apply_weights")
#add_bias_OP = tf.add(apply_weights_OP, bias, name="add_bias") 
#activation_OP = tf.nn.sigmoid(add_bias_OP, name="activation")
#activation_OP = tf.transpose(activation_OP)
################################################################

In [8]:
#####################
### EVALUATION OP ###
#####################

# COST FUNCTION i.e. MEAN SQUARED ERROR
cost_OP = tf.nn.l2_loss(activation_OP-yGold, name="squared_error_cost")

In [9]:
#######################
### OPTIMIZATION OP ###
#######################

# OPTIMIZATION ALGORITHM i.e. GRADIENT DESCENT
training_OP = tf.train.GradientDescentOptimizer(learningRate).minimize(cost_OP)

In [10]:
###########################
### GRAPH LIVE UPDATING ###
###########################

epoch_values=[]
accuracy_values=[]
cost_values=[]
# Turn on interactive plotting
plt.ion()
# Create the main, super plot
fig = plt.figure()
# Create two subplots on their own axes and give titles
ax1 = plt.subplot("211")
ax1.set_title("TRAINING ACCURACY", fontsize=18)
ax2 = plt.subplot("212")
ax2.set_title("TRAINING COST", fontsize=18)
plt.tight_layout()

#####################
### RUN THE GRAPH ###
#####################

# Create a tensorflow session
sess = tf.Session()

# Initialize all tensorflow variables
sess.run(init_OP)

## Ops for vizualization
# argmax(activation_OP, 1) gives the label our model thought was most likely
# argmax(yGold, 1) is the correct label
correct_predictions_OP = tf.equal(tf.argmax(activation_OP,1),tf.argmax(yGold,1))
# False is 0 and True is 1, what was our average?
accuracy_OP = tf.reduce_mean(tf.cast(correct_predictions_OP, "float"))
# Summary op for regression output
activation_summary_OP = tf.histogram_summary("output", activation_OP)
# Summary op for accuracy
accuracy_summary_OP = tf.scalar_summary("accuracy", accuracy_OP)
# Summary op for cost
cost_summary_OP = tf.scalar_summary("cost", cost_OP)
# Summary ops to check how variables (W, b) are updating after each iteration
weightSummary = tf.histogram_summary("weights", weights.eval(session=sess))
biasSummary = tf.histogram_summary("biases", bias.eval(session=sess))
# Merge all summaries
all_summary_OPS = tf.merge_all_summaries()
# Summary writer
writer = tf.train.SummaryWriter("summary_logs", sess.graph)

# Initialize reporting variables
cost = 0
diff = 1

# Training epochs
for i in range(numEpochs):
    if i > 1 and diff < .0001:
        print("change in cost %g; convergence."%diff)
        break
    else:
        # Run training step
        step = sess.run(training_OP, feed_dict={X: trainX, yGold: trainY})
        # Report occasional stats
        if i % 1000 == 0:
            # Add epoch to epoch_values
            epoch_values.append(i)
            # Generate accuracy stats on test data
            summary_results, train_accuracy, newCost = sess.run(
                [all_summary_OPS, accuracy_OP, cost_OP], 
                feed_dict={X: trainX, yGold: trainY}
            )
            # Add accuracy to live graphing variable
            accuracy_values.append(train_accuracy)
            # Add cost to live graphing variable
            cost_values.append(newCost)
            # Write summary stats to writer
            writer.add_summary(summary_results, i)
            # Re-assign values for variables
            diff = abs(newCost - cost)
            cost = newCost

            #generate print statements
            print("step %d, training accuracy %g"%(i, train_accuracy))
            print("step %d, cost %g"%(i, newCost))
            print("step %d, change in cost %g"%(i, diff))

            # Plot progress to our two subplots
            accuracyLine, = ax1.plot(epoch_values, accuracy_values)
            costLine, = ax2.plot(epoch_values, cost_values)
            fig.canvas.draw()
            time.sleep(1)


# How well do we perform on held-out test data?
print("final accuracy on test set: %s" %str(sess.run(accuracy_OP, 
                                                     feed_dict={X: testX, 
                                                                yGold: testY})))


step 0, training accuracy 0.474292
step 0, cost 283.743
step 0, change in cost 283.743
step 1000, training accuracy 0.657922
step 1000, cost 223.803
step 1000, change in cost 59.9399
step 2000, training accuracy 0.70829
step 2000, cost 215.404
step 2000, change in cost 8.39948
step 3000, training accuracy 0.766002
step 3000, cost 207.595
step 3000, change in cost 7.8091
step 4000, training accuracy 0.804827
step 4000, cost 200.334
step 4000, change in cost 7.26073
step 5000, training accuracy 0.852046
step 5000, cost 193.578
step 5000, change in cost 6.75641
step 6000, training accuracy 0.869885
step 6000, cost 187.283
step 6000, change in cost 6.29494
step 7000, training accuracy 0.886674
step 7000, cost 181.409
step 7000, change in cost 5.87367
step 8000, training accuracy 0.901364
step 8000, cost 175.92
step 8000, change in cost 5.48911
step 9000, training accuracy 0.913956
step 9000, cost 170.782
step 9000, change in cost 5.13783
step 10000, training accuracy 0.921301
step 10000, cost 165.966
step 10000, change in cost 4.81645
step 11000, training accuracy 0.930745
step 11000, cost 161.444
step 11000, change in cost 4.52203
step 12000, training accuracy 0.930745
step 12000, cost 157.191
step 12000, change in cost 4.25224
step 13000, training accuracy 0.93809
step 13000, cost 153.186
step 13000, change in cost 4.00493
step 14000, training accuracy 0.941238
step 14000, cost 149.409
step 14000, change in cost 3.77773
step 15000, training accuracy 0.942288
step 15000, cost 145.84
step 15000, change in cost 3.56914
step 16000, training accuracy 0.945435
step 16000, cost 142.462
step 16000, change in cost 3.37744
step 17000, training accuracy 0.945435
step 17000, cost 139.261
step 17000, change in cost 3.20087
step 18000, training accuracy 0.945435
step 18000, cost 136.223
step 18000, change in cost 3.03796
step 19000, training accuracy 0.947534
step 19000, cost 133.336
step 19000, change in cost 2.88747
step 20000, training accuracy 0.948583
step 20000, cost 130.588
step 20000, change in cost 2.74823
step 21000, training accuracy 0.950682
step 21000, cost 127.969
step 21000, change in cost 2.61896
step 22000, training accuracy 0.951731
step 22000, cost 125.47
step 22000, change in cost 2.49895
step 23000, training accuracy 0.95383
step 23000, cost 123.082
step 23000, change in cost 2.38723
step 24000, training accuracy 0.955929
step 24000, cost 120.799
step 24000, change in cost 2.28301
step 25000, training accuracy 0.955929
step 25000, cost 118.614
step 25000, change in cost 2.18559
final accuracy on test set: 0.933333

In [10]:
##############################
### SAVE TRAINED VARIABLES ###
##############################

# Create Saver
saver = tf.train.Saver()
# Save variables to .ckpt file
# saver.save(sess, "trained_variables.ckpt")

In [14]:
############################
### MAKE NEW PREDICTIONS ###
############################

# Close tensorflow session
sess.close()

Answers

Part A

The best test accuracy obtained was 93.3% using a learning rate of 0.001 over 26,000 iterations (epoch). Please see above code for accuracy and cost plots.

Part B

Please see Part B code