In [1]:
#Based on https://github.com/aymericdamien/TensorFlow-Examples/blob/master/notebooks/2_BasicModels/linear_regression.ipynb
import tensorflow as tf
import numpy
import csv
import datetime
rng = numpy.random

In [2]:
# Parameters
learning_rate = 0.003
training_epochs = 1000
display_step = 50
#We turned this off before because it appeared not to be necessary, but it turns out the cost
#can bounce back up if normalization is off.
DO_NORMALIZE = True

In [3]:
#Training data
#(the first column of solves.csv is the difficulty; the remaining columns are features)
data = numpy.genfromtxt("solves.csv", skip_header=1, delimiter=",")
train_difficulty = data[:, 0]
train_features = data[:, 1:]

In [4]:
#number of training samples
n_samples = train_difficulty.shape[0]

#number of features
feature_length = train_features.shape[1]

In [5]:
#Normalize all of the columns to be between -0.5 and 0.5 with a smooth distribution
if DO_NORMALIZE:
    #TODO: we need to save the maxVal so other consumers of the model can normalize appropriately
    #TODO: figure out a better way to normalize these; right now many columns squash toward -0.5 since their
    #distributions are often very left-skewed (one alternative is sketched after this cell)
    print("Normalizing")
    for colNum in xrange(0, feature_length):
        col = train_features[:,colNum]
        maxVal = max(col)
        if maxVal == 0:
            maxVal = 1
        for index in xrange(0, len(col)):
            col[index] = (col[index] - (maxVal / 2)) / maxVal
else:
    print("Skipping normalizing")


Normalizing
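
One way to address both TODOs above is to standardize each column to zero mean and unit variance instead of scaling by the max, and to keep the per-column statistics so new samples can be transformed the same way. This is only a sketch under those assumptions; the names standardize_columns, col_means, and col_stds are made up here and nothing else in this notebook uses them.

In [ ]:
#Sketch: column-wise standardization that also returns the statistics needed to
#transform new samples consistently. Not wired into the rest of the notebook.
def standardize_columns(features):
    col_means = features.mean(axis=0)
    col_stds = features.std(axis=0)
    col_stds[col_stds == 0] = 1  #avoid dividing by zero for constant columns
    return (features - col_means) / col_stds, col_means, col_stds

#train_features, col_means, col_stds = standardize_columns(train_features)
#new_features = (new_features - col_means) / col_stds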

In [6]:
# tf Graph Input
Difficulty = tf.placeholder("float", shape=[None])
Features = tf.placeholder("float", shape=[None, feature_length])

# Set model weights
W = tf.Variable(tf.random_normal([feature_length, 1], stddev=0.25), name="weight")
b = tf.Variable(rng.randn(), name="bias")

In [7]:
# Construct a linear model

#Got this calculation from http://stackoverflow.com/questions/33698510/use-attribute-and-target-matrices-for-tensorflow-linear-regression-python
pred = tf.add(tf.matmul(Features, W), b)
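
For intuition about the shapes: train_features is (n_samples, feature_length), W is (feature_length, 1), so the matmul yields one predicted difficulty per row, and the scalar bias b broadcasts across them. A throwaway numpy version of the same calculation (example_W and example_b are invented here, not the trained values):

In [ ]:
#Sketch: the prediction above in plain numpy, only to make the shapes explicit.
example_W = numpy.zeros((feature_length, 1))
example_b = 0.0
example_pred = numpy.dot(train_features, example_W) + example_b
print(example_pred.shape)  #(n_samples, 1): one prediction per training row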

In [8]:
#TODO: calculate R2 and output it over time (one approach is sketched after this cell)
# Sum of squared errors over the training set
with tf.name_scope("cost") as scope:
    #Squeeze pred from shape (n, 1) down to (n,) so it lines up with Difficulty;
    #otherwise the subtraction broadcasts to an (n, n) matrix and inflates the cost.
    cost = tf.reduce_sum(tf.square(Difficulty - tf.squeeze(pred, squeeze_dims=[1])))
    cost_summ = tf.scalar_summary("cost", cost)
# Gradient descent
with tf.name_scope("optimize") as scope:
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
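
The R2 TODO above could be handled with the same tensors: R^2 = 1 - SS_res / SS_tot, where SS_res is the cost defined above and SS_tot is the total sum of squares of Difficulty around its mean. A sketch, not run here; the summary name "r_squared" is my own choice:

In [ ]:
#Sketch: R^2 computed from the existing placeholders, logged alongside the cost.
with tf.name_scope("r_squared") as scope:
    ss_tot = tf.reduce_sum(tf.square(Difficulty - tf.reduce_mean(Difficulty)))
    r_squared = 1.0 - cost / ss_tot
    r2_summ = tf.scalar_summary("r_squared", r_squared)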

In [9]:
#Summary ops to collect data
#The use of tensorboard is adapted from the example on https://www.tensorflow.org/versions/r0.7/how_tos/summaries_and_tensorboard/index.html
w_hist = tf.histogram_summary("weights", W)
b_hist = tf.histogram_summary("biases", b)
difficulty_hist = tf.histogram_summary("difficulty", Difficulty)

In [ ]:
# Initializing the variables
init = tf.initialize_all_variables()

In [ ]:
# Launch the graph

#TODO: do STOCHASTIC gradient descent (or the Adagrad optimizer?)
#TODO: consider doing L2 regularization
#TODO: consider doing dropout
#(one way to combine the first two is sketched after this cell)
with tf.Session() as sess:

    merged = tf.merge_all_summaries()
    runname = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    writer = tf.train.SummaryWriter("tmp/linear_logs/" + runname, sess.graph_def)

    sess.run(init)

    # Fit all training data
    for epoch in range(training_epochs):
        for (features, difficulty) in zip(train_features, train_difficulty):
            sess.run(optimizer, feed_dict={Features: [features], Difficulty: [difficulty]})

        #Display logs per epoch step
        if (epoch+1) % display_step == 0:
            result = sess.run([merged,cost], feed_dict={Features: train_features, Difficulty: train_difficulty})
            summary_str = result[0]
            c = result[1]
            writer.add_summary(summary_str, epoch)
            print "Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(c), \
                "W=", sess.run(W), "b=", sess.run(b)

    print "Optimization Finished!"
    training_cost = sess.run(cost, feed_dict={Features: train_features, Difficulty: train_difficulty})
    print "Training cost=", training_cost, "W=", sess.run(W), "b=", sess.run(b), '\n'

In [ ]:
#TODO: output the model with bias, cost, and weights all zipped up with their names (sketched below)
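
A sketch of that export, assuming the header row of solves.csv (skipped by genfromtxt above) names the difficulty column first and then the features in the same order as train_features; the output filename model_weights.csv is my own choice, and the sess.run / training_cost lookups would need to happen inside the training session above (or after restoring the variables):

In [ ]:
#Sketch: pair each learned weight with its column name from the solves.csv header and
#write them, plus the bias and final training cost, to a CSV for other consumers.
import csv

with open("solves.csv") as f:
    header = next(csv.reader(f))
feature_names = header[1:]  #drop the difficulty column

with open("model_weights.csv", "w") as out:
    out_writer = csv.writer(out)
    out_writer.writerow(["name", "value"])
    out_writer.writerow(["bias", sess.run(b)])
    out_writer.writerow(["training_cost", training_cost])
    for name, weight in zip(feature_names, sess.run(W).flatten()):
        out_writer.writerow([name, weight])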