Train Model with XLA_CPU (and CPU*)

\* Some operations do not have an XLA_CPU implementation, so those operations still have to be placed on the regular CPU device.


In [ ]:
import tensorflow as tf

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for the 'retina' figure format.
%config InlineBackend.figure_format = 'retina'

# Set TensorFlow's logging verbosity to INFO.
tf.logging.set_verbosity(tf.logging.INFO)

Reset TensorFlow Graph

Useful in Jupyter notebooks: without a reset, re-running cells keeps adding operations to the same default graph.


In [ ]:
# Replace the current default graph with a fresh, empty one before any ops are defined.
tf.reset_default_graph()

Create TensorFlow Session


In [ ]:
# Session configuration: log the device that each op is placed on.
config = tf.ConfigProto(log_device_placement=True)

# Set the global JIT level of the graph optimizer options to ON_1.
jit_level = tf.OptimizerOptions.ON_1
config.graph_options.optimizer_options.global_jit_level = jit_level

print(config)

# One session, created with the configuration above, is shared by all later cells.
sess = tf.Session(config=config)
print(sess)

Generate Model Version (current timestamp)


In [ ]:
import time
from datetime import datetime

# Model version: whole seconds since the Unix epoch.
# time.time() is used rather than datetime.now().strftime("%s") because "%s" is
# not one of the strftime directives Python documents; it only works where the
# platform C library happens to support it.
version = int(time.time())

Generate Synthetic Model Training and Test/Validation Data


In [ ]:
# Number of synthetic samples to generate for the training set.
num_samples = 100000

In [ ]:
import numpy as np
import pylab

# Training inputs: num_samples random values from np.random.rand, stored as float32.
x_train = np.random.rand(num_samples).astype(np.float32)
print(x_train)

# Zero-mean Gaussian noise (standard deviation 0.01), one value per training sample.
noise = np.random.normal(loc=0.0, scale=0.01, size=x_train.shape[0])

# Targets lie on the line y = 0.1 * x + 0.3, perturbed by the noise above.
y_train = x_train * 0.1 + 0.3 + noise
print(y_train)

# Scatter plot of the training set.
pylab.plot(x_train, y_train, '.')

In [ ]:
# Test inputs: as many random float32 values as there are training samples.
x_test = np.random.rand(len(x_train)).astype(np.float32)
print(x_test)

# Zero-mean Gaussian noise (standard deviation 0.01), one value per test sample.
noise = np.random.normal(loc=0.0, scale=.01, size=x_test.shape[0])

# Same underlying line as the training targets: y = 0.1 * x + 0.3, plus noise.
y_test = x_test * 0.1 + 0.3 + noise
print(y_test)

# Scatter plot of the test set.
pylab.plot(x_test, y_test, '.')

In [ ]:
# The two trainable scalars (shape=[]) are created under the regular CPU device
# scope; the notebook introduction notes that some ops have no XLA_CPU equivalent.
with tf.device("/cpu:0"):
    # No initializer is passed, so tf.get_variable uses its default initializer.
    W = tf.get_variable(shape=[], name='weights')
    print(W)

    b = tf.get_variable(shape=[], name='bias')
    print(b)

# The input placeholder and the forward pass are created under the XLA_CPU device scope.
with tf.device("/device:XLA_CPU:0"):            
    # 1-D float32 input of unspecified length, supplied through feed_dict at run time.
    x_observed = tf.placeholder(shape=[None], 
                                dtype=tf.float32, 
                                name='x_observed')
    print(x_observed)

    # Linear model: prediction = W * x + b.
    y_pred = W * x_observed + b
    print(y_pred)

In [ ]:
# Learning rate passed to the gradient-descent optimizer below.
learning_rate = 0.025

with tf.device("/device:XLA_CPU:0"):
    # 1-D float32 targets of unspecified length, supplied through feed_dict at run time.
    y_observed = tf.placeholder(shape=[None], dtype=tf.float32, name='y_observed')
    print(y_observed)

    # Loss: mean of the squared differences between predictions and targets.
    loss_op = tf.reduce_mean(tf.square(y_pred - y_observed))
    # NOTE: despite its name, optimizer_op holds the GradientDescentOptimizer object;
    # the value returned by minimize() (train_op) is what the training loop runs.
    optimizer_op = tf.train.GradientDescentOptimizer(learning_rate)
    train_op = optimizer_op.minimize(loss_op)  

    print("Loss Scalar: ", loss_op)
    print("Optimizer Op: ", optimizer_op)
    print("Train Op: ", train_op)

Randomly Initialize Variables (Weights and Bias)

The goal is to learn more accurate Weights and Bias during training.


In [ ]:
with tf.device("/device:XLA_CPU:0"):
    # Op that initializes all global variables; it is only defined here, not run.
    init_op = tf.global_variables_initializer()
    print(init_op)

In [ ]:
# Run the initializer, then read back and print the starting values of W and b.
sess.run(init_op)
print("Initial random W: %f" % sess.run(W))
print("Initial random b: %f" % sess.run(b))

View Loss of Pre-Training, Initial Random Variables

We want this loss value to be close to 0, but with the initial random variables it is still relatively far away. This is why we train!


In [ ]:
def test(x, y):
    """Evaluate the loss for inputs ``x`` against targets ``y``.

    ``x`` is fed to the ``x_observed`` placeholder and ``y`` to ``y_observed``;
    the value of ``loss_op`` computed by the shared session ``sess`` is returned.
    """
    feed = {x_observed: x, y_observed: y}
    return sess.run(loss_op, feed_dict=feed)

In [ ]:
# Loss on the training data before any training step has been run.
test(x_train, y_train)

Setup Loss Summary Operations for Tensorboard


In [ ]:
# Scalar summary that records the value of loss_op under the tag 'loss'.
loss_summary_scalar_op = tf.summary.scalar('loss', loss_op)
# Single op combining the summaries defined in the graph; fetched on every training step.
loss_summary_merge_all_op = tf.summary.merge_all()

In [ ]:
# Training and test summaries go to separate directories, both keyed by the
# model version; each writer is also given the default graph.
train_summary_writer = tf.summary.FileWriter(
    logdir='/root/tensorboard/linear/xla_cpu/%s/train' % version,
    graph=tf.get_default_graph())

test_summary_writer = tf.summary.FileWriter(
    logdir='/root/tensorboard/linear/xla_cpu/%s/test' % version,
    graph=tf.get_default_graph())

Train Model


In [ ]:
%%time

from tensorflow.python.client import timeline

with tf.device("/device:XLA_CPU:0"):
    run_metadata = tf.RunMetadata()
    max_steps = 401
    for step in range(max_steps):
        # Every step: evaluate the loss on the test set first, then run one
        # training step on the training set, keeping the summary from each run.
        test_summary_log, _ = sess.run(
            [loss_summary_merge_all_op, loss_op],
            feed_dict={x_observed: x_test, y_observed: y_test})

        if step == max_steps - 1:
            # Final step only: run the training step with full tracing enabled
            # and dump the collected step stats as a Chrome trace file.
            train_summary_log, _ = sess.run(
                [loss_summary_merge_all_op, train_op],
                feed_dict={x_observed: x_train, y_observed: y_train},
                options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
                run_metadata=run_metadata)

            trace = timeline.Timeline(step_stats=run_metadata.step_stats)
            with open('timeline-xla-cpu.json', 'w') as trace_file:
                trace_file.write(trace.generate_chrome_trace_format(show_memory=True))
        else:
            train_summary_log, _ = sess.run(
                [loss_summary_merge_all_op, train_op],
                feed_dict={x_observed: x_train, y_observed: y_train})

        # Every 10th step: print the current variables and write both summaries.
        if step % 10 == 0:
            print(step, sess.run([W, b]))
            for writer, summary_log in ((train_summary_writer, train_summary_log),
                                        (test_summary_writer, test_summary_log)):
                writer.add_summary(summary_log, step)
                writer.flush()

In [ ]:
# Model predictions for the training inputs, using the current W and b.
predicted = sess.run(y_pred,
                     feed_dict={x_observed: x_train,
                                y_observed: y_train})

# Overlay the predictions on the noisy training targets.
pylab.plot(x_train, y_train, '.', label="target")
pylab.plot(x_train, predicted, ".", label="predicted")
pylab.legend()
pylab.ylim(0, 1.0)

View Loss Summaries in Tensorboard

Navigate to the Scalars and Graphs tabs at this URL:

http://[ip-address]:6006

Save Graph For Optimization

We will use this later.


In [ ]:
import os
import shutil

optimize_me_parent_path = '/root/models/optimize_me/linear/xla_cpu'

saver = tf.train.Saver()

# Start from an empty export directory. shutil.rmtree replaces the previous
# os.system('rm -rf ...') call: no shell is spawned, and ignore_errors=True
# keeps the "ignore a missing directory" behaviour of `rm -rf`.
shutil.rmtree(optimize_me_parent_path, ignore_errors=True)
os.makedirs(optimize_me_parent_path)

# Write the session's GraphDef in binary form (as_text=False).
unoptimized_model_graph_path = '%s/unoptimized_xla_cpu.pb' % optimize_me_parent_path
tf.train.write_graph(sess.graph_def,
                     '.',
                     unoptimized_model_graph_path,
                     as_text=False)
print(unoptimized_model_graph_path)

# Save the current variable values as a checkpoint in the same directory.
model_checkpoint_path = '%s/model.ckpt' % optimize_me_parent_path
saver.save(sess,
           save_path=model_checkpoint_path)
print(model_checkpoint_path)

In [ ]:
# Show the export directory and list the entries it now contains.
print(optimize_me_parent_path)
os.listdir(optimize_me_parent_path)

In [ ]:
# Close the session; the remaining cells only work with the files saved above.
sess.close()

Show Graph


In [ ]:
%%bash

# Run the summarize_graph tool on the unoptimized graph file saved by this notebook.
summarize_graph --in_graph=/root/models/optimize_me/linear/xla_cpu/unoptimized_xla_cpu.pb

In [ ]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import re
from google.protobuf import text_format
from tensorflow.core.framework import graph_pb2

def convert_graph_to_dot(input_graph, output_dot, is_input_graph_binary):
    """Write a Graphviz DOT description of a serialized GraphDef.

    Args:
        input_graph: Path of the GraphDef file to read.
        output_dot: Path of the DOT file to write.
        is_input_graph_binary: If true, the file is parsed as a binary
            protobuf; otherwise it is merged as protobuf text format.

    Each graph node becomes a DOT node labelled with its op, and each of its
    inputs becomes an edge from the input node to that node.
    """
    graph_def = graph_pb2.GraphDef()
    with open(input_graph, "rb") as graph_file:
        serialized = graph_file.read()
    if is_input_graph_binary:
        graph_def.ParseFromString(serialized)
    else:
        text_format.Merge(serialized, graph_def)

    with open(output_dot, "wt") as dot_file:
        print("digraph graphname {", file=dot_file)
        for node in graph_def.node:
            print('  "%s" [label="%s"];' % (node.name, node.op), file=dot_file)
            for input_ref in node.input:
                # Keep only the part before the first ":" and drop a single
                # leading "^" so the edge starts at the bare node name.
                producer = input_ref.split(":")[0]
                if producer.startswith("^"):
                    producer = producer[1:]
                print('  "%s" -> "%s";' % (producer, node.name), file=dot_file)
        print("}", file=dot_file)
        print("Created dot file '%s' for graph '%s'." % (output_dot, input_graph))

In [ ]:
# Convert the binary graph file saved by this notebook into a DOT file.
input_graph='/root/models/optimize_me/linear/xla_cpu/unoptimized_xla_cpu.pb'
output_dot='/root/notebooks/unoptimized_xla_cpu.dot'
convert_graph_to_dot(input_graph=input_graph, output_dot=output_dot, is_input_graph_binary=True)

In [ ]:
%%bash

# Render the DOT file as a PNG; anything dot writes to stdout is redirected to /tmp/a.out.
dot -T png /root/notebooks/unoptimized_xla_cpu.dot \
    -o /root/notebooks/unoptimized_xla_cpu.png > /tmp/a.out

In [ ]:
from IPython.display import Image

# Display the rendered graph image in the notebook at 1024x768.
Image('/root/notebooks/unoptimized_xla_cpu.png', width=1024, height=768)

View XLA JIT Visualizations

Run the next cell and click on the hlo_graph_*.png files in the left-navigation.


In [ ]:
%%bash

# Render the HLO graph DOT files numbered 1, 10, 50 and 75 under /tmp as PNGs
# in /root/notebooks; all output, including error messages, is discarded.
for graph_id in 1 10 50 75; do
    dot -T png /tmp/hlo_graph_${graph_id}.*.dot -o /root/notebooks/hlo_graph_${graph_id}.png &>/dev/null
done

In [ ]: