Chapter 9 – Up and running with TensorFlow

This notebook contains all the sample code and solutions to the exercises in Chapter 9.

Setup

First, let's make sure this notebook works well in both Python 2 and 3, import a few common modules, ensure Matplotlib plots figures inline, and prepare a function to save the figures:


In [10]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

import tensorflow as tf

import pandas as pd

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

# to make this notebook's output stable across runs
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "tensorflow"

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

Linear Regression

Using the Normal Equation
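
As a reminder, the Normal Equation gives the closed-form Linear Regression solution $\hat{\theta} = (\mathbf{X}^T \cdot \mathbf{X})^{-1} \cdot \mathbf{X}^T \cdot \mathbf{y}$, where $\mathbf{X}$ is the feature matrix (with a bias column of 1s) and $\mathbf{y}$ is the target vector. This is exactly what the TensorFlow and NumPy cells below compute.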


In [32]:
beetles_full = pd.read_csv('beetleTrainingData.csv')

# Features: keep only the columns whose names do not contain '_RA',
# then drop the two accuracy columns
# beetles = beetles_full.drop(['accuracy_num', 'accuracy_txt'], axis=1).as_matrix()
beetles_test_train = (beetles_full[beetles_full.columns[~beetles_full.columns.str.contains('_RA')]]
                      .drop(['accuracy_num', 'accuracy_txt'], axis=1)
                      .as_matrix())

# Binary target: 1 if the response was 'correct', 0 otherwise
# beetles_target = beetles_full['accuracy'].as_matrix()
beetles_target_test_train = beetles_full.accuracy_txt.apply(lambda a: int(a == 'correct')).as_matrix()
# beetles_target = beetles_full.accuracy_num.as_matrix()

# Reduce the features to 50 principal components
pca = PCA(n_components=50)
beetles_pca_test_train = pca.fit_transform(beetles_test_train)


# Split into training and test sets
# (beetles_pca / beetles_target are the training portion, beetles_test / beetles_target_test the test portion)
beetles_pca, beetles_test, beetles_target, beetles_target_test = train_test_split(
    beetles_pca_test_train, beetles_target_test_train, test_size=0.33, random_state=42)


m, n = beetles_pca.shape
beetles_plus_bias = np.c_[np.ones((m, 1)), beetles_pca]

tm, tn = beetles_test.shape
beetles_test_plus_bias = np.c_[np.ones((tm, 1)), beetles_test]


X = beetles_plus_bias
np.linalg.inv(X.T.dot(X))


Out[32]:
array([[  2.73225839e-02,  -9.75049189e-03,  -1.76759532e-02, ...,
          1.90029998e-02,   1.37146714e-01,   4.48831126e-03],
       [ -9.75049189e-03,   7.80041777e-02,   6.32785852e-03, ...,
         -3.77739233e-02,  -4.76679112e-02,  -3.09131276e-05],
       [ -1.76759532e-02,   6.32785852e-03,   1.51287190e-01, ...,
         -2.27059523e-02,  -1.26350919e-01,  -8.04739991e-03],
       ..., 
       [  1.90029998e-02,  -3.77739233e-02,  -2.27059523e-02, ...,
          2.42151152e+01,  -4.91738006e-01,   5.01229961e-01],
       [  1.37146714e-01,  -4.76679112e-02,  -1.26350919e-01, ...,
         -4.91738006e-01,   8.52069674e+00,  -2.05002872e+00],
       [  4.48831126e-03,  -3.09131276e-05,  -8.04739991e-03, ...,
          5.01229961e-01,  -2.05002872e+00,   8.38411381e+00]])

In [25]:
pca.components_


Out[25]:
array([[ -8.05509303e-05,  -8.01759307e-05,  -7.28947850e-05, ...,
         -8.10785890e-05,  -8.95595370e-05,  -9.27203224e-05],
       [ -1.13510464e-04,  -1.13016407e-04,   2.91662205e-05, ...,
         -1.14944374e-04,  -1.26516951e-04,  -1.62868900e-04],
       [ -1.84068568e-04,  -1.73791028e-04,  -3.41208366e-05, ...,
         -1.92779062e-04,  -4.31715415e-04,  -3.95870043e-03],
       ..., 
       [ -1.65747329e-03,   7.20935499e-04,   2.33905365e-04, ...,
          4.78326140e-03,   4.77019624e-03,   1.97231709e-04],
       [  1.17639004e-04,  -3.52419551e-03,   8.25446305e-04, ...,
          6.62346674e-04,  -2.48533337e-03,   1.63687901e-04],
       [ -1.82786651e-03,  -3.86013516e-04,   6.95545187e-03, ...,
         -3.74914642e-03,   1.15167454e-03,   2.92411527e-04]])

In [26]:
reset_graph()

X = tf.constant(beetles_plus_bias, dtype=tf.float32, name="X")
y = tf.constant(beetles_target.reshape(-1, 1), dtype=tf.float32, name="y")
XT = tf.transpose(X)
theta = tf.matmul(tf.matmul(tf.matrix_inverse(tf.matmul(XT, X)), XT), y)

with tf.Session() as sess:
    theta_value = theta.eval()

In [27]:
theta_value


Out[27]:
array([[  8.10518146e-01],
       [ -2.68344522e-01],
       [ -4.24354792e-01],
       [ -6.15563965e+00],
       [  7.33613647e+02],
       [  1.96266174e-02],
       [  6.88719034e-01],
       [  8.20484996e-01],
       [ -4.41275597e-01],
       [  3.35843163e+01],
       [  2.34610538e+01],
       [  2.07214975e+00],
       [  6.95458651e-01],
       [  1.59564900e+00],
       [  4.29012567e-01],
       [ -6.76961517e+00],
       [ -8.00227404e-01],
       [ -4.55432832e-01],
       [ -2.65023828e-01],
       [  1.19006777e+00],
       [ -1.10140121e+00],
       [  5.94193280e-01],
       [  7.17164218e-01],
       [  1.81637824e-01],
       [ -3.66501737e+00],
       [  6.09341741e-01],
       [ -4.80867922e-01],
       [ -2.31503531e-01],
       [ -2.97694951e-02],
       [ -7.86289787e+00],
       [ -2.80813384e+00],
       [  7.02050328e-02],
       [ -9.26078796e-01],
       [ -2.44383383e+00],
       [ -2.84349608e+00],
       [ -5.04284000e+00],
       [ -1.24304140e+00],
       [ -1.21595860e-02],
       [ -3.02489090e+00],
       [  2.57522297e+00],
       [ -6.05385303e-02],
       [  5.02345037e+00],
       [ -2.80797929e-01],
       [  1.65174246e+00],
       [ -9.80728269e-01],
       [ -4.39341277e-01],
       [  2.02667713e+00],
       [  4.11897945e+00],
       [  5.66135526e-01],
       [  1.32151222e+00],
       [  9.61878061e-01]], dtype=float32)

In [28]:
# Note: this does not compute y_predict = X.dot(theta_best) element by element;
# it returns the SUM of all predictions as a single scalar.
# Per-example predictions would be tf.matmul(X, theta_value).
y_predict = tf.reduce_sum(tf.multiply(tf.transpose(theta_value), X))
y_predict


Out[28]:
<tf.Tensor 'Sum:0' shape=() dtype=float32>

In [29]:
with tf.Session() as sess:
    print(y_predict.eval())


1127.0

In [34]:
# Same computation on the test set: the sum of all test-set predictions (a scalar),
# not per-example predictions
y_predict = tf.reduce_sum(tf.multiply(tf.transpose(theta_value), beetles_test_plus_bias))
with tf.Session() as sess:
    print(y_predict.eval())
y_predict


2064.71
Out[34]:
<tf.Tensor 'Sum_2:0' shape=() dtype=float32>

Compare with pure NumPy


In [5]:
X = beetles_plus_bias
y = beetles_target.reshape(-1, 1)
theta_numpy = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y)

print(theta_numpy)


[[ 0.4224816 ]
 [-0.11661451]
 [-0.15353952]
 [-0.22230531]
 [-0.207905  ]
 [-0.30958844]
 [-0.18509385]
 [-0.20636342]
 [-0.20782069]
 [-0.05001243]
 [-0.32845625]
 [-0.45019224]
 [-0.22267283]
 [-0.2812736 ]
 [ 0.33541143]
 [-0.33246795]
 [-0.36227557]
 [-0.2695672 ]
 [-0.28696093]
 [-0.36008964]
 [-0.42528477]
 [ 0.33754373]
 [ 0.46243078]
 [-0.25660587]
 [-0.31856083]
 [-0.27500852]
 [-0.14036395]
 [-0.23514211]
 [-0.23724976]
 [-0.21975815]
 [-0.30981518]
 [ 0.4407201 ]
 [-0.31920516]
 [-0.03547005]
 [-0.31143484]
 [-0.17538107]
 [ 0.58229002]
 [ 0.51678289]
 [-0.3813917 ]
 [-0.35417921]
 [-0.09257149]
 [-0.14863034]
 [-0.03998018]
 [-0.00176955]
 [-0.22652041]
 [ 0.19203066]
 [ 0.34934392]
 [-0.35938147]
 [-0.50875196]
 [-0.44930616]
 [-0.2512746 ]]

Compare with Scikit-Learn


In [6]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(beetles_pca, beetles_target.reshape(-1, 1))

print(np.r_[lin_reg.intercept_.reshape(-1, 1), lin_reg.coef_.T])


[[ 0.4224816 ]
 [-0.11661451]
 [-0.15353952]
 [-0.22230531]
 [-0.207905  ]
 [-0.30958844]
 [-0.18509385]
 [-0.20636342]
 [-0.20782069]
 [-0.05001243]
 [-0.32845625]
 [-0.45019224]
 [-0.22267283]
 [-0.2812736 ]
 [ 0.33541143]
 [-0.33246795]
 [-0.36227557]
 [-0.2695672 ]
 [-0.28696093]
 [-0.36008964]
 [-0.42528477]
 [ 0.33754373]
 [ 0.46243078]
 [-0.25660587]
 [-0.31856083]
 [-0.27500852]
 [-0.14036395]
 [-0.23514211]
 [-0.23724976]
 [-0.21975815]
 [-0.30981518]
 [ 0.4407201 ]
 [-0.31920516]
 [-0.03547005]
 [-0.31143484]
 [-0.17538107]
 [ 0.58229002]
 [ 0.51678289]
 [-0.3813917 ]
 [-0.35417921]
 [-0.09257149]
 [-0.14863034]
 [-0.03998018]
 [-0.00176955]
 [-0.22652041]
 [ 0.19203066]
 [ 0.34934392]
 [-0.35938147]
 [-0.50875196]
 [-0.44930616]
 [-0.2512746 ]]

Using Batch Gradient Descent

Gradient Descent requires scaling the feature vectors first. We could do this using TF, but let's just use Scikit-Learn for now.
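
For reference, here is a minimal sketch of how the same standardization could be done with TensorFlow ops instead of Scikit-Learn (illustrative only; the rest of the notebook uses the StandardScaler version below):


In [ ]:
# Illustrative sketch: standardize the PCA features with TensorFlow ops
data = tf.constant(beetles_pca, dtype=tf.float32)
mean, variance = tf.nn.moments(data, axes=[0])   # per-column mean and variance
scaled_data = (data - mean) / tf.sqrt(variance)

with tf.Session() as sess:
    scaled_beetles_tf = scaled_data.eval()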


In [35]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_beetles_data = scaler.fit_transform(beetles_pca)
scaled_beetles_data_plus_bias = np.c_[np.ones((m, 1)), scaled_beetles_data]

In [36]:
print(scaled_beetles_data_plus_bias.mean(axis=0))
print(scaled_beetles_data_plus_bias.mean(axis=1))
print(scaled_beetles_data_plus_bias.mean())
print(scaled_beetles_data_plus_bias.shape)


[  1.00000000e+00   1.44113467e-17   2.28102995e-17   4.98732999e-18
   2.83359195e-16   8.08013168e-17  -1.85536561e-17  -1.69069830e-17
   1.23967019e-17   6.42202516e-17   1.07421437e-17  -1.12126217e-17
  -3.29308340e-17   5.78806258e-17  -1.67766159e-16  -1.71056877e-16
  -6.25683217e-18  -1.96326015e-17  -1.63931368e-17  -1.97482498e-17
   2.14793592e-17  -5.75520797e-17  -2.50720110e-17   1.39382403e-17
  -2.86728764e-17  -1.80043270e-18  -3.49113100e-17  -3.48088036e-17
  -8.13348757e-18  -1.03623444e-16   6.87154195e-17   2.41494535e-17
  -1.89978504e-17  -2.49169372e-18   4.20900425e-17  -6.82239145e-17
   1.34717048e-17   4.23837627e-17   2.40035790e-18   1.11771388e-17
  -2.00544547e-17  -2.84355839e-17  -8.24387906e-18   1.22665977e-17
   9.27577670e-17  -1.09497848e-17  -3.20398169e-18  -4.08514237e-17
   1.95392944e-17   1.15372253e-17   4.21984627e-17]
[ 0.00944422 -0.05181852  0.01036125 ...,  0.01019833  0.00864407
 -0.00143986]
0.0196078431373
(2640, 51)

Manually computing the gradients
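
For the MSE cost function, the gradient vector with respect to $\theta$ is $\nabla_{\theta}\,\text{MSE}(\theta) = \frac{2}{m} \mathbf{X}^T \cdot (\mathbf{X} \cdot \theta - \mathbf{y})$, which is what the gradients = ... line in the next cell computes.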


In [37]:
reset_graph()

n_epochs = 1000
learning_rate = 0.01

X = tf.constant(scaled_beetles_data_plus_bias, dtype=tf.float32, name="X")
y = tf.constant(beetles_target.reshape(-1, 1), dtype=tf.float32, name="y")
theta = tf.Variable(tf.random_uniform([n + 1, 1], -1.0, 1.0, seed=42), name="theta")
y_pred = tf.matmul(X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")
gradients = 2/m * tf.matmul(tf.transpose(X), error)
training_op = tf.assign(theta, theta - learning_rate * gradients)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)

    for epoch in range(n_epochs):
        if epoch % 100 == 0:
            print("Epoch", epoch, "MSE =", mse.eval())
        sess.run(training_op)
    
    best_theta = theta.eval()


Epoch 0 MSE = 12.8535
Epoch 100 MSE = 0.579791
Epoch 200 MSE = 0.375034
Epoch 300 MSE = 0.32213
Epoch 400 MSE = 0.298218
Epoch 500 MSE = 0.286128
Epoch 600 MSE = 0.279253
Epoch 700 MSE = 0.274795
Epoch 800 MSE = 0.271541
Epoch 900 MSE = 0.268953

In [10]:
best_theta


Out[10]:
array([[  4.22480881e-01],
       [ -6.81840675e-03],
       [ -7.18841935e-03],
       [ -7.18382653e-03],
       [ -6.62305392e-03],
       [ -9.18873213e-03],
       [ -5.46728307e-03],
       [ -5.95652312e-03],
       [ -5.77153685e-03],
       [ -1.37990864e-03],
       [ -9.06051509e-03],
       [ -1.22608151e-02],
       [ -5.89096872e-03],
       [ -7.20357941e-03],
       [  8.57072230e-03],
       [ -8.15803651e-03],
       [ -8.87605641e-03],
       [ -6.58698799e-03],
       [ -6.99260877e-03],
       [ -8.76311678e-03],
       [ -1.02930712e-02],
       [  8.15941859e-03],
       [  1.11372005e-02],
       [ -6.16845815e-03],
       [ -7.52900541e-03],
       [ -6.45099906e-03],
       [ -3.28587741e-03],
       [ -5.49288420e-03],
       [ -5.50642004e-03],
       [ -5.01797581e-03],
       [ -7.06271548e-03],
       [  9.97629762e-03],
       [ -7.21516181e-03],
       [ -8.00648006e-04],
       [ -7.02090515e-03],
       [ -3.95306991e-03],
       [  1.31232254e-02],
       [  1.16427941e-02],
       [ -8.59191827e-03],
       [ -7.97636621e-03],
       [ -2.08461052e-03],
       [ -3.34648602e-03],
       [ -9.00009065e-04],
       [ -3.98133561e-05],
       [ -5.09485323e-03],
       [  4.31457534e-03],
       [  7.83401448e-03],
       [ -8.05191044e-03],
       [ -1.13943489e-02],
       [ -1.00133196e-02],
       [ -5.46636758e-03]], dtype=float32)

Using autodiff

Same as above except for the gradients = ... line:


In [38]:
reset_graph()

n_epochs = 1000
learning_rate = 0.01

X = tf.constant(scaled_beetles_data_plus_bias, dtype=tf.float32, name="X")
y = tf.constant(beetles_target.reshape(-1, 1), dtype=tf.float32, name="y")
theta = tf.Variable(tf.random_uniform([n + 1, 1], -1.0, 1.0, seed=42), name="theta")
y_pred = tf.matmul(X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")

In [39]:
gradients = tf.gradients(mse, [theta])[0]

In [40]:
training_op = tf.assign(theta, theta - learning_rate * gradients)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)

    for epoch in range(n_epochs):
        if epoch % 100 == 0:
            print("Epoch", epoch, "MSE =", mse.eval())
        sess.run(training_op)
    
    best_theta = theta.eval()

print("Best theta:")
print(best_theta)


Epoch 0 MSE = 12.8535
Epoch 100 MSE = 0.579791
Epoch 200 MSE = 0.375034
Epoch 300 MSE = 0.32213
Epoch 400 MSE = 0.298218
Epoch 500 MSE = 0.286128
Epoch 600 MSE = 0.279253
Epoch 700 MSE = 0.274795
Epoch 800 MSE = 0.271541
Epoch 900 MSE = 0.268953
Best theta:
[[  4.26893234e-01]
 [ -2.28237305e-02]
 [ -1.20465066e-02]
 [ -3.78935397e-01]
 [  3.96565974e-01]
 [  2.97544897e-01]
 [ -2.43776321e-01]
 [  6.18292503e-02]
 [  5.91007322e-02]
 [ -7.35064030e-01]
 [ -7.10461855e-01]
 [  1.35596134e-02]
 [  3.06796078e-02]
 [ -2.23657303e-02]
 [  2.99071218e-03]
 [  8.17752182e-02]
 [  7.50491070e-03]
 [  4.50154766e-02]
 [  1.41548859e-02]
 [ -6.25077039e-02]
 [  7.52414539e-02]
 [  9.49930921e-02]
 [  2.69836914e-02]
 [  2.43379492e-02]
 [ -1.47435442e-02]
 [ -4.31828480e-03]
 [ -2.09847029e-04]
 [  2.46132957e-03]
 [ -6.62515231e-04]
 [  3.91870784e-03]
 [  2.71130595e-02]
 [  4.51824479e-02]
 [  8.01797658e-02]
 [  8.75337124e-02]
 [  4.24106270e-01]
 [ -2.83008605e-01]
 [  4.84760195e-01]
 [  4.62454528e-01]
 [ -8.76409352e-01]
 [  6.55641317e-01]
 [ -5.45519665e-02]
 [ -4.06774849e-01]
 [ -4.15017843e-01]
 [  2.57657379e-01]
 [ -8.99346054e-01]
 [ -1.02125391e-01]
 [  4.14734542e-01]
 [  1.83808282e-01]
 [ -1.62522629e-01]
 [ -2.17793003e-01]
 [ -2.13521086e-02]]

In [ ]:

How could you find the partial derivatives of the following function with regard to a and b?


In [14]:
def my_func(a, b):
    z = 0
    for i in range(100):
        z = a * np.cos(z + i) + z * np.sin(b - i)
    return z

In [15]:
my_func(0.2, 0.3)


Out[15]:
-0.21253923284754914

In [16]:
reset_graph()

a = tf.Variable(0.2, name="a")
b = tf.Variable(0.3, name="b")
z = tf.constant(0.0, name="z0")
for i in range(100):
    z = a * tf.cos(z + i) + z * tf.sin(b - i)

grads = tf.gradients(z, [a, b])
init = tf.global_variables_initializer()

Let's compute the function at $a=0.2$ and $b=0.3$, and the partial derivatives at that point with regard to $a$ and to $b$:


In [17]:
with tf.Session() as sess:
    init.run()
    print(z.eval())
    print(sess.run(grads))


-0.212537
[-1.1388494, 0.19671395]
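
As a sanity check, these values could be approximated numerically with finite differences (illustrative, unexecuted cell):


In [ ]:
eps = 1e-6
print((my_func(0.2 + eps, 0.3) - my_func(0.2, 0.3)) / eps)  # should be close to df/da above
print((my_func(0.2, 0.3 + eps) - my_func(0.2, 0.3)) / eps)  # should be close to df/db above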

Using a GradientDescentOptimizer


In [18]:
reset_graph()

n_epochs = 1000
learning_rate = 0.01

X = tf.constant(scaled_beetles_data_plus_bias, dtype=tf.float32, name="X")
y = tf.constant(beetles_target.reshape(-1, 1), dtype=tf.float32, name="y")
theta = tf.Variable(tf.random_uniform([n + 1, 1], -1.0, 1.0, seed=42), name="theta")
y_pred = tf.matmul(X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")

In [19]:
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(mse)

In [20]:
init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)

    for epoch in range(n_epochs):
        if epoch % 100 == 0:
            print("Epoch", epoch, "MSE =", mse.eval())
        sess.run(training_op)
    
    best_theta = theta.eval()

print("Best theta:")
print(best_theta)


Epoch 0 MSE = 17.9485
Epoch 100 MSE = 0.552651
Epoch 200 MSE = 0.246694
Epoch 300 MSE = 0.241313
Epoch 400 MSE = 0.241218
Epoch 500 MSE = 0.241217
Epoch 600 MSE = 0.241217
Epoch 700 MSE = 0.241217
Epoch 800 MSE = 0.241217
Epoch 900 MSE = 0.241217
Best theta:
[[  4.22480881e-01]
 [ -6.81840722e-03]
 [ -7.18841935e-03]
 [ -7.18382653e-03]
 [ -6.62305346e-03]
 [ -9.18873306e-03]
 [ -5.46728261e-03]
 [ -5.95652265e-03]
 [ -5.77153638e-03]
 [ -1.37990911e-03]
 [ -9.06051416e-03]
 [ -1.22608142e-02]
 [ -5.89096872e-03]
 [ -7.20358035e-03]
 [  8.57072137e-03]
 [ -8.15803558e-03]
 [ -8.87605734e-03]
 [ -6.58698799e-03]
 [ -6.99260831e-03]
 [ -8.76312051e-03]
 [ -1.02930693e-02]
 [  8.15941766e-03]
 [  1.11372024e-02]
 [ -6.16845768e-03]
 [ -7.52900494e-03]
 [ -6.45099953e-03]
 [ -3.28587787e-03]
 [ -5.49288606e-03]
 [ -5.50642004e-03]
 [ -5.01797581e-03]
 [ -7.06271501e-03]
 [  9.97629669e-03]
 [ -7.21516134e-03]
 [ -8.00647656e-04]
 [ -7.02090422e-03]
 [ -3.95307038e-03]
 [  1.31232264e-02]
 [  1.16427941e-02]
 [ -8.59191734e-03]
 [ -7.97636621e-03]
 [ -2.08461098e-03]
 [ -3.34648578e-03]
 [ -9.00008832e-04]
 [ -3.98137163e-05]
 [ -5.09485276e-03]
 [  4.31457488e-03]
 [  7.83401355e-03]
 [ -8.05190951e-03]
 [ -1.13943489e-02]
 [ -1.00133205e-02]
 [ -5.46636852e-03]]

Using a momentum optimizer


In [21]:
reset_graph()

n_epochs = 1000
learning_rate = 0.01

X = tf.constant(scaled_beetles_data_plus_bias, dtype=tf.float32, name="X")
y = tf.constant(beetles_target.reshape(-1, 1), dtype=tf.float32, name="y")
theta = tf.Variable(tf.random_uniform([n + 1, 1], -1.0, 1.0, seed=42), name="theta")
y_pred = tf.matmul(X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")

In [22]:
optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                       momentum=0.9)

In [23]:
training_op = optimizer.minimize(mse)

init = tf.global_variables_initializer()

In [24]:
with tf.Session() as sess:
    sess.run(init)

    for epoch in range(n_epochs):
        sess.run(training_op)
    
    best_theta = theta.eval()

print("Best theta:")
print(best_theta)


Best theta:
[[  4.22481656e-01]
 [ -6.81839697e-03]
 [ -7.18840957e-03]
 [ -7.18381535e-03]
 [ -6.62306556e-03]
 [ -9.18875262e-03]
 [ -5.46729518e-03]
 [ -5.95651148e-03]
 [ -5.77154849e-03]
 [ -1.37990562e-03]
 [ -9.06049274e-03]
 [ -1.22608319e-02]
 [ -5.89095755e-03]
 [ -7.20356964e-03]
 [  8.57074745e-03]
 [ -8.15805607e-03]
 [ -8.87603499e-03]
 [ -6.58697449e-03]
 [ -6.99259760e-03]
 [ -8.76314007e-03]
 [ -1.02930907e-02]
 [  8.15943908e-03]
 [  1.11372229e-02]
 [ -6.16846932e-03]
 [ -7.52899423e-03]
 [ -6.45100977e-03]
 [ -3.28588253e-03]
 [ -5.49287582e-03]
 [ -5.50640933e-03]
 [ -5.01796464e-03]
 [ -7.06272572e-03]
 [  9.97631811e-03]
 [ -7.21515017e-03]
 [ -8.00645736e-04]
 [ -7.02091539e-03]
 [ -3.95306014e-03]
 [  1.31232049e-02]
 [  1.16427736e-02]
 [ -8.59189685e-03]
 [ -7.97638670e-03]
 [ -2.08461564e-03]
 [ -3.34648066e-03]
 [ -9.00008075e-04]
 [ -3.98120937e-05]
 [ -5.09484159e-03]
 [  4.31458512e-03]
 [  7.83399306e-03]
 [ -8.05193000e-03]
 [ -1.13943275e-02]
 [ -1.00133019e-02]
 [ -5.46635734e-03]]

Feeding data to the training algorithm

Placeholder nodes


In [25]:
reset_graph()

A = tf.placeholder(tf.float32, shape=(None, 3))
B = A + 5
with tf.Session() as sess:
    B_val_1 = B.eval(feed_dict={A: [[1, 2, 3]]})
    B_val_2 = B.eval(feed_dict={A: [[4, 5, 6], [7, 8, 9]]})

print(B_val_1)


[[ 6.  7.  8.]]

In [26]:
print(B_val_2)


[[  9.  10.  11.]
 [ 12.  13.  14.]]

Mini-batch Gradient Descent


In [27]:
n_epochs = 1000
learning_rate = 0.01

In [28]:
reset_graph()

X = tf.placeholder(tf.float32, shape=(None, n + 1), name="X")
y = tf.placeholder(tf.float32, shape=(None, 1), name="y")

In [29]:
theta = tf.Variable(tf.random_uniform([n + 1, 1], -1.0, 1.0, seed=42), name="theta")
y_pred = tf.matmul(X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(mse)

init = tf.global_variables_initializer()

In [30]:
n_epochs = 10

In [31]:
batch_size = 100
n_batches = int(np.ceil(m / batch_size))

In [32]:
def fetch_batch(epoch, batch_index, batch_size):
    np.random.seed(epoch * n_batches + batch_index)  # not shown in the book
    indices = np.random.randint(m, size=batch_size)  # not shown
    X_batch = scaled_beetles_data_plus_bias[indices] # not shown
    y_batch = beetles_target.reshape(-1, 1)[indices] # not shown
    return X_batch, y_batch

with tf.Session() as sess:
    sess.run(init)

    for epoch in range(n_epochs):
        for batch_index in range(n_batches):
            X_batch, y_batch = fetch_batch(epoch, batch_index, batch_size)
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})

    best_theta = theta.eval()

In [33]:
best_theta


Out[33]:
array([[  4.22476262e-01],
       [ -6.94966177e-03],
       [ -7.10052997e-03],
       [ -7.50745181e-03],
       [ -6.69049146e-03],
       [ -6.75913552e-03],
       [ -5.64861391e-03],
       [ -6.53533684e-03],
       [ -5.41561889e-03],
       [ -1.55830511e-03],
       [ -9.13777202e-03],
       [ -1.15519455e-02],
       [ -6.04805443e-03],
       [ -5.52704139e-03],
       [  5.76924672e-03],
       [ -8.04819260e-03],
       [ -7.98149873e-03],
       [ -8.18613265e-03],
       [ -6.24202983e-03],
       [ -7.18858466e-03],
       [ -1.06109409e-02],
       [  9.82293207e-03],
       [  1.09657319e-02],
       [ -6.04576617e-03],
       [ -7.25607015e-03],
       [ -6.23924984e-03],
       [ -3.88489431e-03],
       [ -5.51366899e-03],
       [ -6.03099214e-03],
       [ -5.72972791e-03],
       [ -9.98415519e-03],
       [  8.69318657e-03],
       [ -6.53691543e-03],
       [ -1.04342587e-03],
       [ -7.10827205e-03],
       [ -4.09286376e-03],
       [  1.31616425e-02],
       [  1.09604998e-02],
       [ -8.61782953e-03],
       [ -9.77288280e-03],
       [ -2.81576021e-03],
       [ -3.16454098e-03],
       [ -6.11963682e-04],
       [ -1.86083722e-04],
       [ -6.23905100e-03],
       [  3.79212224e-03],
       [  7.09700212e-03],
       [ -8.76564905e-03],
       [ -1.02087380e-02],
       [ -9.29476880e-03],
       [ -3.97285819e-03]], dtype=float32)

Saving and restoring a model


In [34]:
reset_graph()

n_epochs = 1000                                                                       # not shown in the book
learning_rate = 0.01                                                                  # not shown

X = tf.constant(scaled_beetles_data_plus_bias, dtype=tf.float32, name="X")            # not shown
y = tf.constant(beetles_target.reshape(-1, 1), dtype=tf.float32, name="y")            # not shown
theta = tf.Variable(tf.random_uniform([n + 1, 1], -1.0, 1.0, seed=42), name="theta")
y_pred = tf.matmul(X, theta, name="predictions")                                      # not shown
error = y_pred - y                                                                    # not shown
mse = tf.reduce_mean(tf.square(error), name="mse")                                    # not shown
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)            # not shown
training_op = optimizer.minimize(mse)                                                 # not shown

init = tf.global_variables_initializer()
saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(init)

    for epoch in range(n_epochs):
        if epoch % 100 == 0:
            print("Epoch", epoch, "MSE =", mse.eval())                                # not shown
            save_path = saver.save(sess, "/tmp/my_model.ckpt")
        sess.run(training_op)
    
    best_theta = theta.eval()
    save_path = saver.save(sess, "/tmp/my_model_final.ckpt")


Epoch 0 MSE = 17.9485
Epoch 100 MSE = 0.552651
Epoch 200 MSE = 0.246694
Epoch 300 MSE = 0.241313
Epoch 400 MSE = 0.241218
Epoch 500 MSE = 0.241217
Epoch 600 MSE = 0.241217
Epoch 700 MSE = 0.241217
Epoch 800 MSE = 0.241217
Epoch 900 MSE = 0.241217

In [35]:
best_theta


Out[35]:
array([[  4.22480881e-01],
       [ -6.81840722e-03],
       [ -7.18841935e-03],
       [ -7.18382653e-03],
       [ -6.62305346e-03],
       [ -9.18873306e-03],
       [ -5.46728261e-03],
       [ -5.95652265e-03],
       [ -5.77153638e-03],
       [ -1.37990911e-03],
       [ -9.06051416e-03],
       [ -1.22608142e-02],
       [ -5.89096872e-03],
       [ -7.20358035e-03],
       [  8.57072137e-03],
       [ -8.15803558e-03],
       [ -8.87605734e-03],
       [ -6.58698799e-03],
       [ -6.99260831e-03],
       [ -8.76312051e-03],
       [ -1.02930693e-02],
       [  8.15941766e-03],
       [  1.11372024e-02],
       [ -6.16845768e-03],
       [ -7.52900494e-03],
       [ -6.45099953e-03],
       [ -3.28587787e-03],
       [ -5.49288606e-03],
       [ -5.50642004e-03],
       [ -5.01797581e-03],
       [ -7.06271501e-03],
       [  9.97629669e-03],
       [ -7.21516134e-03],
       [ -8.00647656e-04],
       [ -7.02090422e-03],
       [ -3.95307038e-03],
       [  1.31232264e-02],
       [  1.16427941e-02],
       [ -8.59191734e-03],
       [ -7.97636621e-03],
       [ -2.08461098e-03],
       [ -3.34648578e-03],
       [ -9.00008832e-04],
       [ -3.98137163e-05],
       [ -5.09485276e-03],
       [  4.31457488e-03],
       [  7.83401355e-03],
       [ -8.05190951e-03],
       [ -1.13943489e-02],
       [ -1.00133205e-02],
       [ -5.46636852e-03]], dtype=float32)

In [36]:
with tf.Session() as sess:
    saver.restore(sess, "/tmp/my_model_final.ckpt")
    best_theta_restored = theta.eval() # not shown in the book


INFO:tensorflow:Restoring parameters from /tmp/my_model_final.ckpt

In [37]:
np.allclose(best_theta, best_theta_restored)


Out[37]:
True

If you want a saver that saves and restores theta under a different name, such as "weights":


In [38]:
saver = tf.train.Saver({"weights": theta})

By default the saver also saves the graph structure itself in a second file with the extension .meta. You can use the function tf.train.import_meta_graph() to restore the graph structure. This function loads the graph into the default graph and returns a Saver that can then be used to restore the graph state (i.e., the variable values):


In [39]:
reset_graph()
# notice that we start with an empty graph.

saver = tf.train.import_meta_graph("/tmp/my_model_final.ckpt.meta")  # this loads the graph structure
theta = tf.get_default_graph().get_tensor_by_name("theta:0") # not shown in the book

with tf.Session() as sess:
    saver.restore(sess, "/tmp/my_model_final.ckpt")  # this restores the graph's state
    best_theta_restored = theta.eval() # not shown in the book


INFO:tensorflow:Restoring parameters from /tmp/my_model_final.ckpt

In [40]:
np.allclose(best_theta, best_theta_restored)


Out[40]:
True

This means that you can import a pretrained model without having to have the corresponding Python code to build the graph. This is very handy when you keep tweaking and saving your model: you can load a previously saved model without having to search for the version of the code that built it.

Visualizing the graph

inside Jupyter


In [41]:
from IPython.display import clear_output, Image, display, HTML

def strip_consts(graph_def, max_const_size=32):
    """Strip large constant values from graph_def."""
    strip_def = tf.GraphDef()
    for n0 in graph_def.node:
        n = strip_def.node.add() 
        n.MergeFrom(n0)
        if n.op == 'Const':
            tensor = n.attr['value'].tensor
            size = len(tensor.tensor_content)
            if size > max_const_size:
                tensor.tensor_content = b"<stripped %d bytes>"%size
    return strip_def

def show_graph(graph_def, max_const_size=32):
    """Visualize TensorFlow graph."""
    if hasattr(graph_def, 'as_graph_def'):
        graph_def = graph_def.as_graph_def()
    strip_def = strip_consts(graph_def, max_const_size=max_const_size)
    code = """
        <script>
          function load() {{
            document.getElementById("{id}").pbtxt = {data};
          }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:600px">
          <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """.format(data=repr(str(strip_def)), id='graph'+str(np.random.rand()))

    iframe = """
        <iframe seamless style="width:1200px;height:620px;border:0" srcdoc="{}"></iframe>
    """.format(code.replace('"', '&quot;'))
    display(HTML(iframe))

In [42]:
show_graph(tf.get_default_graph())


Using TensorBoard


In [43]:
reset_graph()

from datetime import datetime

now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
root_logdir = "tf_logs"
logdir = "{}/run-{}/".format(root_logdir, now)

In [44]:
n_epochs = 1000
learning_rate = 0.01

X = tf.placeholder(tf.float32, shape=(None, n + 1), name="X")
y = tf.placeholder(tf.float32, shape=(None, 1), name="y")
theta = tf.Variable(tf.random_uniform([n + 1, 1], -1.0, 1.0, seed=42), name="theta")
y_pred = tf.matmul(X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(mse)

init = tf.global_variables_initializer()

In [45]:
mse_summary = tf.summary.scalar('MSE', mse)
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

In [46]:
n_epochs = 10
batch_size = 100
n_batches = int(np.ceil(m / batch_size))

In [47]:
with tf.Session() as sess:                                                        # not shown in the book
    sess.run(init)                                                                # not shown

    for epoch in range(n_epochs):                                                 # not shown
        for batch_index in range(n_batches):
            X_batch, y_batch = fetch_batch(epoch, batch_index, batch_size)
            if batch_index % 10 == 0:
                summary_str = mse_summary.eval(feed_dict={X: X_batch, y: y_batch})
                step = epoch * n_batches + batch_index
                file_writer.add_summary(summary_str, step)
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})

    best_theta = theta.eval()                                                     # not shown

In [48]:
file_writer.close()
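
You can now start TensorBoard from the notebook's directory with tensorboard --logdir tf_logs and open http://localhost:6006 in a browser to see the MSE summaries and the graph.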

In [49]:
best_theta


Out[49]:
array([[  4.22476262e-01],
       [ -6.94966177e-03],
       [ -7.10052997e-03],
       [ -7.50745181e-03],
       [ -6.69049146e-03],
       [ -6.75913552e-03],
       [ -5.64861391e-03],
       [ -6.53533684e-03],
       [ -5.41561889e-03],
       [ -1.55830511e-03],
       [ -9.13777202e-03],
       [ -1.15519455e-02],
       [ -6.04805443e-03],
       [ -5.52704139e-03],
       [  5.76924672e-03],
       [ -8.04819260e-03],
       [ -7.98149873e-03],
       [ -8.18613265e-03],
       [ -6.24202983e-03],
       [ -7.18858466e-03],
       [ -1.06109409e-02],
       [  9.82293207e-03],
       [  1.09657319e-02],
       [ -6.04576617e-03],
       [ -7.25607015e-03],
       [ -6.23924984e-03],
       [ -3.88489431e-03],
       [ -5.51366899e-03],
       [ -6.03099214e-03],
       [ -5.72972791e-03],
       [ -9.98415519e-03],
       [  8.69318657e-03],
       [ -6.53691543e-03],
       [ -1.04342587e-03],
       [ -7.10827205e-03],
       [ -4.09286376e-03],
       [  1.31616425e-02],
       [  1.09604998e-02],
       [ -8.61782953e-03],
       [ -9.77288280e-03],
       [ -2.81576021e-03],
       [ -3.16454098e-03],
       [ -6.11963682e-04],
       [ -1.86083722e-04],
       [ -6.23905100e-03],
       [  3.79212224e-03],
       [  7.09700212e-03],
       [ -8.76564905e-03],
       [ -1.02087380e-02],
       [ -9.29476880e-03],
       [ -3.97285819e-03]], dtype=float32)

Name scopes


In [50]:
reset_graph()

now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
root_logdir = "tf_logs"
logdir = "{}/run-{}/".format(root_logdir, now)

n_epochs = 1000
learning_rate = 0.01

X = tf.placeholder(tf.float32, shape=(None, n + 1), name="X")
y = tf.placeholder(tf.float32, shape=(None, 1), name="y")
theta = tf.Variable(tf.random_uniform([n + 1, 1], -1.0, 1.0, seed=42), name="theta")
y_pred = tf.matmul(X, theta, name="predictions")

In [51]:
with tf.name_scope("loss") as scope:
    error = y_pred - y
    mse = tf.reduce_mean(tf.square(error), name="mse")

In [52]:
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(mse)

init = tf.global_variables_initializer()

mse_summary = tf.summary.scalar('MSE', mse)
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

In [53]:
n_epochs = 10
batch_size = 100
n_batches = int(np.ceil(m / batch_size))

with tf.Session() as sess:
    sess.run(init)

    for epoch in range(n_epochs):
        for batch_index in range(n_batches):
            X_batch, y_batch = fetch_batch(epoch, batch_index, batch_size)
            if batch_index % 10 == 0:
                summary_str = mse_summary.eval(feed_dict={X: X_batch, y: y_batch})
                step = epoch * n_batches + batch_index
                file_writer.add_summary(summary_str, step)
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})

    best_theta = theta.eval()

file_writer.flush()
file_writer.close()
print("Best theta:")
print(best_theta)


Best theta:
[[  4.22476262e-01]
 [ -6.94966177e-03]
 [ -7.10052997e-03]
 [ -7.50745181e-03]
 [ -6.69049146e-03]
 [ -6.75913552e-03]
 [ -5.64861391e-03]
 [ -6.53533684e-03]
 [ -5.41561889e-03]
 [ -1.55830511e-03]
 [ -9.13777202e-03]
 [ -1.15519455e-02]
 [ -6.04805443e-03]
 [ -5.52704139e-03]
 [  5.76924672e-03]
 [ -8.04819260e-03]
 [ -7.98149873e-03]
 [ -8.18613265e-03]
 [ -6.24202983e-03]
 [ -7.18858466e-03]
 [ -1.06109409e-02]
 [  9.82293207e-03]
 [  1.09657319e-02]
 [ -6.04576617e-03]
 [ -7.25607015e-03]
 [ -6.23924984e-03]
 [ -3.88489431e-03]
 [ -5.51366899e-03]
 [ -6.03099214e-03]
 [ -5.72972791e-03]
 [ -9.98415519e-03]
 [  8.69318657e-03]
 [ -6.53691543e-03]
 [ -1.04342587e-03]
 [ -7.10827205e-03]
 [ -4.09286376e-03]
 [  1.31616425e-02]
 [  1.09604998e-02]
 [ -8.61782953e-03]
 [ -9.77288280e-03]
 [ -2.81576021e-03]
 [ -3.16454098e-03]
 [ -6.11963682e-04]
 [ -1.86083722e-04]
 [ -6.23905100e-03]
 [  3.79212224e-03]
 [  7.09700212e-03]
 [ -8.76564905e-03]
 [ -1.02087380e-02]
 [ -9.29476880e-03]
 [ -3.97285819e-03]]

In [54]:
print(error.op.name)


loss/sub

In [55]:
print(mse.op.name)


loss/mse

In [56]:
reset_graph()

a1 = tf.Variable(0, name="a")      # name == "a"
a2 = tf.Variable(0, name="a")      # name == "a_1"

with tf.name_scope("param"):       # name == "param"
    a3 = tf.Variable(0, name="a")  # name == "param/a"

with tf.name_scope("param"):       # name == "param_1"
    a4 = tf.Variable(0, name="a")  # name == "param_1/a"

for node in (a1, a2, a3, a4):
    print(node.op.name)


a
a_1
param/a
param_1/a

Modularity

Some ugly flat code:


In [57]:
reset_graph()

n_features = 3
X = tf.placeholder(tf.float32, shape=(None, n_features), name="X")

w1 = tf.Variable(tf.random_normal((n_features, 1)), name="weights1")
w2 = tf.Variable(tf.random_normal((n_features, 1)), name="weights2")
b1 = tf.Variable(0.0, name="bias1")
b2 = tf.Variable(0.0, name="bias2")

z1 = tf.add(tf.matmul(X, w1), b1, name="z1")
z2 = tf.add(tf.matmul(X, w2), b2, name="z2")

relu1 = tf.maximum(z1, 0., name="relu1")
relu2 = tf.maximum(z1, 0., name="relu2")  # Oops, cut&paste error! Did you spot it?

output = tf.add(relu1, relu2, name="output")

Much better, using a function to build the ReLUs:


In [58]:
reset_graph()

def relu(X):
    w_shape = (int(X.get_shape()[1]), 1)
    w = tf.Variable(tf.random_normal(w_shape), name="weights")
    b = tf.Variable(0.0, name="bias")
    z = tf.add(tf.matmul(X, w), b, name="z")
    return tf.maximum(z, 0., name="relu")

n_features = 3
X = tf.placeholder(tf.float32, shape=(None, n_features), name="X")
relus = [relu(X) for i in range(5)]
output = tf.add_n(relus, name="output")

In [59]:
file_writer = tf.summary.FileWriter("logs/relu1", tf.get_default_graph())

Even better, using name scopes:


In [60]:
reset_graph()

def relu(X):
    with tf.name_scope("relu"):
        w_shape = (int(X.get_shape()[1]), 1)                          # not shown in the book
        w = tf.Variable(tf.random_normal(w_shape), name="weights")    # not shown
        b = tf.Variable(0.0, name="bias")                             # not shown
        z = tf.add(tf.matmul(X, w), b, name="z")                      # not shown
        return tf.maximum(z, 0., name="max")                          # not shown

In [61]:
n_features = 3
X = tf.placeholder(tf.float32, shape=(None, n_features), name="X")
relus = [relu(X) for i in range(5)]
output = tf.add_n(relus, name="output")

file_writer = tf.summary.FileWriter("logs/relu2", tf.get_default_graph())
file_writer.close()

Sharing Variables

Sharing a threshold variable the classic way, by defining it outside the relu() function and then passing it as a parameter:


In [62]:
reset_graph()

def relu(X, threshold):
    with tf.name_scope("relu"):
        w_shape = (int(X.get_shape()[1]), 1)                        # not shown in the book
        w = tf.Variable(tf.random_normal(w_shape), name="weights")  # not shown
        b = tf.Variable(0.0, name="bias")                           # not shown
        z = tf.add(tf.matmul(X, w), b, name="z")                    # not shown
        return tf.maximum(z, threshold, name="max")

threshold = tf.Variable(0.0, name="threshold")
X = tf.placeholder(tf.float32, shape=(None, n_features), name="X")
relus = [relu(X, threshold) for i in range(5)]
output = tf.add_n(relus, name="output")
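
Another option is to attach the shared threshold variable to the relu() function itself as an attribute, creating it lazily on the first call: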

In [63]:
reset_graph()

def relu(X):
    with tf.name_scope("relu"):
        if not hasattr(relu, "threshold"):
            relu.threshold = tf.Variable(0.0, name="threshold")
        w_shape = int(X.get_shape()[1]), 1                          # not shown in the book
        w = tf.Variable(tf.random_normal(w_shape), name="weights")  # not shown
        b = tf.Variable(0.0, name="bias")                           # not shown
        z = tf.add(tf.matmul(X, w), b, name="z")                    # not shown
        return tf.maximum(z, relu.threshold, name="max")

In [64]:
X = tf.placeholder(tf.float32, shape=(None, n_features), name="X")
relus = [relu(X) for i in range(5)]
output = tf.add_n(relus, name="output")
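
TensorFlow offers a cleaner solution: create the shared variable with tf.get_variable() inside a variable_scope(). The first cell below creates the relu/threshold variable; the next two cells show two equivalent ways to reuse it: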

In [65]:
reset_graph()

with tf.variable_scope("relu"):
    threshold = tf.get_variable("threshold", shape=(),
                                initializer=tf.constant_initializer(0.0))

In [66]:
with tf.variable_scope("relu", reuse=True):
    threshold = tf.get_variable("threshold")

In [67]:
with tf.variable_scope("relu") as scope:
    scope.reuse_variables()
    threshold = tf.get_variable("threshold")
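
We can now rewrite relu() so that it fetches the shared relu/threshold variable with reuse=True (the variable must be created outside the function first, which the next cell does before calling relu()):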

In [68]:
reset_graph()

def relu(X):
    with tf.variable_scope("relu", reuse=True):
        threshold = tf.get_variable("threshold")
        w_shape = int(X.get_shape()[1]), 1                          # not shown
        w = tf.Variable(tf.random_normal(w_shape), name="weights")  # not shown
        b = tf.Variable(0.0, name="bias")                           # not shown
        z = tf.add(tf.matmul(X, w), b, name="z")                    # not shown
        return tf.maximum(z, threshold, name="max")

X = tf.placeholder(tf.float32, shape=(None, n_features), name="X")
with tf.variable_scope("relu"):
    threshold = tf.get_variable("threshold", shape=(),
                                initializer=tf.constant_initializer(0.0))
relus = [relu(X) for relu_index in range(5)]
output = tf.add_n(relus, name="output")

In [69]:
file_writer = tf.summary.FileWriter("logs/relu6", tf.get_default_graph())
file_writer.close()

In [70]:
reset_graph()

def relu(X):
    with tf.variable_scope("relu"):
        threshold = tf.get_variable("threshold", shape=(), initializer=tf.constant_initializer(0.0))
        w_shape = (int(X.get_shape()[1]), 1)
        w = tf.Variable(tf.random_normal(w_shape), name="weights")
        b = tf.Variable(0.0, name="bias")
        z = tf.add(tf.matmul(X, w), b, name="z")
        return tf.maximum(z, threshold, name="max")

X = tf.placeholder(tf.float32, shape=(None, n_features), name="X")
with tf.variable_scope("", default_name="") as scope:
    first_relu = relu(X)     # create the shared variable
    scope.reuse_variables()  # then reuse it
    relus = [first_relu] + [relu(X) for i in range(4)]
output = tf.add_n(relus, name="output")

file_writer = tf.summary.FileWriter("logs/relu8", tf.get_default_graph())
file_writer.close()

In [71]:
reset_graph()

def relu(X):
    threshold = tf.get_variable("threshold", shape=(),
                                initializer=tf.constant_initializer(0.0))
    w_shape = (int(X.get_shape()[1]), 1)                        # not shown in the book
    w = tf.Variable(tf.random_normal(w_shape), name="weights")  # not shown
    b = tf.Variable(0.0, name="bias")                           # not shown
    z = tf.add(tf.matmul(X, w), b, name="z")                    # not shown
    return tf.maximum(z, threshold, name="max")

X = tf.placeholder(tf.float32, shape=(None, n_features), name="X")
relus = []
for relu_index in range(5):
    with tf.variable_scope("relu", reuse=(relu_index >= 1)) as scope:
        relus.append(relu(X))
output = tf.add_n(relus, name="output")

In [72]:
file_writer = tf.summary.FileWriter("logs/relu9", tf.get_default_graph())
file_writer.close()

Extra material


In [73]:
reset_graph()

with tf.variable_scope("my_scope"):
    x0 = tf.get_variable("x", shape=(), initializer=tf.constant_initializer(0.))
    x1 = tf.Variable(0., name="x")
    x2 = tf.Variable(0., name="x")

with tf.variable_scope("my_scope", reuse=True):
    x3 = tf.get_variable("x")
    x4 = tf.Variable(0., name="x")

with tf.variable_scope("", default_name="", reuse=True):
    x5 = tf.get_variable("my_scope/x")

print("x0:", x0.op.name)
print("x1:", x1.op.name)
print("x2:", x2.op.name)
print("x3:", x3.op.name)
print("x4:", x4.op.name)
print("x5:", x5.op.name)
print(x0 is x3 and x3 is x5)


x0: my_scope/x
x1: my_scope/x_1
x2: my_scope/x_2
x3: my_scope/x
x4: my_scope_1/x
x5: my_scope/x
True

The first variable_scope() block first creates the shared variable x0, named my_scope/x. For all operations other than shared variables (including non-shared variables), the variable scope acts like a regular name scope, which is why the two variables x1 and x2 have names prefixed with my_scope/. Note, however, that TensorFlow makes their names unique by adding an index: my_scope/x_1 and my_scope/x_2.

The second variable_scope() block reuses the shared variables in scope my_scope, which is why x0 is x3. Once again, for all operations other than shared variables it acts as a name scope, and since it is a separate block from the first one, TensorFlow makes the scope name unique (my_scope_1), so the variable x4 is named my_scope_1/x.

The third block shows another way to get a handle on the shared variable my_scope/x by creating a variable_scope() at the root scope (whose name is an empty string), then calling get_variable() with the full name of the shared variable (i.e. "my_scope/x").

Strings


In [74]:
reset_graph()

text = np.array("Do you want some café?".split())
text_tensor = tf.constant(text)

with tf.Session() as sess:
    print(text_tensor.eval())


[b'Do' b'you' b'want' b'some' b'caf\xc3\xa9?']
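
Note that TensorFlow string tensors evaluate to byte strings; to recover Unicode text you would decode each element, for example [s.decode("utf-8") for s in text_tensor.eval()].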

Implementing a Home-Made Computation Graph


In [75]:
class Const(object):
    def __init__(self, value):
        self.value = value
    def evaluate(self):
        return self.value
    def __str__(self):
        return str(self.value)

class Var(object):
    def __init__(self, init_value, name):
        self.value = init_value
        self.name = name
    def evaluate(self):
        return self.value
    def __str__(self):
        return self.name

class BinaryOperator(object):
    def __init__(self, a, b):
        self.a = a
        self.b = b

class Add(BinaryOperator):
    def evaluate(self):
        return self.a.evaluate() + self.b.evaluate()
    def __str__(self):
        return "{} + {}".format(self.a, self.b)

class Mul(BinaryOperator):
    def evaluate(self):
        return self.a.evaluate() * self.b.evaluate()
    def __str__(self):
        return "({}) * ({})".format(self.a, self.b)

x = Var(3, name="x")
y = Var(4, name="y")
f = Add(Mul(Mul(x, x), y), Add(y, Const(2))) # f(x,y) = x²y + y + 2
print("f(x,y) =", f)
print("f(3,4) =", f.evaluate())


f(x,y) = ((x) * (x)) * (y) + y + 2
f(3,4) = 42

Computing gradients

Mathematical differentiation


In [76]:
df_dx = Mul(Const(2), Mul(x, y))  # df/dx = 2xy
df_dy = Add(Mul(x, x), Const(1))  # df/dy = x² + 1
print("df/dx(3,4) =", df_dx.evaluate())
print("df/dy(3,4) =", df_dy.evaluate())


df/dx(3,4) = 24
df/dy(3,4) = 10

Numerical differentiation


In [77]:
def gradients(func, vars_list, eps=0.0001):
    partial_derivatives = []
    base_func_eval = func.evaluate()
    for var in vars_list:
        original_value = var.value
        var.value = var.value + eps
        tweaked_func_eval = func.evaluate()
        var.value = original_value
        derivative = (tweaked_func_eval - base_func_eval) / eps
        partial_derivatives.append(derivative)
    return partial_derivatives

df_dx, df_dy = gradients(f, [x, y])
print("df/dx(3,4) =", df_dx)
print("df/dy(3,4) =", df_dy)


df/dx(3,4) = 24.000400000048216
df/dy(3,4) = 10.000000000047748

Symbolic differentiation


In [78]:
Const.derive = lambda self, var: Const(0)
Var.derive = lambda self, var: Const(1) if self is var else Const(0)
Add.derive = lambda self, var: Add(self.a.derive(var), self.b.derive(var))
Mul.derive = lambda self, var: Add(Mul(self.a, self.b.derive(var)), Mul(self.a.derive(var), self.b))

x = Var(3.0, name="x")
y = Var(4.0, name="y")
f = Add(Mul(Mul(x, x), y), Add(y, Const(2))) # f(x,y) = x²y + y + 2

df_dx = f.derive(x)  # 2xy
df_dy = f.derive(y)  # x² + 1
print("df/dx(3,4) =", df_dx.evaluate())
print("df/dy(3,4) =", df_dy.evaluate())


df/dx(3,4) = 24.0
df/dy(3,4) = 10.0

Automatic differentiation (autodiff) – forward mode
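
Forward-mode autodiff can be implemented with dual numbers of the form $a + b\epsilon$, where $\epsilon$ is infinitesimal ($\epsilon^2 = 0$): evaluating a function on a dual number input propagates the derivative along with the value, which is what the DualNumber class below does for addition and multiplication.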


In [79]:
class DualNumber(object):
    def __init__(self, value=0.0, eps=0.0):
        self.value = value
        self.eps = eps
    def __add__(self, b):
        return DualNumber(self.value + self.to_dual(b).value,
                          self.eps + self.to_dual(b).eps)
    def __radd__(self, a):
        return self.to_dual(a).__add__(self)
    def __mul__(self, b):
        return DualNumber(self.value * self.to_dual(b).value,
                          self.eps * self.to_dual(b).value + self.value * self.to_dual(b).eps)
    def __rmul__(self, a):
        return self.to_dual(a).__mul__(self)
    def __str__(self):
        if self.eps:
            return "{:.1f} + {:.1f}ε".format(self.value, self.eps)
        else:
            return "{:.1f}".format(self.value)
    def __repr__(self):
        return str(self)
    @classmethod
    def to_dual(cls, n):
        if hasattr(n, "value"):
            return n
        else:
            return cls(n)

$3 + (3 + 4 \epsilon) = 6 + 4\epsilon$


In [80]:
3 + DualNumber(3, 4)


Out[80]:
6.0 + 4.0ε

$(3 + 4ε)\times(5 + 7ε) = 3 \times 5 + 3 \times 7ε + 4ε \times 5 + 4ε \times 7ε = 15 + 21ε + 20ε + 28ε^2 = 15 + 41ε + 28 \times 0 = 15 + 41ε$


In [81]:
DualNumber(3, 4) * DualNumber(5, 7)


Out[81]:
15.0 + 41.0ε

In [82]:
x.value = DualNumber(3.0)
y.value = DualNumber(4.0)

f.evaluate()


Out[82]:
42.0

In [83]:
x.value = DualNumber(3.0, 1.0)  # 3 + ε
y.value = DualNumber(4.0)       # 4

df_dx = f.evaluate().eps

x.value = DualNumber(3.0)       # 3
y.value = DualNumber(4.0, 1.0)  # 4 + ε

df_dy = f.evaluate().eps

In [84]:
df_dx


Out[84]:
24.0

In [85]:
df_dy


Out[85]:
10.0

Autodiff – Reverse mode
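
Reverse-mode autodiff evaluates the function once in a forward pass, then propagates the gradient backwards through the graph using the chain rule; the backpropagate() methods below implement this for constants, variables, additions and multiplications.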


In [86]:
class Const(object):
    def __init__(self, value):
        self.value = value
    def evaluate(self):
        return self.value
    def backpropagate(self, gradient):
        pass
    def __str__(self):
        return str(self.value)

class Var(object):
    def __init__(self, init_value, name):
        self.value = init_value
        self.name = name
        self.gradient = 0
    def evaluate(self):
        return self.value
    def backpropagate(self, gradient):
        self.gradient += gradient
    def __str__(self):
        return self.name

class BinaryOperator(object):
    def __init__(self, a, b):
        self.a = a
        self.b = b

class Add(BinaryOperator):
    def evaluate(self):
        self.value = self.a.evaluate() + self.b.evaluate()
        return self.value
    def backpropagate(self, gradient):
        self.a.backpropagate(gradient)
        self.b.backpropagate(gradient)
    def __str__(self):
        return "{} + {}".format(self.a, self.b)

class Mul(BinaryOperator):
    def evaluate(self):
        self.value = self.a.evaluate() * self.b.evaluate()
        return self.value
    def backpropagate(self, gradient):
        self.a.backpropagate(gradient * self.b.value)
        self.b.backpropagate(gradient * self.a.value)
    def __str__(self):
        return "({}) * ({})".format(self.a, self.b)

x = Var(3, name="x")
y = Var(4, name="y")
f = Add(Mul(Mul(x, x), y), Add(y, Const(2))) # f(x,y) = x²y + y + 2

result = f.evaluate()
f.backpropagate(1.0)

print("f(x,y) =", f)
print("f(3,4) =", result)
print("df_dx =", x.gradient)
print("df_dy =", y.gradient)


f(x,y) = ((x) * (x)) * (y) + y + 2
f(3,4) = 42
df_dx = 24.0
df_dy = 10.0

Autodiff – reverse mode (using TensorFlow)


In [87]:
reset_graph()

x = tf.Variable(3., name="x")
y = tf.Variable(4., name="y")
f = x*x*y + y + 2

gradients = tf.gradients(f, [x, y])

init = tf.global_variables_initializer()

with tf.Session() as sess:
    init.run()
    f_val, gradients_val = sess.run([f, gradients])

f_val, gradients_val


Out[87]:
(42.0, [24.0, 10.0])

In [ ]: