Reading Data


In [1]:
from __future__ import print_function
import tensorflow as tf
import numpy as np
import re
import matplotlib.pyplot as plt
%matplotlib inline

In [70]:
from datetime import date
date.today()


Out[70]:
datetime.date(2017, 3, 2)

In [71]:
author = "kyubyong. https://github.com/Kyubyong/tensorflow-exercises"

In [72]:
tf.__version__


Out[72]:
'1.0.0'

In [73]:
np.__version__


Out[73]:
'1.12.0'

NOTE on notation

_x, _y, _z, _X, _Y, _Z, ...: NumPy arrays
x, y, z, X, Y, Z, ...: TensorFlow tensors
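For example, _x = np.arange(3) is a NumPy array, while x = tf.convert_to_tensor(_x) is the corresponding tensor.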

Placeholder


In [74]:
# Make toy data and save it to an .npz file.
# Each row of _x is a random permutation of the digits 0-9;
# the first nine digits become the inputs and the last digit the target.
_x = np.zeros((100, 10), np.int32)
for i in range(100):
    _x[i] = np.random.permutation(10)
_x, _y = _x[:, :-1], _x[:, -1]

import os
if not os.path.exists('example'):
    os.mkdir('example')
np.savez('example/example.npz', _x=_x, _y=_y)

In [75]:
# Load data
data = np.load('example/example.npz')
_x, _y = data["_x"], data["_y"]

# Q1. Make a placeholder for x of dtype=int32 and shape=(None, 9).
# Inputs and targets
x_pl = ...
y_hat = 45 - tf.reduce_sum(x_pl, axis=1) # The digits 0-9 sum to 45, so this recovers the digit each row is missing.

# Session
with tf.Session() as sess:
    _y_hat = sess.run(y_hat, {x_pl: _x})
    print("y_hat =", _y_hat[:30])
    print("true y =", _y[:30])


y_hat = [8 9 3 9 1 3 5 9 2 2 2 3 1 1 9 6 8 4 9 0 4 8 2 3 7 2 2 5 8 8]
true y = [8 9 3 9 1 3 5 9 2 2 2 3 1 1 9 6 8 4 9 0 4 8 2 3 7 2 2 5 8 8]
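
One way to fill in Q1 (a sketch; any placeholder with the stated dtype and shape works):

x_pl = tf.placeholder(tf.int32, shape=(None, 9))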

TFRecord


In [76]:
tf.reset_default_graph()

# Load data
data = np.load('example/example.npz')
_x, _y = data["_x"], data["_y"]

# Serialize
with tf.python_io.TFRecordWriter("example/tfrecord") as fout:
    for _xx, _yy in zip(_x, _y):
        ex = tf.train.Example()
        
        # Q2. Add each value to ex.
        ex.features.feature['x']....
        ex.features.feature['y']....
        fout.write(ex.SerializeToString())

def read_and_decode_single_example(fname):
    # Create a string queue
    fname_q = tf.train.string_input_producer([fname], num_epochs=1, shuffle=True)
    
    # Q3. Create a TFRecordReader
    reader = ...
    
    # Read the string queue
    _, serialized_example = reader.read(fname_q)
    
    # Q4. Specify the features to parse.
    features = tf.parse_single_example(
        serialized_example,
        features={...
                  ...}
        )
    # Output
    x = features['x']
    y = features['y']
    
    return x, y

# Ops
x, y = read_and_decode_single_example('example/tfrecord')
y_hat = 45 - tf.reduce_sum(x)

# Session
with tf.Session() as sess:
    # Q5. Initialize local variables.
    sess.run(...)
    
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    
    try:
        while not coord.should_stop():
            _y, _y_hat = sess.run([y, y_hat])
            print(_y[0], "==", _y_hat, end="; ")
    
    except tf.errors.OutOfRangeError:
        print('Done training -- epoch limit reached')
    finally:
        # When done, ask the threads to stop.
        coord.request_stop()
    
    # Wait for threads to finish.
    coord.join(threads)


8 == 8; 9 == 9; 3 == 3; 9 == 9; 1 == 1; 3 == 3; 5 == 5; 9 == 9; 2 == 2; 2 == 2; 2 == 2; 3 == 3; 1 == 1; 1 == 1; 9 == 9; 6 == 6; 8 == 8; 4 == 4; 9 == 9; 0 == 0; 4 == 4; 8 == 8; 2 == 2; 3 == 3; 7 == 7; 2 == 2; 2 == 2; 5 == 5; 8 == 8; 8 == 8; 0 == 0; 3 == 3; 3 == 3; 9 == 9; 1 == 1; 5 == 5; 1 == 1; 3 == 3; 3 == 3; 7 == 7; 9 == 9; 2 == 2; 4 == 4; 7 == 7; 5 == 5; 4 == 4; 2 == 2; 4 == 4; 1 == 1; 0 == 0; 3 == 3; 5 == 5; 4 == 4; 8 == 8; 7 == 7; 6 == 6; 2 == 2; 1 == 1; 2 == 2; 2 == 2; 4 == 4; 1 == 1; 1 == 1; 6 == 6; 0 == 0; 4 == 4; 5 == 5; 1 == 1; 2 == 2; 1 == 1; 0 == 0; 8 == 8; 0 == 0; 4 == 4; 5 == 5; 6 == 6; 6 == 6; 7 == 7; 5 == 5; 0 == 0; 0 == 0; 6 == 6; 9 == 9; 9 == 9; 9 == 9; 3 == 3; 9 == 9; 4 == 4; 8 == 8; 7 == 7; 9 == 9; 0 == 0; 3 == 3; 7 == 7; 4 == 4; 1 == 1; 7 == 7; 4 == 4; 9 == 9; 8 == 8; Done training -- epoch limit reached
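
Possible solutions for Q2-Q5, sketched against the TF 1.0 API (the feature keys 'x' and 'y' must match between writing and parsing):

# Q2 (sketch): store the row and the target as int64 features.
ex.features.feature['x'].int64_list.value.extend(_xx)
ex.features.feature['y'].int64_list.value.append(_yy)

# Q3 (sketch): a reader for TFRecord files.
reader = tf.TFRecordReader()

# Q4 (sketch): declare the fixed-length features to parse.
features = tf.parse_single_example(
    serialized_example,
    features={'x': tf.FixedLenFeature([9], tf.int64),
              'y': tf.FixedLenFeature([1], tf.int64)})

# Q5 (sketch): string_input_producer with num_epochs creates a local
# epoch-counter variable, so local variables must be initialized.
sess.run(tf.local_variables_initializer())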

Queue


In [77]:
tf.reset_default_graph()

# Load data
data = np.load('example/example.npz')
_x, _y = data["_x"], data["_y"]

# Hyperparams
batch_size = 10 # We will feed mini-batches of size 10.
num_epochs = 2 # We will feed data for two epochs.

# Convert to tensors
x = tf.convert_to_tensor(_x)
y = tf.convert_to_tensor(_y)

# Q6. Make slice queues
x_q, y_q = ...

# Batching
x_batch, y_batch = tf.train.batch([x_q, y_q], batch_size=batch_size)

# Targets
y_hat = 45 - tf.reduce_sum(x_batch, axis=1)

# Session
with tf.Session() as sess:
    sess.run(tf.local_variables_initializer())
    
    # Q7. Make a train.Coordinator and threads.
    coord = ...
    threads = ...
    
    try:
        while not coord.should_stop():
            _y_hat, _y_batch = sess.run([y_hat, y_batch])
            print(_y_hat, "==", _y_batch)
    
    except tf.errors.OutOfRangeError:
        print('Done training -- epoch limit reached')
    finally:
        # When done, ask the threads to stop.
        coord.request_stop()
    
    # Wait for threads to finish.
    coord.join(threads)


[5 9 3 1 2 9 7 7 8 6] == [5 9 3 1 2 9 7 7 8 6]
[5 5 3 7 1 6 6 5 8 4] == [5 5 3 7 1 6 6 5 8 4]
[3 1 5 6 6 4 4 9 3 8] == [3 1 5 6 6 4 4 9 3 8]
[9 3 7 1 8 0 2 4 0 8] == [9 3 7 1 8 0 2 4 0 8]
[0 8 8 0 1 4 0 2 3 2] == [0 8 8 0 1 4 0 2 3 2]
[4 4 2 3 4 3 2 9 4 1] == [4 4 2 3 4 3 2 9 4 1]
[9 3 5 9 4 7 1 2 8 2] == [9 3 5 9 4 7 1 2 8 2]
[8 2 7 0 1 0 1 1 7 3] == [8 2 7 0 1 0 1 1 7 3]
[6 4 0 0 9 9 2 1 2 5] == [6 4 0 0 9 9 2 1 2 5]
[9 9 5 9 4 7 9 1 2 3] == [9 9 5 9 4 7 9 1 2 3]
[3 4 1 3 2 9 9 7 9 0] == [3 4 1 3 2 9 9 7 9 0]
[1 8 5 2 2 1 3 9 4 4] == [1 8 5 2 2 1 3 9 4 4]
[0 4 8 4 4 7 6 2 0 8] == [0 4 8 4 4 7 6 2 0 8]
[1 1 7 8 0 4 9 7 8 9] == [1 1 7 8 0 4 9 7 8 9]
[0 4 1 5 7 1 6 9 2 2] == [0 4 1 5 7 1 6 9 2 2]
[5 7 3 2 1 5 3 8 9 8] == [5 7 3 2 1 5 3 8 9 8]
[9 5 4 5 2 6 6 0 2 1] == [9 5 4 5 2 6 6 0 2 1]
[2 9 0 4 3 1 1 4 7 6] == [2 9 0 4 3 1 1 4 7 6]
[4 2 2 5 1 9 3 7 6 0] == [4 2 2 5 1 9 3 7 6 0]
[9 3 3 3 9 5 8 3 8 0] == [9 3 3 3 9 5 8 3 8 0]
Done training -- epoch limit reached
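
Possible solutions for Q6 and Q7 (sketches; slice_input_producer shuffles by default, which matches the reordered output above):

# Q6 (sketch): slice the full tensors into per-example queues.
x_q, y_q = tf.train.slice_input_producer([x, y], num_epochs=num_epochs)

# Q7 (sketch): a coordinator plus the queue-runner threads it supervises.
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)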

Read CSV files


In [78]:
tf.reset_default_graph()

# Load data
data = np.load('example/example.npz')
_x, _y = data["_x"], data["_y"]
_x = np.concatenate((_x, np.expand_dims(_y, axis=1)), 1)

# Write to a csv file
_x_str = np.array_str(_x)
_x_str = re.sub(r"[\[\]]", "", _x_str)
_x_str = re.sub(r"(?m)^ +", "", _x_str)
_x_str = re.sub(r"[ ]+", ",", _x_str)
with open('example/example.csv', 'w') as fout:
    fout.write(_x_str)
    
# Hyperparams
batch_size = 10
num_epochs = 2 # Defined here so this cell is self-contained; it bounds the read loop below.

# Create a string queue
fname_q = tf.train.string_input_producer(["example/example.csv"])

# Q8. Create a TextLineReader
reader = ...

# Read the string queue
_, value = reader.read(fname_q)

# Q9. Decode value
record_defaults = [[0]]*10
col1, col2, col3, col4, col5, col6, col7, col8, col9, col10 = tf.decode_csv(
    ...)
x = tf.stack([col1, col2, col3, col4, col5, col6, col7, col8, col9])
y = col10

# Batching
x_batch, y_batch = tf.train.shuffle_batch(
      [x, y], batch_size=batch_size, capacity=200, min_after_dequeue=100)

# Ops
y_hat = 45 - tf.reduce_sum(x_batch, axis=1)

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    for i in range(num_epochs*10):
        _y_hat, _y_batch = sess.run([y_hat, y_batch])
        print(_y_hat, "==", _y_batch)

    coord.request_stop()
    coord.join(threads)


[3 5 6 1 8 4 9 9 1 2] == [3 5 6 1 8 4 9 9 1 2]
[9 7 0 9 2 5 1 3 8 0] == [9 7 0 9 2 5 1 3 8 0]
[8 4 0 2 1 2 3 7 1 1] == [8 4 0 2 1 2 3 7 1 1]
[4 2 5 9 3 9 1 1 2 8] == [4 2 5 9 3 9 1 1 2 8]
[9 0 1 1 9 9 5 4 6 0] == [9 0 1 1 9 9 5 4 6 0]
[6 5 8 5 3 8 8 4 7 2] == [6 5 8 5 3 8 8 4 7 2]
[8 2 0 0 9 3 4 3 4 5] == [8 2 0 0 9 3 4 3 4 5]
[4 5 2 6 6 5 7 2 1 6] == [4 5 2 6 6 5 7 2 1 6]
[3 7 4 8 1 1 9 1 5 7] == [3 7 4 8 1 1 9 1 5 7]
[2 0 2 4 2 1 9 2 7 8] == [2 0 2 4 2 1 9 2 7 8]
[3 7 7 7 2 0 9 4 9 3] == [3 7 7 7 2 0 9 4 9 3]
[4 1 5 8 3 1 4 9 8 2] == [4 1 5 8 3 1 4 9 8 2]
[6 5 4 7 0 3 2 5 6 8] == [6 5 4 7 0 3 2 5 6 8]
[9 8 8 0 4 3 2 3 2 6] == [9 8 8 0 4 3 2 3 2 6]
[2 8 0 4 4 9 2 3 7 6] == [2 8 0 4 4 9 2 3 7 6]
[4 1 9 4 3 1 7 4 9 0] == [4 1 9 4 3 1 7 4 9 0]
[2 3 1 7 9 9 9 0 2 8] == [2 3 1 7 9 9 9 0 2 8]
[4 1 8 2 2 1 5 2 1 2] == [4 1 8 2 2 1 5 2 1 2]
[9 4 0 5 4 7 9 4 0 7] == [9 4 0 5 4 7 9 4 0 7]
[6 0 6 5 0 3 3 5 9 7] == [6 0 6 5 0 3 3 5 9 7]
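
Possible solutions for Q8 and Q9 (sketches):

# Q8 (sketch): a reader that yields one csv line per read.
reader = tf.TextLineReader()

# Q9 (sketch): decode_csv splits the line into columns, with
# record_defaults fixing each column's dtype (int32 here) and default.
col1, col2, col3, col4, col5, col6, col7, col8, col9, col10 = tf.decode_csv(
    value, record_defaults=record_defaults)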

Read image files


In [3]:
tf.reset_default_graph()

# Hyperparams
batch_size = 10
num_epochs = 1

# Make fake images and save
for i in range(100):
    _x = np.random.randint(0, 256, size=(10, 10, 4)).astype(np.uint8) # imsave expects uint8 or floats in [0, 1]
    plt.imsave("example/image_{}.jpg".format(i), _x)

# Import jpg files
images = tf.train.match_filenames_once('example/*.jpg')

# Create a string queue
fname_q = tf.train.string_input_producer(images, num_epochs=num_epochs, shuffle=True)

# Q10. Create a WholeFileReader
reader = ...

# Read the string queue
_, value = reader.read(fname_q)

# Q11. Decode value
img = ...

# Batching
img_batch = tf.train.batch([img], shapes=([10, 10, 4]), batch_size=batch_size)

with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    
    num_samples = 0
    try:
        while not coord.should_stop():
            sess.run(img_batch)
            num_samples += batch_size
            print(num_samples, "samples have been seen")

    except tf.errors.OutOfRangeError:
        print('Done training -- epoch limit reached')
    finally:
        coord.request_stop()

    coord.join(threads)


10 samples have been seen
20 samples have been seen
30 samples have been seen
40 samples have been seen
50 samples have been seen
60 samples have been seen
70 samples have been seen
80 samples have been seen
90 samples have been seen
100 samples have been seen
Done training -- epoch limit reached
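
Possible solutions for Q10 and Q11 (sketches):

# Q10 (sketch): a reader that returns each file's entire contents as one record.
reader = tf.WholeFileReader()

# Q11 (sketch): decode_image infers the actual encoding (PNG/JPEG/GIF)
# from the file header rather than the file extension.
img = tf.image.decode_image(value)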
