Implementing File Readers

Queues and Coordinators


In [2]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
## List of filenames from which data is to be read
filenames = ["heart.csv"]

## Creating a FIFO Queue under the hood using tf.train.string_input_producer
filename_queue = tf.train.string_input_producer(filenames)

## Creating a TextLineReader File Reader which reads a single line of text
reader = tf.TextLineReader(skip_header_lines=1) # skip the first header line

## Reading the string from filename_queue using the reader object
key, value = reader.read(filename_queue)
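
As the comment above notes, tf.train.string_input_producer builds a FIFO queue under the hood. Conceptually it is close to wiring a tf.FIFOQueue to a tf.train.QueueRunner by hand; the cell below is an illustrative sketch of that idea, not the library's actual implementation (which also handles shuffling, epoch limits, and closing the queue).

In [ ]:
## Rough manual equivalent of tf.train.string_input_producer (sketch only):
## a FIFO queue of filename strings plus a QueueRunner that refills it
manual_queue = tf.FIFOQueue(capacity=32, dtypes=[tf.string])
enqueue_op = manual_queue.enqueue_many([tf.constant(filenames)])
qr = tf.train.QueueRunner(manual_queue, [enqueue_op])
tf.train.add_queue_runner(qr) # picked up later by start_queue_runners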

In [18]:
N_FEATURES = 9

## value is a raw CSV string and cannot be fed to the model as-is,
## so we convert it into a list of typed tensors using tf.decode_csv
record_defaults = [[1.0] for _ in range(N_FEATURES)]
record_defaults[4] = [''] # the 5th column (index 4) is a string
record_defaults.append([1]) # Label is an integer

## Using tf.decode_csv to decode the string according to the format given by record_defaults
content = tf.decode_csv(value, record_defaults=record_defaults)
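
The dtype of each tensor returned by tf.decode_csv is determined entirely by record_defaults. The toy cell below (with made-up values, independent of heart.csv) illustrates this:

In [ ]:
## Toy illustration: record_defaults fixes both the default for missing
## fields and the dtype of each output tensor
toy_line = tf.constant("1.5,hello,7")
toy_defaults = [[0.0], [''], [0]] # -> float32, string, int32
f, s, i = tf.decode_csv(toy_line, record_defaults=toy_defaults)
## f.dtype == tf.float32, s.dtype == tf.string, i.dtype == tf.int32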

In [19]:
## Converting the 5th column from a string to binary (1.0 if 'Present', else 0.0)
condition = tf.equal(content[4], tf.constant('Present'))
content[4] = tf.where(condition, tf.constant(1.0), tf.constant(0.0))

## Packing all 9 features into a tensor
features = tf.stack(content[:N_FEATURES])
label = content[-1]
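
The same tf.equal/tf.where pattern extends to columns with more than two categories by nesting. The sketch below uses a hypothetical three-way mapping (the category names and the -1.0 fallback are assumptions for illustration, not part of heart.csv):

In [ ]:
## Sketch: nested tf.where for a hypothetical three-way categorical column
cat = tf.constant('Absent')
code = tf.where(tf.equal(cat, tf.constant('Present')), tf.constant(1.0),
                tf.where(tf.equal(cat, tf.constant('Absent')), tf.constant(0.0),
                         tf.constant(-1.0))) # fallback for unexpected values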

In [20]:
## Data should be fed in batches rather than one line at a time
BATCH_SIZE = 20

## Minimum number of elements left in the queue after a dequeue,
## which ensures a reasonable level of shuffling
min_after_dequeue = 10 * BATCH_SIZE

## Maximum capacity of the queue
capacity = 20 * BATCH_SIZE
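
These values follow the rule of thumb from the tf.train.shuffle_batch documentation: capacity should be at least min_after_dequeue + (num_threads + a small safety margin) * batch_size. A quick sanity check, assuming 4 reader threads (an assumption; this notebook never sets num_threads explicitly):

In [ ]:
## Sanity check of the capacity rule of thumb (num_threads is assumed)
num_threads = 4
suggested_min = min_after_dequeue + (num_threads + 2) * BATCH_SIZE
print(capacity >= suggested_min) # True: 400 >= 200 + 120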

In [25]:
with tf.Session() as sess:
    ## Coordinator to manage the queue-runner threads
    coord = tf.train.Coordinator()

    ## Starting the enqueuing threads via the coordinator object
    threads = tf.train.start_queue_runners(coord=coord)

    ## Printing the key/value pairs of data generated
    print('=' * 3, "Printing key/value pairs", '=' * 3)
    print(sess.run(key))
    print(sess.run(value))

    ## Printing the processed values from the above
    print('=' * 3, "Printing processed features/label pairs", '=' * 3)
    print(sess.run(features))
    print(sess.run(label))

    ## Defining the batching op for BATCH_SIZE examples; note that only
    ## its static shape is inspected here (see the caveat after this cell)
    data_batch, label_batch = tf.train.shuffle_batch([features, label], batch_size=BATCH_SIZE,
                                                     capacity=capacity, min_after_dequeue=min_after_dequeue)

    print('=' * 3, "Printing batches of features and their corresponding labels", '=' * 3)
    print(data_batch.shape)
    print(label_batch.shape)

    coord.request_stop()
    coord.join(threads)


=== Printing key/value pairs ===
heart.csv:2
118,0.08,3.48,32.28,Present,52,29.14,3.81,46,0
=== Printing processed features/label pairs ===
[ 142.            4.05000019    3.38000011   16.20000076    0.           59.
   20.80999947    2.61999989   38.        ]
0
=== Printing batches of features and their corresponding labels ===
(20, 9)
(20,)
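
One caveat about the cell above: tf.train.shuffle_batch is called after start_queue_runners, so the queue runner it registers is never started; the code gets away with this only because it prints the tensors' static shapes instead of running them. To actually pull shuffled batches, define the op before the session starts the runners. A minimal sketch of that ordering:

In [ ]:
## Sketch: build the batching op first, then start the queue runners,
## then run the op to materialize real batches
data_batch, label_batch = tf.train.shuffle_batch(
    [features, label], batch_size=BATCH_SIZE,
    capacity=capacity, min_after_dequeue=min_after_dequeue)

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    feats, labels = sess.run([data_batch, label_batch])
    print(feats.shape, labels.shape) # expected: (20, 9) (20,)
    coord.request_stop()
    coord.join(threads)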
