In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.datasets import make_classification

Create and save a random dataset


In [8]:
dataset_created = make_classification(n_samples=100, n_features=5)

In [9]:
if not os.path.exists("testds"):
    os.mkdir("testds")
pd.DataFrame(dataset_created[0]).to_csv("testds/X.csv", index=False)
pd.DataFrame(dataset_created[1]).to_csv("testds/Y.csv", index=False)

Start to play with tensorflow: build the graph


In [38]:
tf.reset_default_graph()
filename_queueX = tf.train.string_input_producer(["testds/X.csv"])
filename_queueY = tf.train.string_input_producer(["testds/Y.csv"])

readerX = tf.TextLineReader( skip_header_lines=1)
key, valueX = readerX.read(filename_queueX)

readerY = tf.TextLineReader( skip_header_lines=1)
key, valueY = readerY.read(filename_queueY)

# Default values, in case of empty columns. Also specifies the type of the
# decoded result.
record_defaults = [[1.], [1.], [1.], [1.], [1.]]
col1, col2, col3, col4, col5 = tf.decode_csv(valueX, record_defaults=record_defaults)
features = tf.stack([col1, col2, col3, col4, col5])
target = tf.decode_csv(valueY, record_defaults=[1])

Start a session and have a look at what it does


In [ ]:
sess = tf.InteractiveSession()

# Start populating the filename queue.
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)

for i in range(1200):
    # Retrieve a single instance:
    example, label = sess.run([features, target])
    print("example : {}".format(example))
    print("label : {} | {}".format(label, dataset[1][i] ))
coord.request_stop()
coord.join(threads)
sess.close()

With Datasets ?


In [2]:
def read_rowX(csv_row):
    record_defaults = [[0.0] for _ in range(5)]
    row = tf.decode_csv(csv_row, record_defaults=record_defaults)
    return row
def read_rowY(csv_row):
    record_defaults = [[0]]
    row = tf.decode_csv(csv_row, record_defaults=record_defaults)
    return row

In [18]:
tf.reset_default_graph()
dataX = tf.contrib.data.TextLineDataset(["testds/X.csv"]).skip(1).map(lambda line: read_rowX(line))
dataY = tf.contrib.data.TextLineDataset(["testds/Y.csv"]).skip(1).map(lambda line: read_rowY(line))

dataset = tf.contrib.data.Dataset.zip((dataX, dataY))
dataset = dataset.repeat(-1)
dataset = dataset.shuffle(buffer_size=10000)
dataset = dataset.batch(4)
iterator = dataset.make_initializable_iterator()
next_element = iterator.get_next()

Start a session and have a look at what it does


In [19]:
sess = tf.InteractiveSession()
sess.run(iterator.initializer)

# Start populating the filename queue.
# coord = tf.train.Coordinator()
# threads = tf.train.start_queue_runners(coord=coord)

for i in range(5):
    # Retrieve a single instance:
    x_, y_ = sess.run(next_element)
    print("x_ : {}".format(x_,dataset_created[0][(4*i):(4*(i+1)),:]))
    print("y_ : {}".format(y_,dataset_created[1][(4*i):(4*(i+1))]))
#     print("x_ : {} | {}".format(x_,dataset_created[0][(4*i):(4*(i+1)),:]))
#     print("y_ : {} | {}".format(y_,dataset_created[1][(4*i):(4*(i+1))]))
#     print("label : {} | {}".format(label, dataset_created[1][i] ))
coord.request_stop()
coord.join(threads)
sess.close()


x_ : [[ 0.62039006  1.28688061  0.64067596 -1.37342143 -0.85903567]
 [ 0.68092704 -1.66005695 -0.04232123 -0.89476609  0.71628618]
 [-0.415335   -0.9808777   0.85739839  0.94326776  0.63954777]
 [ 0.80425245 -1.87975717  0.4975239  -1.07296443  0.80229783]]
y_ : [[1]
 [0]
 [0]
 [0]]
x_ : [[ 0.57320696 -0.11811064  0.78913075 -1.008322   -0.08786258]
 [-0.32951188 -0.83351856 -1.22482932  0.75938696  0.53726977]
 [ 0.61345357 -0.02652777  0.55074894 -1.09903502 -0.14796428]
 [ 0.63816696 -0.37510505 -0.13243483 -1.07401562  0.03372843]]
y_ : [[0]
 [0]
 [0]
 [1]]
x_ : [[-1.5106796   1.69215465  0.70041883  2.38206697 -0.5141077 ]
 [-0.58247679 -1.05631042 -0.56845391  1.25919414  0.72449857]
 [-0.997747   -1.4155184   1.31619883  2.07838154  1.02832949]
 [ 0.35767436  1.5591954   2.29348445 -0.95478755 -0.93658334]]
y_ : [[1]
 [0]
 [0]
 [1]]
x_ : [[-0.96823376 -1.31148112 -1.29235435  2.00450706  0.96434194]
 [-0.415335   -0.9808777   0.85739839  0.94326776  0.63954777]
 [ 0.28717077 -0.40139347  0.47839844 -0.43691844  0.14078017]
 [-0.11041786  0.79478014 -0.60194492  0.04028892 -0.3999677 ]]
y_ : [[0]
 [0]
 [1]
 [1]]
x_ : [[ 0.65248853 -1.58604765 -0.78037888 -0.85832953  0.68384486]
 [-1.18799794 -1.43703687 -0.97685611  2.42515779  1.09028029]
 [-0.415335   -0.9808777   0.85739839  0.94326776  0.63954777]
 [ 0.57320696 -0.11811064  0.78913075 -1.008322   -0.08786258]]
y_ : [[0]
 [0]
 [0]
 [0]]
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-19-786e62628e8f> in <module>()
     14 #     print("y_ : {} | {}".format(y_,dataset_created[1][(4*i):(4*(i+1))]))
     15 #     print("label : {} | {}".format(label, dataset_created[1][i] ))
---> 16 coord.request_stop()
     17 coord.join(threads)
     18 sess.close()

NameError: name 'coord' is not defined