In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.datasets import make_classification
Create and save a random dataset
In [8]:
# Fixed random_state so the notebook is reproducible across kernel restarts
# (the comparison prints in later cells depend on regenerating the same data).
dataset_created = make_classification(n_samples=100, n_features=5, random_state=42)
In [9]:
# Persist features (X) and labels (Y) as headered CSVs for the input pipelines below.
# makedirs(exist_ok=True) replaces the racy exists()+mkdir pair.
os.makedirs("testds", exist_ok=True)
pd.DataFrame(dataset_created[0]).to_csv("testds/X.csv", index=False)
pd.DataFrame(dataset_created[1]).to_csv("testds/Y.csv", index=False)
Start playing with TensorFlow: build the graph with the queue-runner API
In [38]:
# TF1 queue-runner input pipeline (deprecated in favor of tf.data; kept for comparison).
tf.reset_default_graph()

filename_queueX = tf.train.string_input_producer(["testds/X.csv"])
filename_queueY = tf.train.string_input_producer(["testds/Y.csv"])

# skip_header_lines=1 skips the header row written by pandas.to_csv above.
readerX = tf.TextLineReader(skip_header_lines=1)
# BUG FIX: both reads originally bound to the same name `key`, so the X key
# was silently clobbered by the Y key; use distinct names.
keyX, valueX = readerX.read(filename_queueX)
readerY = tf.TextLineReader(skip_header_lines=1)
keyY, valueY = readerY.read(filename_queueY)

# Default values, in case of empty columns. Also specifies the type of the
# decoded result (float32 here, one default per CSV column).
record_defaults = [[1.], [1.], [1.], [1.], [1.]]
col1, col2, col3, col4, col5 = tf.decode_csv(valueX, record_defaults=record_defaults)
features = tf.stack([col1, col2, col3, col4, col5])
# Labels decode as a single integer column.
target = tf.decode_csv(valueY, record_defaults=[1])
Start a session and have a look at what it does
In [ ]:
sess = tf.InteractiveSession()

# Start populating the filename queue.
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)

n_samples = len(dataset_created[1])  # queue cycles through the 100-row files
for i in range(1200):
    # Retrieve a single instance:
    example, label = sess.run([features, target])
    print("example : {}".format(example))
    # BUG FIX: original referenced undefined name `dataset` (NameError) and
    # indexed past the 100 labels for i >= 100 (IndexError); compare against
    # dataset_created with a wrapped index instead.
    # NOTE(review): string_input_producer shuffles filenames by default, so the
    # reference label need not match `label` row-for-row — confirm intent.
    print("label : {} | {}".format(label, dataset_created[1][i % n_samples]))

coord.request_stop()
coord.join(threads)
sess.close()
Can the same pipeline be built with the Dataset API?
In [2]:
def read_rowX(csv_row, n_features=5):
    """Decode one CSV line of the feature file into a list of float32 tensors.

    Args:
        csv_row: scalar string tensor holding a single CSV record.
        n_features: number of float columns expected (default 5, matching X.csv).

    Returns:
        A list of `n_features` scalar float32 tensors, one per column.
    """
    # One 0.0 default per column fixes both arity and dtype of the decode.
    record_defaults = [[0.0] for _ in range(n_features)]
    row = tf.decode_csv(csv_row, record_defaults=record_defaults)
    return row
def read_rowY(csv_row):
    """Decode one CSV line of the label file into a single int32 tensor.

    Args:
        csv_row: scalar string tensor holding a single CSV record.

    Returns:
        A one-element list containing the decoded integer label tensor.
    """
    # A single [0] default declares one integer column.
    return tf.decode_csv(csv_row, record_defaults=[[0]])
In [18]:
# Build the same input pipeline with the (TF1) Dataset API instead of queue runners.
tf.reset_default_graph()

# Skip the pandas header row, then decode each remaining line.
dataX = tf.contrib.data.TextLineDataset(["testds/X.csv"]).skip(1).map(lambda line: read_rowX(line))
dataY = tf.contrib.data.TextLineDataset(["testds/Y.csv"]).skip(1).map(lambda line: read_rowY(line))

# Pair each feature row with its label, loop forever, shuffle, and batch.
dataset = (tf.contrib.data.Dataset.zip((dataX, dataY))
           .repeat(-1)                  # -1 == repeat indefinitely
           .shuffle(buffer_size=10000)  # buffer covers the whole 100-row dataset
           .batch(4))

iterator = dataset.make_initializable_iterator()
next_element = iterator.get_next()
Start a session and have a look at what it does
In [19]:
sess = tf.InteractiveSession()
sess.run(iterator.initializer)

for i in range(5):
    # Retrieve one shuffled batch of 4 (features, labels) pairs:
    x_, y_ = sess.run(next_element)
    # BUG FIX: the original passed a second argument to a single-placeholder
    # format string (dead debug remnant); it was silently ignored.
    print("x_ : {}".format(x_))
    print("y_ : {}".format(y_))
    # NOTE(review): batches are shuffled, so they cannot be compared
    # positionally against slices like dataset_created[0][(4*i):(4*(i+1)), :].

# BUG FIX: the Dataset API needs no queue-runner Coordinator; `coord` and
# `threads` were never created in this cell (their creation was commented out),
# so the original coord.request_stop()/coord.join(threads) raised NameError
# on a fresh Restart-and-Run-All.
sess.close()