Examples of using tf.data and tf.estimator with the Iris dataset
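
This notebook downloads the Iris training and test CSVs, builds an input pipeline with tf.data, and then tries to drive a canned tf.estimator.LinearClassifier with it. Both training attempts below fail with the same ValueError, because in TensorFlow 1.4 an Estimator's input_fn must return (features, labels) tensors rather than a tf.data.Dataset; a sketch of the fix follows each traceback.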


In [2]:
import tensorflow as tf

In [26]:
TRAIN_URL = "http://download.tensorflow.org/data/iris_training.csv"
TEST_URL = "http://download.tensorflow.org/data/iris_test.csv"

CSV_COLUMN_NAMES = ['SepalLength', 'SepalWidth',
                    'PetalLength', 'PetalWidth', 'Species']

train_path = tf.keras.utils.get_file(fname=TRAIN_URL.split('/')[-1],
                                         origin=TRAIN_URL, cache_dir='/tmp')

test_path = tf.keras.utils.get_file(fname=TEST_URL.split('/')[-1],
                                         origin=TEST_URL, cache_dir='/tmp')
train_path


Downloading data from http://download.tensorflow.org/data/iris_training.csv
8192/2194 [==============================] - 0s
Downloading data from http://download.tensorflow.org/data/iris_test.csv
8192/573 [==============================] - 0s
Out[26]:
'/tmp/datasets/iris_training.csv'

In [23]:
_CSV_COLUMNS = ['SepalLength', 'SepalWidth',
                'PetalLength', 'PetalWidth', 'Species']

# The four feature columns hold floats; the Species label is an integer class id.
_CSV_COLUMN_DEFAULTS = [[0.0], [0.0], [0.0], [0.0], [0]]

def input_fn(data_file, num_epochs, shuffle, batch_size):

  def parse_csv(value):
    print('Parsing', data_file)
    columns = tf.decode_csv(value, record_defaults=_CSV_COLUMN_DEFAULTS)
    features = dict(zip(_CSV_COLUMNS, columns))
    labels = features.pop('Species')
    return features, labels

  # Extract lines from the input file using the Dataset API,
  # skipping the CSV header row.
  dataset = tf.data.TextLineDataset(data_file).skip(1)

  if shuffle:
    dataset = dataset.shuffle(buffer_size=100)

  dataset = dataset.map(parse_csv, num_parallel_calls=5)

  # We call repeat after shuffling, rather than before, to prevent separate
  # epochs from blending together.
  dataset = dataset.repeat(num_epochs)
  dataset = dataset.batch(batch_size)
  return dataset

In [25]:
input_fn(train_path, 2, True, 5)


Parsing /home/jorge/.keras/datasets/iris_training.csv
Out[25]:
<BatchDataset shapes: ({PetalWidth: (?,), SepalWidth: (?,), PetalLength: (?,), SepalLength: (?,)}, (?,)), types: ({PetalWidth: tf.float32, SepalWidth: tf.float32, PetalLength: tf.float32, SepalLength: tf.float32}, tf.int32)>
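
The pipeline itself is fine: the result is a BatchDataset whose elements are a (features dict, labels) pair. The trouble comes when this object is handed straight to the Estimator, as the next cell shows.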

In [24]:
for n in range(5):
    model.train(input_fn=lambda: input_fn(
        train_path, 2, True, 5))


Parsing /home/jorge/.keras/datasets/iris_training.csv
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-24-49deb9779651> in <module>()
      1 for n in range(5):
----> 2     model.train(input_fn=lambda: input_fn(
      3         train_path, 2, True, 5))

~/anaconda3/envs/tf14/lib/python3.5/site-packages/tensorflow/python/estimator/estimator.py in train(self, input_fn, hooks, steps, max_steps, saving_listeners)
    300 
    301     saving_listeners = _check_listeners_type(saving_listeners)
--> 302     loss = self._train_model(input_fn, hooks, saving_listeners)
    303     logging.info('Loss for final step: %s.', loss)
    304     return self

~/anaconda3/envs/tf14/lib/python3.5/site-packages/tensorflow/python/estimator/estimator.py in _train_model(self, input_fn, hooks, saving_listeners)
    709       with ops.control_dependencies([global_step_read_tensor]):
    710         estimator_spec = self._call_model_fn(
--> 711             features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
    712       # Check if the user created a loss summary, and add one if they didn't.
    713       # We assume here that the summary is called 'loss'. If it is not, we will

~/anaconda3/envs/tf14/lib/python3.5/site-packages/tensorflow/python/estimator/estimator.py in _call_model_fn(self, features, labels, mode, config)
    692     if 'config' in model_fn_args:
    693       kwargs['config'] = config
--> 694     model_fn_results = self._model_fn(features=features, **kwargs)
    695 
    696     if not isinstance(model_fn_results, model_fn_lib.EstimatorSpec):

~/anaconda3/envs/tf14/lib/python3.5/site-packages/tensorflow/python/estimator/canned/linear.py in _model_fn(features, labels, mode, config)
    251           optimizer=optimizer,
    252           partitioner=partitioner,
--> 253           config=config)
    254     super(LinearClassifier, self).__init__(
    255         model_fn=_model_fn,

~/anaconda3/envs/tf14/lib/python3.5/site-packages/tensorflow/python/estimator/canned/linear.py in _linear_model_fn(features, labels, mode, head, feature_columns, optimizer, partitioner, config)
     98   if not isinstance(features, dict):
     99     raise ValueError('features should be a dictionary of `Tensor`s. '
--> 100                      'Given type: {}'.format(type(features)))
    101   optimizer = optimizers.get_optimizer_instance(
    102       optimizer or _get_default_optimizer(feature_columns),

ValueError: features should be a dictionary of `Tensor`s. Given type: <class 'tensorflow.python.data.ops.dataset_ops.BatchDataset'>
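
The traceback is explicit: _linear_model_fn received the BatchDataset itself where it expected a dict of Tensors. In TensorFlow 1.4 an input_fn must return (features, labels) tensors, so the Dataset has to be drained through an iterator (later releases accept a Dataset directly). A minimal sketch, assuming the input_fn and model defined in this notebook; estimator_input_fn is a hypothetical wrapper, not part of the original session:

def estimator_input_fn():
    # Build the tf.data pipeline, then hand the Estimator real tensors.
    dataset = input_fn(train_path, num_epochs=2, shuffle=True, batch_size=5)
    features, labels = dataset.make_one_shot_iterator().get_next()
    return features, labels

for n in range(5):
    model.train(input_fn=estimator_input_fn)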

In [4]:
train_path


Out[4]:
'/home/jorge/.keras/datasets/iris_training.csv'

In [14]:
dataset = tf.data.TextLineDataset(train_path).skip(1)  # skip the CSV header row
dataset = dataset.shuffle(100)

def parse_csv(value):
    columns = tf.decode_csv(value, record_defaults=_CSV_COLUMN_DEFAULTS)
    features = dict(zip(_CSV_COLUMNS, columns))
    labels = features.pop('Species')
    return features, labels

dataset = dataset.map(parse_csv, num_parallel_calls=5)
dataset = dataset.repeat(2)
dataset = dataset.batch(10)

In [22]:
parse_csv('/home/jorge/.keras/datasets/iris_training.csv')


Out[22]:
({'PetalLength': <tf.Tensor 'DecodeCSV:2' shape=() dtype=float32>,
  'PetalWidth': <tf.Tensor 'DecodeCSV:3' shape=() dtype=float32>,
  'SepalLength': <tf.Tensor 'DecodeCSV:0' shape=() dtype=float32>,
  'SepalWidth': <tf.Tensor 'DecodeCSV:1' shape=() dtype=float32>},
 <tf.Tensor 'DecodeCSV:4' shape=() dtype=int32>)
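
Note that parse_csv was called on the file path here, not on a CSV record. tf.decode_csv only adds ops to the graph, so nothing is parsed (and no error can surface) until a session evaluates these tensors; in the real pipeline the argument is a single line of the file, such as '6.4,2.8,5.6,2.2,2'.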

In [17]:
wide_columns = [
    tf.feature_column.numeric_column('SepalLength'),
    tf.feature_column.numeric_column('SepalWidth'),
    tf.feature_column.numeric_column('PetalLength'),
    tf.feature_column.numeric_column('PetalWidth')
]
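
Each numeric_column key must match a key of the features dict built by parse_csv, i.e. the names in _CSV_COLUMNS minus the popped 'Species' label.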

In [19]:
model = tf.estimator.LinearClassifier(
        model_dir='tmp/model',
        feature_columns=wide_columns,
        n_classes=3)  # the Iris data has three species


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_service': None, '_keep_checkpoint_every_n_hours': 10000, '_keep_checkpoint_max': 5, '_num_ps_replicas': 0, '_is_chief': True, '_model_dir': 'tmp/model', '_save_checkpoints_steps': None, '_master': '', '_task_id': 0, '_tf_random_seed': None, '_session_config': None, '_save_summary_steps': 100, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f9ca2329ac8>, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_num_worker_replicas': 1, '_task_type': 'worker'}

In [21]:
for n in range(5):
    model.train(input_fn=lambda: dataset)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-21-f3d0aec955be> in <module>()
      1 for n in range(5):
----> 2     model.train(input_fn=lambda: dataset)

~/anaconda3/envs/tf14/lib/python3.5/site-packages/tensorflow/python/estimator/estimator.py in train(self, input_fn, hooks, steps, max_steps, saving_listeners)
    300 
    301     saving_listeners = _check_listeners_type(saving_listeners)
--> 302     loss = self._train_model(input_fn, hooks, saving_listeners)
    303     logging.info('Loss for final step: %s.', loss)
    304     return self

~/anaconda3/envs/tf14/lib/python3.5/site-packages/tensorflow/python/estimator/estimator.py in _train_model(self, input_fn, hooks, saving_listeners)
    709       with ops.control_dependencies([global_step_read_tensor]):
    710         estimator_spec = self._call_model_fn(
--> 711             features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
    712       # Check if the user created a loss summary, and add one if they didn't.
    713       # We assume here that the summary is called 'loss'. If it is not, we will

~/anaconda3/envs/tf14/lib/python3.5/site-packages/tensorflow/python/estimator/estimator.py in _call_model_fn(self, features, labels, mode, config)
    692     if 'config' in model_fn_args:
    693       kwargs['config'] = config
--> 694     model_fn_results = self._model_fn(features=features, **kwargs)
    695 
    696     if not isinstance(model_fn_results, model_fn_lib.EstimatorSpec):

~/anaconda3/envs/tf14/lib/python3.5/site-packages/tensorflow/python/estimator/canned/linear.py in _model_fn(features, labels, mode, config)
    251           optimizer=optimizer,
    252           partitioner=partitioner,
--> 253           config=config)
    254     super(LinearClassifier, self).__init__(
    255         model_fn=_model_fn,

~/anaconda3/envs/tf14/lib/python3.5/site-packages/tensorflow/python/estimator/canned/linear.py in _linear_model_fn(features, labels, mode, head, feature_columns, optimizer, partitioner, config)
     98   if not isinstance(features, dict):
     99     raise ValueError('features should be a dictionary of `Tensor`s. '
--> 100                      'Given type: {}'.format(type(features)))
    101   optimizer = optimizers.get_optimizer_instance(
    102       optimizer or _get_default_optimizer(feature_columns),

ValueError: features should be a dictionary of `Tensor`s. Given type: <class 'tensorflow.python.data.ops.dataset_ops.BatchDataset'>
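
Same error, same cause: the lambda returns the Dataset object. The fix mirrors the earlier sketch, with one subtlety: build the Dataset inside the input_fn, because the Estimator calls it in a fresh graph, and the module-level dataset created in cell 14 belongs to the notebook's default graph. A minimal sketch reusing the input_fn defined above:

for n in range(5):
    model.train(input_fn=lambda: input_fn(
        train_path, 2, True, 5).make_one_shot_iterator().get_next())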
