In [1]:
import tensorflow as tf
from tensorflow.python.ops import lookup_ops

Define a character-level vocabulary lookup table and a small in-memory dataset


In [2]:
# Build a tiny character-level vocabulary lookup table and push an
# in-memory string dataset through a padded-batch pipeline (TF 1.x API).
tf.reset_default_graph()

sess = tf.InteractiveSession()


# Vocabulary: index 0 = 'PAD', then characters a á b c d e (indices 1..6).
# No default_value is supplied, so out-of-vocabulary tokens map to -1.
table = lookup_ops.index_table_from_tensor(
    tf.constant(['PAD'] + list('aábcde'))
)

dataset = tf.contrib.data.Dataset.from_tensor_slices(
    tf.constant(["abá", "acű", "bcd"])
)
# NOTE(review): delimiter='' splits the string into single BYTES, not
# Unicode characters. Multi-byte UTF-8 characters such as 'á' and 'ű'
# become two byte tokens, neither of which is in the table — that is why
# the recorded output below shows pairs of -1 where 'á'/'ű' appear.
dataset = dataset.map(lambda string: tf.string_split([string], delimiter='').values)
dataset = dataset.map(lambda words: table.lookup(words))

# Pad every example to fixed length 6; the int64 padding default is 0,
# which coincides with the 'PAD' index.
batched = dataset.padded_batch(5, padded_shapes=(tf.TensorShape([6])))
batched_iter = batched.make_initializable_iterator()
bstart = batched_iter.get_next()

table_initializer = tf.tables_initializer()


# The lookup table and the iterator must both be initialized before use.
sess.run(table_initializer)

sess.run(batched_iter.initializer)

s = sess.run(bstart)

# Last expression of the cell: display the batched id matrix.
s


Out[2]:
array([[ 1,  3, -1, -1,  0,  0],
       [ 1,  4, -1, -1,  0,  0],
       [ 3,  4,  5,  0,  0,  0]])

Read data from a file using TextLineDataset

Create data file


In [3]:
# Write a two-line, tab-separated toy source/target file used by the
# TextLineDataset example. NOTE(review): the absolute path is hardcoded;
# consider a configurable data directory.
toy_lines = ["abc\tdef\n", "def\tábc\n"]
with open('/tmp/toy_data.txt', 'w') as toy_file:
    toy_file.writelines(toy_lines)

In [4]:
# Same pipeline as the previous cell, but reading lines from a file via
# TextLineDataset instead of an in-memory tensor.
tf.reset_default_graph()

sess = tf.InteractiveSession()


# Vocabulary: index 0 = 'PAD', then a á b c d e (indices 1..6).
# Out-of-vocabulary tokens map to -1 (no default_value supplied).
table = lookup_ops.index_table_from_tensor(
    tf.constant(['PAD'] + list('aábcde'))
)

dataset = tf.contrib.data.TextLineDataset('/tmp/toy_data.txt')
# Each line is "source\ttarget"; split on the tab into a 2-element vector.
dataset = dataset.map(lambda string: tf.string_split([string], delimiter='\t').values)
source = dataset.map(lambda string: string[0])
target = dataset.map(lambda string: string[1])
# Only the target side flows onward; `source` is constructed but unused here.
dataset = target
# NOTE(review): delimiter='' splits into single BYTES, not Unicode
# characters — the two UTF-8 bytes of 'á' each miss the table, producing
# the pair of -1s in the recorded output below; 'f' is simply OOV → -1.
dataset = dataset.map(lambda string: tf.string_split([string], delimiter='').values)
dataset = dataset.map(lambda words: table.lookup(words))

# Fixed-length padding to 6; the int64 padding default 0 equals the 'PAD' index.
batched = dataset.padded_batch(5, padded_shapes=(tf.TensorShape([6])))
batched_iter = batched.make_initializable_iterator()
bstart = batched_iter.get_next()

table_initializer = tf.tables_initializer()


# Initialize the lookup table and the iterator before pulling a batch.
sess.run(table_initializer)

sess.run(batched_iter.initializer)

s = sess.run(bstart)

# Last expression of the cell: display the batched id matrix.
s


Out[4]:
array([[ 5,  6, -1,  0,  0,  0],
       [-1, -1,  3,  4,  0,  0]])