In [ ]:
import tensorflow as tf
import numpy as np
import pandas as pd
In [ ]:
tf.reset_default_graph()
Here we have 3 examples, each containing 5 strings.
Note: empty strings are ignored, so they can be used as padding.
In [ ]:
strings = np.array([['a','a','','b','c'],['a','c','zz','',''],['b','qq','qq','b','']])
Now we define a categorical column to represent it.
In [ ]:
vocab = np.array(['a','b','c','<UNK>'], dtype=object)
# Out-of-vocabulary strings ('zz', 'qq') map to the last bucket, '<UNK>'.
sparse = tf.feature_column.categorical_column_with_vocabulary_list(
    'strings', vocab, default_value=len(vocab)-1)
Use indicator_column to define a dense representation, and input_layer to build the conversion operations.
In [ ]:
layer = tf.feature_column.input_layer({'strings':strings}, [tf.feature_column.indicator_column(sparse)])
In [ ]:
with tf.train.MonitoredSession() as sess:
    input_value = sess.run(layer)
In [ ]:
pd.DataFrame(input_value, columns=vocab)
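Each row is a count vector over the vocabulary: for the first example, ['a','a','','b','c'], we expect 2 for 'a', 1 for 'b', 1 for 'c' and 0 for '<UNK>', with the empty strings dropped; the second example contains one out-of-vocabulary string ('zz'), which lands in the '<UNK>' bucket. A quick sanity check of those expected counts (assuming the indicator column sums the per-string one-hot vectors):
In [ ]:
# Illustrative check of the multi-hot counts.
np.testing.assert_array_equal(input_value[0], [2., 1., 1., 0.])  # 'a','a','b','c'
np.testing.assert_array_equal(input_value[1], [1., 0., 1., 1.])  # 'a','c','zz' -> <UNK>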
In [ ]:
tf.reset_default_graph()
In [ ]:
layer = tf.feature_column.input_layer(
    {'strings': strings},
    [tf.feature_column.embedding_column(sparse, 10, combiner='mean')])
In [ ]:
init = tf.global_variables_initializer()
with tf.train.MonitoredSession() as sess:
    sess.run(init)
    input_value = sess.run(layer)
The result is three 10-dimensional embeddings, one per example, because the 'mean' combiner averages over the non-empty strings in each example.
In [ ]:
input_value.shape
In [ ]:
input_value
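The 'mean' combiner averages the embedding vectors of the non-empty strings in each row. A small numpy sketch of that reduction, using a made-up embedding matrix rather than the one TensorFlow initialized above:
In [ ]:
# Illustrative only: emulate combiner='mean' for the first example ['a','a','','b','c'].
toy_emb = np.random.randn(len(vocab), 10)   # stand-in for the learned embedding matrix
ids = [0, 0, 1, 2]                          # 'a','a','b','c'; the empty strings are dropped
toy_row = toy_emb[ids].mean(axis=0)         # what input_layer computes for that row
toy_row.shape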
In [ ]:
tf.reset_default_graph()
In [ ]:
vocab
In [ ]:
sparse = tf.contrib.layers.sparse_column_with_keys('strings', vocab)
In [ ]:
layer = tf.contrib.layers.sequence_input_from_feature_columns(
    {'strings': tf.constant(strings)},
    [tf.contrib.layers.embedding_column(sparse, 10)])
Running this gives a 10-dimensional embedding for each of the 5 strings in each of the 3 examples.
In [ ]:
init = tf.global_variables_initializer()
with tf.train.MonitoredSession() as sess:
    sess.run(init)
    input_value = sess.run(layer)
In [ ]:
input_value.shape
In [ ]:
tf.reset_default_graph()
In [ ]:
# sparse was redefined with the contrib API above, so rebuild the core column.
sparse = tf.feature_column.categorical_column_with_vocabulary_list(
    'strings', vocab, default_value=len(vocab)-1)
# Flatten to one string per row, embed it (the mean over a single string is just
# that string's embedding), then reshape back to [batch, sequence, embedding_dim].
shape = tf.shape(strings)
embedding_dim = 10
layer = tf.feature_column.input_layer(
    {'strings': tf.reshape(strings, [tf.reduce_prod(shape)])},
    [tf.feature_column.embedding_column(sparse, embedding_dim, combiner='mean')])
layer = tf.reshape(layer, tf.concat([shape, [embedding_dim]], 0))
In [ ]:
init = tf.global_variables_initializer()
with tf.train.MonitoredSession() as sess:
    sess.run(init)
    input_value = sess.run(layer)
In [ ]:
input_value.shape
In [3]:
import tensorflow as tf
from tensorflow.contrib import keras
import numpy as np
In [ ]:
tf.reset_default_graph()
In [ ]:
NUM_WORDS = 1000  # only use the top 1000 words
MAX_LEN = 250     # drop reviews longer than 250 words (load_data filters, it does not truncate)
INDEX_FROM = 3    # word index offset
In [ ]:
train,test = keras.datasets.imdb.load_data(maxlen=MAX_LEN, num_words=NUM_WORDS, index_from=INDEX_FROM)
train_x,train_y = train
test_x,test_y = test
In [ ]:
word_to_id = keras.datasets.imdb.get_word_index()
word_to_id = {k:(v+INDEX_FROM) for k,v in word_to_id.items()}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = 1
word_to_id["<UNK>"] = 2
id_to_word = {value:key for key,value in word_to_id.items()}
print(' '.join(id_to_word[id] for id in train_x[0] ))
In [ ]:
def get_input_fn(x_in, y_in, shuffle=True, epochs=1):
    def input_fn():
        ys = tf.contrib.data.Dataset.from_tensor_slices(y_in)
        # Convert the ragged x_in (a list of variable-length word-id lists)
        # to a single SparseTensor with one row per example.
        nested_sparse = [
            (np.array([[n]*len(x), range(len(x))]).T, x)
            for n, x in enumerate(x_in)
        ]
        indices = np.concatenate([idx for idx, value in nested_sparse], axis=0)
        values = np.concatenate([value for idx, value in nested_sparse], axis=0)
        max_len = max(len(ex) for ex in x_in)
        xs = tf.SparseTensor(indices=indices, values=values,
                             dense_shape=[len(x_in), max_len])
        xs = tf.contrib.data.Dataset.from_sparse_tensor_slices(xs)
        # Densify each row, zero-padding it out to max_len.
        xs = xs.map(lambda *x: tf.sparse_tensor_to_dense(tf.SparseTensor(*x)))
        ds = tf.contrib.data.Dataset.zip((xs, ys)).repeat(epochs)
        if shuffle:
            ds = ds.shuffle(10000)
        ds = ds.batch(32)
        x, y = ds.make_one_shot_iterator().get_next()
        return {'word_ids': x}, y
    return input_fn
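Building one big SparseTensor up front works, but for reference the same zero-padded batches can be produced more directly with the later tf.data API; a minimal sketch, assuming Dataset.from_generator and padded_batch are available in your TensorFlow version:
In [ ]:
# Alternative sketch, not used by the estimators below.
def get_padded_input_fn(x_in, y_in, batch_size=32):
    def input_fn():
        def gen():
            for x, y in zip(x_in, y_in):
                yield np.asarray(x, dtype=np.int64), np.int64(y)
        ds = tf.data.Dataset.from_generator(
            gen, (tf.int64, tf.int64),
            (tf.TensorShape([None]), tf.TensorShape([])))
        # padded_batch zero-pads each x to the longest sequence in its batch.
        ds = ds.padded_batch(batch_size, padded_shapes=([None], []))
        x, y = ds.make_one_shot_iterator().get_next()
        return {'word_ids': x}, y
    return input_fn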
In [ ]:
in_fn = get_input_fn(x_in=np.array([[1,1,1],[2,2],[3,3,3],[4,4,4,4],[5],[6,6],[7,7,7,7,7]]),
                     y_in=np.array([1,2,3,4,5,6,7]))
x,y = in_fn()
In [ ]:
init = tf.global_variables_initializer()
with tf.train.MonitoredSession() as sess:
    sess.run(init)
    x, y = sess.run([x, y])
Note the zero padding, and that x and y have the same shuffle applied.
In [ ]:
x['word_ids']
In [ ]:
y[:,None]
In [ ]:
word_ids = tf.feature_column.categorical_column_with_identity('word_ids', NUM_WORDS)
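categorical_column_with_identity maps each integer id straight to its own bucket, so the linear classifier below is a bag-of-words model with one weight per word. A small check of that mapping on a toy batch (assuming, as above, that an indicator column sums the per-word one-hot vectors):
In [ ]:
# Illustrative only: 5 buckets, one row containing the ids [1, 1, 4, 0].
with tf.Graph().as_default():
    toy = tf.feature_column.input_layer(
        {'word_ids': np.array([[1, 1, 4, 0]])},
        [tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_identity('word_ids', 5))])
    with tf.train.MonitoredSession() as sess:
        print(sess.run(toy))  # counts per id: [[1., 2., 0., 0., 1.]]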
In [ ]:
bow_estimator = tf.contrib.learn.LinearClassifier(feature_columns=[word_ids], model_dir='tensorboard/BOW')
In [ ]:
for n in range(25):
    bow_estimator.fit(input_fn=get_input_fn(train_x, train_y, epochs=10))
    bow_estimator.evaluate(input_fn=get_input_fn(test_x, test_y, epochs=1))
In [ ]:
tf.reset_default_graph()
In [ ]:
DNN_bow_estimator = tf.contrib.learn.DNNClassifier(
    [256, 256], model_dir='tensorboard/DNN_BOW',
    feature_columns=[tf.feature_column.embedding_column(word_ids, 30, combiner='mean')])
In [ ]:
for n in range(25):
    DNN_bow_estimator.fit(input_fn=get_input_fn(train_x, train_y, epochs=10))
    DNN_bow_estimator.evaluate(input_fn=get_input_fn(test_x, test_y, epochs=1))