In [ ]:
import tensorflow as tf
import numpy as np
import pandas as pd

Categorical Columns Don't Just One-Hot, They Count.


In [ ]:
tf.reset_default_graph()

Here we have 3 examples, each containing 5 strings.

Note: empty strings are treated as missing, so they can be used as padding.


In [ ]:
strings = np.array([['a','a','','b','c'],['a','c','zz','',''],['b','qq','qq','b','']])

Now we define a categorical column to represent it.


In [ ]:
vocab = np.array(['a','b','c','<UNK>'], dtype=object)
# Out-of-vocabulary strings ('zz', 'qq') map to the last index, '<UNK>'.
sparse = tf.feature_column.categorical_column_with_vocabulary_list(
    'strings', vocab, default_value=len(vocab)-1)

Use indicator_column to define a dense representation, and input_layer to build the conversion operations.


In [ ]:
layer = tf.feature_column.input_layer({'strings':strings}, [tf.feature_column.indicator_column(sparse)])

In [ ]:
with tf.train.MonitoredSession() as sess:
    input_value = sess.run(layer)

In [ ]:
pd.DataFrame(input_value, columns=vocab)
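
Sanity check: assuming the vocabulary order above, we can recompute the expected counts directly in NumPy. OOV strings like 'zz' and 'qq' should land in the '<UNK>' bucket, and empty strings are skipped.


In [ ]:
# Recompute per-example counts by hand to confirm the indicator_column output.
expected = np.zeros((len(strings), len(vocab)))
for i, row in enumerate(strings):
    for s in row:
        if s == '':
            continue  # empty strings are padding and are skipped
        matches = np.where(vocab == s)[0]
        # strings outside the vocabulary count toward '<UNK>'
        expected[i, matches[0] if len(matches) else len(vocab) - 1] += 1
expected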

Embedding Columns Reduce Over Entries, Using the Combiner


In [ ]:
tf.reset_default_graph()

In [ ]:
layer = tf.feature_column.input_layer(
    {'strings':strings},
    [tf.feature_column.embedding_column(sparse,10, combiner='mean')])

In [ ]:
init = tf.global_variables_initializer()
with tf.train.MonitoredSession() as sess:
    sess.run(init)
    input_value = sess.run(layer)

The result is three 10-dimensional embeddings, of shape (3, 10), because the combiner takes the mean over the strings in each example.


In [ ]:
input_value.shape

In [ ]:
input_value

To skip the reduce?

Use sequence_input_from_feature_columns

Note: this is only compatible with contrib feature_columns.


In [ ]:
tf.reset_default_graph()

In [ ]:
vocab

In [ ]:
sparse = tf.contrib.layers.sparse_column_with_keys('strings', vocab)

In [ ]:
layer = tf.contrib.layers.sequence_input_from_feature_columns(
    {'strings':tf.constant(strings)},
    [tf.contrib.layers.embedding_column(sparse ,10)])

If we run this, it gives a 10-dimensional embedding for each of the 5 strings in each of the 3 examples, for a shape of (3, 5, 10).


In [ ]:
init = tf.global_variables_initializer()
with tf.train.MonitoredSession() as sess:
    sess.run(init)
    input_value = sess.run(layer)

In [ ]:
input_value.shape

Or use some careful reshaping: flatten the batch to one string per row, embed each string individually, then restore the original shape with the embedding dimension appended.


In [ ]:
tf.reset_default_graph()

In [ ]:
shape = tf.shape(strings)
embedding_dim = 10

# Re-create the core feature column: the contrib column defined above is
# not compatible with tf.feature_column.input_layer.
sparse = tf.feature_column.categorical_column_with_vocabulary_list(
    'strings', vocab, default_value=len(vocab)-1)

# Flatten to a 1-d batch of single strings, embed, then restore the shape.
layer = tf.feature_column.input_layer(
    {'strings': tf.reshape(strings, [tf.reduce_prod(shape)])},
    [tf.feature_column.embedding_column(sparse, embedding_dim, combiner='mean')])

layer = tf.reshape(layer, tf.concat([shape, [embedding_dim]], 0))

In [ ]:
init = tf.global_variables_initializer()
with tf.train.MonitoredSession() as sess:
    sess.run(init)
    input_value = sess.run(layer)

In [ ]:
input_value.shape
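
As a quick check, the reshaped result should match the sequence version above: one 10-dimensional vector per string.


In [ ]:
# One embedding per string: 3 examples x 5 strings x 10 dimensions.
assert input_value.shape == (3, 5, 10)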

Linear Bag of Words Model


In [ ]:
import tensorflow as tf
from tensorflow.contrib import keras
import numpy as np



In [ ]:
tf.reset_default_graph()

Load the IMDB dataset


In [ ]:
NUM_WORDS=1000 # only use the top 1000 words
MAX_LEN=250    # drop reviews longer than 250 words
INDEX_FROM=3   # word index offset (ids 0-2 are reserved)

In [ ]:
train,test = keras.datasets.imdb.load_data(maxlen=MAX_LEN, num_words=NUM_WORDS, index_from=INDEX_FROM)
train_x,train_y = train
test_x,test_y = test

Look at an example review

(Punctuation and capitalization are stripped)


In [ ]:
word_to_id = keras.datasets.imdb.get_word_index()
word_to_id = {k:(v+INDEX_FROM) for k,v in word_to_id.items()}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = 1
word_to_id["<UNK>"] = 2

id_to_word = {value:key for key,value in word_to_id.items()}
print(' '.join(id_to_word[i] for i in train_x[0]))
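
The INDEX_FROM offset exists because the low ids are reserved for the special tokens; a quick round-trip check of the reserved ids:


In [ ]:
# The special tokens occupy ids 0, 1, and 2.
[(tok, word_to_id[tok]) for tok in ['<PAD>', '<START>', '<UNK>']]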

Create the Input Function


In [ ]:
def get_input_fn(x_in, y_in, shuffle=True, epochs=1):
    def input_fn():
        ys = tf.contrib.data.Dataset.from_tensor_slices(y_in)
        
        # Convert the ragged x_in to sparse-tensor components: for
        # example n, emit indices (n, 0), (n, 1), ... plus the word ids.
        nested_sparse = [
            (np.array([[n]*len(x), list(range(len(x)))]).T, x)
            for n, x in enumerate(x_in)
        ]
        
        indices = np.concatenate([idx for idx, value in nested_sparse], axis=0)
        values = np.concatenate([value for idx, value in nested_sparse], axis=0)
        
        max_len = max(len(ex) for ex in x_in)
        # Use len(x_in), not a hard-coded row count: maxlen filtering means
        # the training set may hold fewer than 25000 reviews.
        xs = tf.SparseTensor(indices=indices, values=values,
                             dense_shape=[len(x_in), max_len])
        
        xs = tf.contrib.data.Dataset.from_sparse_tensor_slices(xs)
        
        # Densify each example; shorter reviews are zero-padded.
        xs = xs.map(lambda *x: tf.sparse_tensor_to_dense(tf.SparseTensor(*x)))
        
        ds = tf.contrib.data.Dataset.zip((xs, ys)).repeat(epochs)
        
        if shuffle:
            ds = ds.shuffle(10000)
            
        ds = ds.batch(32)

        x, y = ds.make_one_shot_iterator().get_next()

        return {'word_ids': x}, y
        
    return input_fn

Test the input function


In [ ]:
in_fn = get_input_fn(x_in = np.array([[1,1,1],[2,2],[3,3,3],[4,4,4,4],[5],[6,6],[7,7,7,7,7]]), 
                     y_in = np.array([1,2,3,4,5,6,7]))

x,y = in_fn()

In [ ]:
init = tf.global_variables_initializer()

with tf.train.MonitoredSession() as sess:
    sess.run(init)
    x,y = sess.run([x,y])

Note the zero padding, and that x and y have the same shuffle applied.


In [ ]:
x['word_ids']

In [ ]:
y[:,None]

Build the Estimator


In [ ]:
# One category per word id; valid ids are in [0, NUM_WORDS).
word_ids = tf.feature_column.categorical_column_with_identity('word_ids', NUM_WORDS)

In [ ]:
bow_estimator = tf.contrib.learn.LinearClassifier(feature_columns=[word_ids], model_dir='tensorboard/BOW')
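
Conceptually, the linear model scores a review by summing one learned weight per word id, so a word that appears twice contributes its weight twice. A minimal NumPy sketch with hypothetical stand-in weights (not the estimator's actual internals):


In [ ]:
# Hypothetical stand-in for the learned weight vector and bias.
weights = np.random.randn(NUM_WORDS)
bias = 0.0

def bow_score(word_ids_row):
    # Every id in the padded row contributes its weight once per occurrence.
    return bias + weights[word_ids_row].sum()

bow_score(x['word_ids'][0])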

In [ ]:
# Alternate between 10 training epochs and a test-set evaluation.
for n in range(25):
    bow_estimator.fit(input_fn=get_input_fn(train_x, train_y, epochs=10))
    bow_estimator.evaluate(input_fn=get_input_fn(test_x, test_y, epochs=1))

DNN Bag of Words?


In [ ]:
tf.reset_default_graph()

In [ ]:
DNN_bow_estimator = tf.contrib.learn.DNNClassifier(
    [256, 256],  model_dir='tensorboard/DNN_BOW',
    feature_columns=[tf.feature_column.embedding_column(word_ids, 30, combiner='mean')])
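
Here each review is reduced to the mean of its 30-dimensional word embeddings (a continuous bag of words) before the two 256-unit hidden layers. A minimal NumPy sketch of that input representation, using a hypothetical stand-in embedding table:


In [ ]:
# Hypothetical stand-in for the learned 30-d embedding table.
embeddings = np.random.randn(NUM_WORDS, 30)
review_vector = embeddings[train_x[0]].mean(axis=0)  # shape (30,)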

In [ ]:
for n in range(25):
    DNN_bow_estimator.fit(input_fn=get_input_fn(train_x, train_y, epochs=10))
    DNN_bow_estimator.evaluate(input_fn=get_input_fn(test_x, test_y, epochs=1))