In [ ]:
import tensorflow as tf
import numpy as np
import pandas as pd

Categorical Columns Don't Just One-Hot, They Count.


In [ ]:
tf.reset_default_graph()

Here we have 3 examples, each containing 5 strings.

Note: empty strings are treated as missing, so they can be used as padding.


In [ ]:
strings = np.array([['a','a','','b','c'],['a','c','zz','',''],['b','qq','qq','b','']])

Now we define a categorical column to represent it.


In [ ]:
vocab = np.array(['a','b','c','<UNK>'], dtype=object)
# Out-of-vocabulary strings ('zz', 'qq') map to the last index, '<UNK>'.
sparse = tf.feature_column.categorical_column_with_vocabulary_list(
    'strings', vocab, default_value=len(vocab)-1)

Use indicator_column to define a dense representation, and input_layer to build the conversion operations.


In [ ]:
layer = tf.feature_column.input_layer({'strings':strings}, [tf.feature_column.indicator_column(sparse)])

In [ ]:
with tf.train.MonitoredSession() as sess:
    input_value = sess.run(layer)

In [ ]:
pd.DataFrame(input_value, columns=vocab)
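
Sanity check: assuming the vocabulary order above, we can recompute the expected counts directly in NumPy. OOV strings like 'zz' and 'qq' should land in the '<UNK>' bucket, and empty strings are skipped.


In [ ]:
# Recompute per-example counts by hand to confirm the indicator_column output.
expected = np.zeros((len(strings), len(vocab)))
for i, row in enumerate(strings):
    for s in row:
        if s == '':
            continue  # empty strings are padding and are skipped
        matches = np.where(vocab == s)[0]
        # strings outside the vocabulary count toward '<UNK>'
        expected[i, matches[0] if len(matches) else len(vocab) - 1] += 1
expected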

Embedding Columns Reduce Over Entries, Using the Combiner


In [ ]:
tf.reset_default_graph()

In [ ]:
layer = tf.feature_column.input_layer(
    {'strings':strings},
    [tf.feature_column.embedding_column(sparse,10, combiner='mean')])

In [ ]:
init = tf.global_variables_initializer()
with tf.train.MonitoredSession() as sess:
    sess.run(init)
    input_value = sess.run(layer)

The result is three 10-dimensional embeddings, of shape (3, 10), because the combiner takes the mean over the strings in each example.


In [ ]:
input_value.shape

In [ ]:
input_value

To skip the reduce?

Use sequence_input_from_feature_columns

Note: this is only compatible with contrib feature_columns.


In [ ]:
tf.reset_default_graph()

In [ ]:
vocab

In [ ]:
sparse = tf.contrib.layers.sparse_column_with_keys('strings', vocab)

In [ ]:
layer = tf.contrib.layers.sequence_input_from_feature_columns(
    {'strings':tf.constant(strings)},
    [tf.contrib.layers.embedding_column(sparse ,10)])

If we run this, it gives a 10-dimensional embedding for each of the 5 strings in each of the 3 examples, for a shape of (3, 5, 10).


In [ ]:
init = tf.global_variables_initializer()
with tf.train.MonitoredSession() as sess:
    sess.run(init)
    input_value = sess.run(layer)

In [ ]:
input_value.shape

Or use some careful reshaping: flatten the batch to one string per row, embed each string individually, then restore the original shape with the embedding dimension appended.


In [ ]:
tf.reset_default_graph()

In [ ]:
shape = tf.shape(strings)
embedding_dim = 10

# Re-create the core feature column: the contrib column defined above is
# not compatible with tf.feature_column.input_layer.
sparse = tf.feature_column.categorical_column_with_vocabulary_list(
    'strings', vocab, default_value=len(vocab)-1)

# Flatten to a 1-d batch of single strings, embed, then restore the shape.
layer = tf.feature_column.input_layer(
    {'strings': tf.reshape(strings, [tf.reduce_prod(shape)])},
    [tf.feature_column.embedding_column(sparse, embedding_dim, combiner='mean')])

layer = tf.reshape(layer, tf.concat([shape, [embedding_dim]], 0))

In [ ]:
init = tf.global_variables_initializer()
with tf.train.MonitoredSession() as sess:
    sess.run(init)
    input_value = sess.run(layer)

In [ ]:
input_value.shape
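
As a quick check, the reshaped result should match the sequence version above: one 10-dimensional vector per string.


In [ ]:
# One embedding per string: 3 examples x 5 strings x 10 dimensions.
assert input_value.shape == (3, 5, 10)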

Linear Bag of Words Model


In [ ]:
import tensorflow as tf
from tensorflow.contrib import keras
import numpy as np



In [ ]:
tf.reset_default_graph()

Load the IMDB dataset


In [ ]:
NUM_WORDS=1000 # only use the top 1000 words
MAX_LEN=250    # drop reviews longer than 250 words
INDEX_FROM=3   # word index offset (ids 0-2 are reserved)

In [ ]:
train,test = keras.datasets.imdb.load_data(maxlen=MAX_LEN, num_words=NUM_WORDS, index_from=INDEX_FROM)
train_x,train_y = train
test_x,test_y = test

Look at an example review

(Punctuation and capitalization are stripped)


In [ ]:
word_to_id = keras.datasets.imdb.get_word_index()
word_to_id = {k:(v+INDEX_FROM) for k,v in word_to_id.items()}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = 1
word_to_id["<UNK>"] = 2

id_to_word = {value:key for key,value in word_to_id.items()}
print(' '.join(id_to_word[i] for i in train_x[0]))
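
The INDEX_FROM offset exists because the low ids are reserved for the special tokens; a quick round-trip check of the reserved ids:


In [ ]:
# The special tokens occupy ids 0, 1, and 2.
[(tok, word_to_id[tok]) for tok in ['<PAD>', '<START>', '<UNK>']]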

Create the Input Function


In [ ]:
def get_input_fn(x_in, y_in, shuffle=True, epochs=1):
    def input_fn():
        ys = tf.contrib.data.Dataset.from_tensor_slices(y_in)
        
        # Convert the ragged x_in to sparse-tensor components: for
        # example n, emit indices (n, 0), (n, 1), ... plus the word ids.
        nested_sparse = [
            (np.array([[n]*len(x), list(range(len(x)))]).T, x)
            for n, x in enumerate(x_in)
        ]
        
        indices = np.concatenate([idx for idx, value in nested_sparse], axis=0)
        values = np.concatenate([value for idx, value in nested_sparse], axis=0)
        
        max_len = max(len(ex) for ex in x_in)
        # Use len(x_in), not a hard-coded row count: maxlen filtering means
        # the training set may hold fewer than 25000 reviews.
        xs = tf.SparseTensor(indices=indices, values=values,
                             dense_shape=[len(x_in), max_len])
        
        xs = tf.contrib.data.Dataset.from_sparse_tensor_slices(xs)
        
        # Densify each example; shorter reviews are zero-padded.
        xs = xs.map(lambda *x: tf.sparse_tensor_to_dense(tf.SparseTensor(*x)))
        
        ds = tf.contrib.data.Dataset.zip((xs, ys)).repeat(epochs)
        
        if shuffle:
            ds = ds.shuffle(10000)
            
        ds = ds.batch(32)

        x, y = ds.make_one_shot_iterator().get_next()

        return {'word_ids': x}, y
        
    return input_fn

Test the input function


In [ ]:
in_fn = get_input_fn(x_in = np.array([[1,1,1],[2,2],[3,3,3],[4,4,4,4],[5],[6,6],[7,7,7,7,7]]), 
                     y_in = np.array([1,2,3,4,5,6,7]))

x,y = in_fn()

In [ ]:
init = tf.global_variables_initializer()

with tf.train.MonitoredSession() as sess:
    sess.run(init)
    x,y = sess.run([x,y])

Note the zero padding, and that x and y have the same shuffle applied.


In [ ]:
x['word_ids']

In [ ]:
y[:,None]

Build the Estimator


In [ ]:
# One category per word id; valid ids are in [0, NUM_WORDS).
word_ids = tf.feature_column.categorical_column_with_identity('word_ids', NUM_WORDS)

In [ ]:
bow_estimator = tf.contrib.learn.LinearClassifier(feature_columns=[word_ids], model_dir='tensorboard/BOW')
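
Conceptually, the linear model scores a review by summing one learned weight per word id, so a word that appears twice contributes its weight twice. A minimal NumPy sketch with hypothetical stand-in weights (not the estimator's actual internals):


In [ ]:
# Hypothetical stand-in for the learned weight vector and bias.
weights = np.random.randn(NUM_WORDS)
bias = 0.0

def bow_score(word_ids_row):
    # Every id in the padded row contributes its weight once per occurrence.
    return bias + weights[word_ids_row].sum()

bow_score(x['word_ids'][0])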

In [ ]:
# Alternate between 10 training epochs and a test-set evaluation.
for n in range(25):
    bow_estimator.fit(input_fn=get_input_fn(train_x, train_y, epochs=10))
    bow_estimator.evaluate(input_fn=get_input_fn(test_x, test_y, epochs=1))

DNN Bag of Words?


In [ ]:
tf.reset_default_graph()

In [ ]:
DNN_bow_estimator = tf.contrib.learn.DNNClassifier(
    [256, 256],  model_dir='tensorboard/DNN_BOW',
    feature_columns=[tf.feature_column.embedding_column(word_ids, 30, combiner='mean')])
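
Here each review is reduced to the mean of its 30-dimensional word embeddings (a continuous bag of words) before the two 256-unit hidden layers. A minimal NumPy sketch of that input representation, using a hypothetical stand-in embedding table:


In [ ]:
# Hypothetical stand-in for the learned 30-d embedding table.
embeddings = np.random.randn(NUM_WORDS, 30)
review_vector = embeddings[train_x[0]].mean(axis=0)  # shape (30,)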

In [ ]:
for n in range(25):
    DNN_bow_estimator.fit(input_fn=get_input_fn(train_x, train_y, epochs=10))
    DNN_bow_estimator.evaluate(input_fn=get_input_fn(test_x, test_y, epochs=1))