In [12]:
# https://github.com/tensorflow/tensorflow/blob/r0.11/tensorflow/examples/skflow/text_classification.py

import datetime as dt
import os
import sys

import numpy as np
import pandas
from sklearn import metrics
from sklearn.cross_validation import train_test_split
import tensorflow as tf
from tensorflow.contrib import learn
from tensorflow.contrib.learn import extract_pandas_data

In [2]:
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_bool('test_with_fake_data', False,
                         'Test the example code with fake data.')

In [3]:
MAX_DOCUMENT_LENGTH = 10
EMBEDDING_SIZE = 50
n_words = 0

In [4]:
def bag_of_words_model(x, y):
  """A bag-of-words model. Note it disregards the word order in the text."""
  target = tf.one_hot(y, 15, 1, 0)
  word_vectors = learn.ops.categorical_variable(x, n_classes=n_words,
      embedding_size=EMBEDDING_SIZE, name='words')
  features = tf.reduce_max(word_vectors, reduction_indices=1)
  prediction, loss = learn.models.logistic_regression(features, target)
  train_op = tf.contrib.layers.optimize_loss(
      loss, tf.contrib.framework.get_global_step(),
      optimizer='Adam', learning_rate=0.01)
  return {'class': tf.argmax(prediction, 1), 'prob': prediction}, loss, train_op

In [5]:
def rnn_model(x, y):
  """Recurrent neural network model to predict from sequence of words
  to a class."""
  # Convert indexes of words into embeddings.
  # This creates embeddings matrix of [n_words, EMBEDDING_SIZE] and then
  # maps word indexes of the sequence into [batch_size, sequence_length,
  # EMBEDDING_SIZE].
  word_vectors = learn.ops.categorical_variable(x, n_classes=n_words,
      embedding_size=EMBEDDING_SIZE, name='words')

  # Split into list of embedding per word, while removing doc length dim.
  # word_list results to be a list of tensors [batch_size, EMBEDDING_SIZE].
  word_list = tf.unpack(word_vectors, axis=1)

  # Create a Gated Recurrent Unit cell with hidden size of EMBEDDING_SIZE.
  cell = tf.nn.rnn_cell.GRUCell(EMBEDDING_SIZE)

  # Create an unrolled Recurrent Neural Networks to length of
  # MAX_DOCUMENT_LENGTH and passes word_list as inputs for each unit.
  _, encoding = tf.nn.rnn(cell, word_list, dtype=tf.float32)

  # Given encoding of RNN, take encoding of last step (e.g hidden size of the
  # neural network of last step) and pass it as features for logistic
  # regression over output classes.
  target = tf.one_hot(y, 15, 1, 0)
  prediction, loss = learn.models.logistic_regression(encoding, target)

  # Create a training op.
  train_op = tf.contrib.layers.optimize_loss(
      loss, tf.contrib.framework.get_global_step(),
      optimizer='Adam', learning_rate=0.01)

  return {'class': tf.argmax(prediction, 1), 'prob': prediction}, loss, train_op

In [8]:
df_tlg_bow = pandas.read_csv(os.path.expanduser('~/cltk_data/user_data/tlg_bow.csv'))
Y = df_tlg_bow['epithet']
X = df_tlg_bow.drop(['epithet', 'id', 'author'], 1)  #? column "Unnamed: 0"
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=0)

In [25]:
def df_to_tensor(df):
    np_array = df.as_matrix()
    tensor = tf.convert_to_tensor(np_array, dtype=tf.string)
    return tf.matmul(tensor, tensor) + tensor

In [37]:



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-37-b9a4bc08d1eb> in <module>()
----> 1 tf.matmul(tensor, tensor) + tensor

/root/venv/lib/python3.5/site-packages/tensorflow/python/ops/math_ops.py in matmul(a, b, transpose_a, transpose_b, a_is_sparse, b_is_sparse, name)
   1396                                    transpose_a=transpose_a,
   1397                                    transpose_b=transpose_b,
-> 1398                                    name=name)
   1399 
   1400 sparse_matmul = gen_math_ops._sparse_mat_mul

/root/venv/lib/python3.5/site-packages/tensorflow/python/ops/gen_math_ops.py in _mat_mul(a, b, transpose_a, transpose_b, name)
   1346   """
   1347   result = _op_def_lib.apply_op("MatMul", a=a, b=b, transpose_a=transpose_a,
-> 1348                                 transpose_b=transpose_b, name=name)
   1349   return result
   1350 

/root/venv/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py in apply_op(self, op_type_name, name, **keywords)
    571             for base_type in base_types:
    572               _SatisfiesTypeConstraint(base_type,
--> 573                                        _Attr(op_def, input_arg.type_attr))
    574             attrs[input_arg.type_attr] = attr_value
    575             inferred_from[input_arg.type_attr] = input_name

/root/venv/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py in _SatisfiesTypeConstraint(dtype, attr_def)
     58           "DataType %s for attr '%s' not in list of allowed values: %s" %
     59           (dtypes.as_dtype(dtype).name, attr_def.name,
---> 60            ", ".join(dtypes.as_dtype(x).name for x in allowed_list)))
     61 
     62 

TypeError: DataType string for attr 'T' not in list of allowed values: float16, float32, float64, int32, complex64, complex128

In [13]:
type(extract_pandas_data(y_test))


Out[13]:
pandas.core.series.Series

In [6]:
def main(x_train, x_test, y_train, y_test):
    global n_words
    
    t0 = dt.datetime.utcnow()
    
  # Prepare training and testing data
#   dbpedia = learn.datasets.load_dataset(
#       'dbpedia', test_with_fake_data=FLAGS.test_with_fake_data)
#   x_train = pandas.DataFrame(dbpedia.train.data)[1]
#   y_train = pandas.Series(dbpedia.train.target)
#   x_test = pandas.DataFrame(dbpedia.test.data)[1]
#   y_test = pandas.Series(dbpedia.test.target)

    # Process vocabulary
    vocab_processor = learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH)
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    x_test = np.array(list(vocab_processor.transform(x_test)))
    n_words = len(vocab_processor.vocabulary_)
    print('Total words: %d' % n_words)

    # Build model
    classifier = learn.Estimator(model_fn=bag_of_words_model)

    # Train and predict
    classifier.fit(x_train, y_train, steps=5)  #! was 100
    y_predicted = [
        p['class'] for p in classifier.predict(x_test, as_iterable=True)]
    score = metrics.accuracy_score(y_test, y_predicted)
    print('Accuracy: {0:f}'.format(score))
    
    print('... finished in {}'.format(dt.datetime.utcnow() - t0))

In [7]:
tf.app.run(x_train, x_test, y_train, y_test)


WARNING:tensorflow:Using temporary folder as model directory: /tmp/tmpwk5zdkxb
WARNING:tensorflow:Using default config.
Total words: 59206
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-7-44cda3e31e6a> in <module>()
----> 1 tf.app.run()

/root/venv/lib/python3.5/site-packages/tensorflow/python/platform/app.py in run(main)
     28   flags_passthrough = f._parse_flags()
     29   main = main or sys.modules['__main__'].main
---> 30   sys.exit(main(sys.argv[:1] + flags_passthrough))

<ipython-input-6-10374405484b> in main(unused_argv)
     28 
     29     # Train and predict
---> 30     classifier.fit(x_train, y_train, steps=100)
     31     y_predicted = [
     32         p['class'] for p in classifier.predict(x_test, as_iterable=True)]

/root/venv/lib/python3.5/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py in fit(self, x, y, input_fn, steps, batch_size, monitors, max_steps)
    331                              steps=steps,
    332                              monitors=monitors,
--> 333                              max_steps=max_steps)
    334     logging.info('Loss for final step: %s.', loss)
    335     return self

/root/venv/lib/python3.5/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py in _train_model(self, input_fn, steps, feed_fn, init_op, init_feed_fn, init_fn, device_fn, monitors, log_every_steps, fail_on_nan_loss, max_steps)
    660       features, targets = input_fn()
    661       self._check_inputs(features, targets)
--> 662       train_op, loss_op = self._get_train_ops(features, targets)
    663 
    664       # Add default monitors.

/root/venv/lib/python3.5/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py in _get_train_ops(self, features, targets)
    961       Tuple of train `Operation` and loss `Tensor`.
    962     """
--> 963     _, loss, train_op = self._call_model_fn(features, targets, ModeKeys.TRAIN)
    964     return train_op, loss
    965 

/root/venv/lib/python3.5/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py in _call_model_fn(self, features, targets, mode)
    945       else:
    946         return self._model_fn(features, targets, mode=mode)
--> 947     return self._model_fn(features, targets)
    948 
    949   def _get_train_ops(self, features, targets):

<ipython-input-4-7919983609da> in bag_of_words_model(x, y)
      1 def bag_of_words_model(x, y):
      2   """A bag-of-words model. Note it disregards the word order in the text."""
----> 3   target = tf.one_hot(y, 15, 1, 0)
      4   word_vectors = learn.ops.categorical_variable(x, n_classes=n_words,
      5       embedding_size=EMBEDDING_SIZE, name='words')

/root/venv/lib/python3.5/site-packages/tensorflow/python/ops/array_ops.py in one_hot(indices, depth, on_value, off_value, axis, dtype, name)
   2211 
   2212     return gen_array_ops._one_hot(indices, depth, on_value, off_value, axis,
-> 2213                                   name)
   2214 
   2215 

/root/venv/lib/python3.5/site-packages/tensorflow/python/ops/gen_array_ops.py in _one_hot(indices, depth, on_value, off_value, axis, name)
   1637   result = _op_def_lib.apply_op("OneHot", indices=indices, depth=depth,
   1638                                 on_value=on_value, off_value=off_value,
-> 1639                                 axis=axis, name=name)
   1640   return result
   1641 

/root/venv/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py in apply_op(self, op_type_name, name, **keywords)
    571             for base_type in base_types:
    572               _SatisfiesTypeConstraint(base_type,
--> 573                                        _Attr(op_def, input_arg.type_attr))
    574             attrs[input_arg.type_attr] = attr_value
    575             inferred_from[input_arg.type_attr] = input_name

/root/venv/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py in _SatisfiesTypeConstraint(dtype, attr_def)
     58           "DataType %s for attr '%s' not in list of allowed values: %s" %
     59           (dtypes.as_dtype(dtype).name, attr_def.name,
---> 60            ", ".join(dtypes.as_dtype(x).name for x in allowed_list)))
     61 
     62 

TypeError: DataType string for attr 'TI' not in list of allowed values: uint8, int32, int64

In [ ]: