In [ ]:
# %%bash

# pip install tensorflow==1.7
# pip install tensorflow-transform

Text Classification using TensorFlow and Google Cloud - Part 4

The bigquery-public-data:hacker_news dataset contains all stories and comments from Hacker News since its launch in 2006. Each story contains a story id, URL, the title of the story, the author who made the post, when it was written, and the number of points the story received.

The objective is to build an ML model that, given the title of a story, predicts the source of that story.

TF DNNClassifier with TF.IDF Text Representation

This notebook illustrates how to build a TF premade estimator, namely DNNClassifier, where the input text is represented as the TF.IDF features computed during the preprocessing phase in Part 1 (a sketch of that preprocessing follows the steps below). The overall steps are as follows:

  1. Define the metadata
  2. Define data input function
  3. Create feature columns (using the tfidf)
  4. Create the premade DNNClassifier estimator
  5. Setup experiment
    • Hyper-parameters & RunConfig
    • Serving function (for exported model)
    • TrainSpec & EvalSpec
  6. Run experiment
  7. Evaluate the model
  8. Use SavedModel for prediction
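
For reference, a minimal sketch of the Part 1 preprocessing_fn that produced these features, assuming tft.string_to_int and tft.tfidf were used (the actual Part 1 code may differ; DELIMITERS and VOCAB_SIZE are defined in step 1 below, and 'bow'/'weight' are the transformed feature names this notebook consumes):

import tensorflow as tf
import tensorflow_transform as tft

def preprocessing_fn(input_features):
    text = input_features['title']
    # Split each title into tokens and map each token to an integer id;
    # tokens outside the top VOCAB_SIZE words fall into one OOV bucket
    tokens = tf.string_split(text, DELIMITERS)
    word_indices = tft.string_to_int(tokens, top_k=VOCAB_SIZE, num_oov_buckets=1)
    # Compute bag-of-words indices and the corresponding TF.IDF weights
    bow_indices, tfidf_weight = tft.tfidf(word_indices, VOCAB_SIZE + 1)
    return {
        'title': text,
        'bow': bow_indices,
        'weight': tfidf_weight,
        'source': input_features['source'],
    }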

Setting Global Parameters


In [1]:
import os

class Params:
    pass

# Set to run on GCP
Params.GCP_PROJECT_ID = 'ksalama-gcp-playground'
Params.REGION = 'europe-west1'
Params.BUCKET = 'ksalama-gcs-cloudml'

Params.PLATFORM = 'local' # local | GCP

Params.DATA_DIR = 'data/news' if Params.PLATFORM == 'local' else 'gs://{}/data/news'.format(Params.BUCKET)

Params.TRANSFORMED_DATA_DIR = os.path.join(Params.DATA_DIR, 'transformed')
Params.TRANSFORMED_TRAIN_DATA_FILE_PREFIX = os.path.join(Params.TRANSFORMED_DATA_DIR, 'train')
Params.TRANSFORMED_EVAL_DATA_FILE_PREFIX = os.path.join(Params.TRANSFORMED_DATA_DIR, 'eval')

Params.TEMP_DIR = os.path.join(Params.DATA_DIR, 'tmp')

Params.MODELS_DIR = 'models/news' if Params.PLATFORM == 'local' else 'gs://{}/models/news'.format(Params.BUCKET)

Params.TRANSFORM_ARTEFACTS_DIR = os.path.join(Params.MODELS_DIR,'transform')

Params.TRAIN = True

Params.RESUME_TRAINING = False

Params.EAGER = False

# Note: if Params.EAGER is set, eager execution is enabled right after
# TensorFlow is imported in the next cell.

Importing Libraries


In [2]:
import tensorflow as tf
from tensorflow import data


from tensorflow.contrib.learn.python.learn.utils import input_fn_utils
from tensorflow_transform.beam.tft_beam_io import transform_fn_io
from tensorflow_transform.tf_metadata import metadata_io
from tensorflow_transform.tf_metadata import dataset_schema
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.saved import saved_transform_io

if Params.EAGER:
    tf.enable_eager_execution()

print(tf.__version__)


WARNING:tensorflow:From /Users/khalidsalama/Technology/python-venvs/py27-venv/lib/python2.7/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
1.7.0

1. Define Metadata


In [3]:
RAW_HEADER = 'key,title,source'.split(',')
RAW_DEFAULTS = [['NA'],['NA'],['NA']]
TARGET_FEATURE_NAME = 'source'
TARGET_LABELS = ['github', 'nytimes', 'techcrunch']
TEXT_FEATURE_NAME = 'title'
KEY_COLUMN = 'key'

VOCAB_SIZE = 20000
TRAIN_SIZE = 73124
EVAL_SIZE = 23079

DELIMITERS = '.,!?() '

raw_metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema({
    KEY_COLUMN: dataset_schema.ColumnSchema(
        tf.string, [], dataset_schema.FixedColumnRepresentation()),
    TEXT_FEATURE_NAME: dataset_schema.ColumnSchema(
        tf.string, [], dataset_schema.FixedColumnRepresentation()),
    TARGET_FEATURE_NAME: dataset_schema.ColumnSchema(
        tf.string, [], dataset_schema.FixedColumnRepresentation()),
}))


transformed_metadata = metadata_io.read_metadata(
    os.path.join(Params.TRANSFORM_ARTEFACTS_DIR,"transformed_metadata"))

raw_feature_spec = raw_metadata.schema.as_feature_spec()
transformed_feature_spec = transformed_metadata.schema.as_feature_spec()

print(transformed_feature_spec)


{u'source': FixedLenFeature(shape=[], dtype=tf.string, default_value=None), u'title': FixedLenFeature(shape=[], dtype=tf.string, default_value=None), u'weight': VarLenFeature(dtype=tf.float32), u'bow': VarLenFeature(dtype=tf.int64)}

2. Define Input Function


In [4]:
def parse_tf_example(tf_example):
    # Parse a serialized tf.Example using the transformed feature spec,
    # then split off the target label
    parsed_features = tf.parse_single_example(serialized=tf_example, features=transformed_feature_spec)
    target = parsed_features.pop(TARGET_FEATURE_NAME)
    
    return parsed_features, target


def generate_tfrecords_input_fn(files_pattern,
                                mode=tf.estimator.ModeKeys.EVAL,
                                num_epochs=1,
                                batch_size=200):
    
    def _input_fn():
        
        file_names = data.Dataset.list_files(files_pattern)

        if Params.EAGER:
            print(file_names)

        dataset = data.TFRecordDataset(file_names)

        # Shuffle and repeat only in training; evaluation takes a single
        # ordered pass over the data.
        if mode == tf.estimator.ModeKeys.TRAIN:
            dataset = dataset.apply(
                tf.contrib.data.shuffle_and_repeat(count=num_epochs,
                                                   buffer_size=batch_size*2)
            )

        dataset = dataset.apply(
            tf.contrib.data.map_and_batch(parse_tf_example,
                                          batch_size=batch_size,
                                          num_parallel_batches=2)
        )

        dataset = dataset.prefetch(batch_size)

        if Params.EAGER:
            return dataset

        iterator = dataset.make_one_shot_iterator()
        features, target = iterator.get_next()
        return features, target
    
    return _input_fn
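
A quick smoke test of the input function (a sketch; it assumes the transformed TFRecords from Part 1 exist under Params.TRANSFORMED_DATA_DIR):

sample_input_fn = generate_tfrecords_input_fn(
    Params.TRANSFORMED_TRAIN_DATA_FILE_PREFIX + "*",
    mode=tf.estimator.ModeKeys.EVAL,
    num_epochs=1,
    batch_size=2)

features, target = sample_input_fn()
with tf.Session() as sess:
    batch_features, batch_target = sess.run([features, target])
    print(batch_target)           # e.g. ['github' 'nytimes']
    print(batch_features['bow'])  # a SparseTensorValue of word indices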

3. Create feature columns


In [5]:
BOW_FEATURE_NAME = 'bow'
TFIDF_FEATURE_NAME = 'weight'

def create_feature_columns():
    
    # Get word indices from the 'bow' feature
    bow = tf.feature_column.categorical_column_with_identity(
      BOW_FEATURE_NAME, num_buckets=VOCAB_SIZE + 1)
    
    # Attach the TF.IDF weights to the word indices
    weight_bow = tf.feature_column.weighted_categorical_column(
      bow, TFIDF_FEATURE_NAME)
    
    # Convert to an indicator column: a dense multi-hot vector scaled by the weights
    weight_bow_indicators = tf.feature_column.indicator_column(weight_bow)
    
    return [weight_bow_indicators]
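
To make the representation concrete, here is a small illustration with hypothetical toy values: the dense output has length VOCAB_SIZE + 1, with the TF.IDF weight of each present word placed at that word's index and zeros elsewhere.

toy_features = {
    'bow': tf.SparseTensor(indices=[[0, 0], [0, 1]],
                           values=tf.constant([3, 7], dtype=tf.int64),
                           dense_shape=[1, 2]),
    'weight': tf.SparseTensor(indices=[[0, 0], [0, 1]],
                              values=tf.constant([0.5, 1.2]),
                              dense_shape=[1, 2]),
}
dense_representation = tf.feature_column.input_layer(
    toy_features, create_feature_columns())
# After running in a session: dense_representation[0, 3] == 0.5,
# dense_representation[0, 7] == 1.2, and all other entries are 0.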

4. Create a model using a premade DNNClassifier


In [6]:
def create_estimator(hparams, run_config):
    
    feature_columns = create_feature_columns()
    
    optimizer = tf.train.AdamOptimizer(learning_rate=hparams.learning_rate)
    
    estimator = tf.estimator.DNNClassifier(
        feature_columns=feature_columns,
        n_classes=len(TARGET_LABELS),
        label_vocabulary=TARGET_LABELS,
        hidden_units=hparams.hidden_units,
        optimizer=optimizer,
        config=run_config
    )
    
    return estimator

5. Setup Experiment

5.1 HParams and RunConfig


In [7]:
NUM_EPOCHS = 10
BATCH_SIZE = 1000

TOTAL_STEPS = (TRAIN_SIZE // BATCH_SIZE) * NUM_EPOCHS
EVAL_EVERY_SEC = 60

hparams = tf.contrib.training.HParams(
    num_epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=0.01,
    hidden_units=[64, 32],
    max_steps=TOTAL_STEPS
)

MODEL_NAME = 'dnn_estimator_tfidf' 
model_dir = os.path.join(Params.MODELS_DIR, MODEL_NAME)

run_config = tf.estimator.RunConfig(
    tf_random_seed=19830610,
    log_step_count_steps=1000,
    save_checkpoints_secs=EVAL_EVERY_SEC,
    keep_checkpoint_max=1,
    model_dir=model_dir
)


print(hparams)
print("")
print("Model Directory:", run_config.model_dir)
print("Dataset Size:", TRAIN_SIZE)
print("Batch Size:", BATCH_SIZE)
print("Steps per Epoch:",TRAIN_SIZE/BATCH_SIZE)
print("Total Steps:", TOTAL_STEPS)


[('batch_size', 1000), ('hidden_units', [64, 32]), ('learning_rate', 0.01), ('max_steps', 730), ('num_epochs', 10), ('trainable_embedding', False)]

('Model Directory:', 'models/news/dnn_estimator_tfidf')
('Dataset Size:', 73124)
('Batch Size:', 1000)
('Steps per Epoch:', 73)
('Total Steps:', 730)

5.2 Serving function


In [8]:
def generate_serving_input_fn():
    
    def _serving_fn():
    
        receiver_tensor = {
          'title': tf.placeholder(dtype=tf.string, shape=[None])
        }

        _, transformed_features = (
            saved_transform_io.partially_apply_saved_transform(
                os.path.join(Params.TRANSFORM_ARTEFACTS_DIR, transform_fn_io.TRANSFORM_FN_DIR),
            receiver_tensor)
        )
        
        return tf.estimator.export.ServingInputReceiver(
            transformed_features, receiver_tensor)
    
    return _serving_fn
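
The serving function re-applies the saved tf.Transform graph to raw 'title' strings, so clients send raw text rather than transformed features. The EvalSpec below wires it into a LatestExporter; equivalently, a trained estimator could be exported in one call (a sketch, assuming the `estimator` created in step 6 is in scope):

estimator.export_savedmodel(
    export_dir_base=os.path.join(model_dir, 'export', 'estimate'),
    serving_input_receiver_fn=generate_serving_input_fn())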

5.3 TrainSpec & EvalSpec


In [9]:
train_spec = tf.estimator.TrainSpec(
    input_fn = generate_tfrecords_input_fn(
        Params.TRANSFORMED_TRAIN_DATA_FILE_PREFIX+"*",
        mode = tf.estimator.ModeKeys.TRAIN,
        num_epochs=hparams.num_epochs,
        batch_size=hparams.batch_size
    ),
    max_steps=hparams.max_steps,
    hooks=None
)

eval_spec = tf.estimator.EvalSpec(
    input_fn = generate_tfrecords_input_fn(
        Params.TRANSFORMED_EVAL_DATA_FILE_PREFIX+"*",
        mode=tf.estimator.ModeKeys.EVAL,
        num_epochs=1,
        batch_size=hparams.batch_size
    ),
    exporters=[tf.estimator.LatestExporter(
        name="estimate", # the name of the folder in which the model will be exported to under export
        serving_input_receiver_fn=generate_serving_input_fn(),
        exports_to_keep=1,
        as_text=False)],
    steps=None,
    throttle_secs=EVAL_EVERY_SEC
)

6. Run experiment


In [10]:
from datetime import datetime
import shutil

if Params.TRAIN:
    if not Params.RESUME_TRAINING:
        print("Removing previous training artefacts...")
        shutil.rmtree(model_dir, ignore_errors=True)
    else:
        print("Resuming training...") 


    tf.logging.set_verbosity(tf.logging.INFO)

    time_start = datetime.utcnow() 
    print("Experiment started at {}".format(time_start.strftime("%H:%M:%S")))
    print(".......................................") 

    estimator = create_estimator(hparams, run_config)

    tf.estimator.train_and_evaluate(
        estimator=estimator,
        train_spec=train_spec, 
        eval_spec=eval_spec
    )

    time_end = datetime.utcnow() 
    print(".......................................")
    print("Experiment finished at {}".format(time_end.strftime("%H:%M:%S")))
    print("")
    time_elapsed = time_end - time_start
    print("Experiment elapsed time: {} seconds".format(time_elapsed.total_seconds()))
else:
    print "Training was skipped!"


Removing previous training artefacts...
Experiment started at 16:13:21
.......................................
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 60, '_session_config': None, '_keep_checkpoint_max': 1, '_tf_random_seed': 19830610, '_task_type': 'worker', '_global_id_in_cluster': 0, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11711e1d0>, '_model_dir': 'models/news/dnn_estimator_tfidf', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 1000, '_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_evaluation_master': '', '_service': None, '_save_summary_steps': 100, '_num_ps_replicas': 0}
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after 60 secs (eval_spec.throttle_secs) or training is finished.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into models/news/dnn_estimator_tfidf/model.ckpt.
INFO:tensorflow:loss = 1098.7266, step = 1
INFO:tensorflow:loss = 213.40088, step = 101 (15.307 sec)
INFO:tensorflow:loss = 147.65674, step = 201 (13.971 sec)
INFO:tensorflow:loss = 71.7646, step = 301 (15.121 sec)
INFO:tensorflow:Saving checkpoints for 392 into models/news/dnn_estimator_tfidf/model.ckpt.
INFO:tensorflow:Loss for final step: 26.048763.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-05-14-16:14:22
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models/news/dnn_estimator_tfidf/model.ckpt-392
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-05-14-16:14:25
INFO:tensorflow:Saving dict for global step 392: accuracy = 0.8243858, average_loss = 0.94847244, global_step = 392, loss = 912.07477
WARNING:tensorflow:Expected binary or unicode string, got type_url: "type.googleapis.com/tensorflow.AssetFileDef"
value: "\n\t\n\007Const:0\022\033vocab_string_to_int_uniques"

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Signatures INCLUDED in export for Classify: ['serving_default', 'classification']
INFO:tensorflow:Signatures INCLUDED in export for Regress: None
INFO:tensorflow:Signatures INCLUDED in export for Predict: ['predict']
INFO:tensorflow:Restoring parameters from models/news/dnn_estimator_tfidf/model.ckpt-392
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:Assets written to: models/news/dnn_estimator_tfidf/export/estimate/temp-1526314465/assets
INFO:tensorflow:SavedModel written to: models/news/dnn_estimator_tfidf/export/estimate/temp-1526314465/saved_model.pb
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models/news/dnn_estimator_tfidf/model.ckpt-392
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 393 into models/news/dnn_estimator_tfidf/model.ckpt.
INFO:tensorflow:loss = 27.088547, step = 393
INFO:tensorflow:loss = 2.9095829, step = 493 (13.979 sec)
INFO:tensorflow:loss = 4.3351374, step = 593 (13.651 sec)
INFO:tensorflow:loss = 11.017786, step = 693 (14.415 sec)
INFO:tensorflow:Saving checkpoints for 730 into models/news/dnn_estimator_tfidf/model.ckpt.
INFO:tensorflow:Loss for final step: 3.2552278.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-05-14-16:15:15
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models/news/dnn_estimator_tfidf/model.ckpt-730
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-05-14-16:15:17
INFO:tensorflow:Saving dict for global step 730: accuracy = 0.82416916, average_loss = 1.344607, global_step = 730, loss = 1293.0077
WARNING:tensorflow:Expected binary or unicode string, got type_url: "type.googleapis.com/tensorflow.AssetFileDef"
value: "\n\t\n\007Const:0\022\033vocab_string_to_int_uniques"

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Signatures INCLUDED in export for Classify: ['serving_default', 'classification']
INFO:tensorflow:Signatures INCLUDED in export for Regress: None
INFO:tensorflow:Signatures INCLUDED in export for Predict: ['predict']
INFO:tensorflow:Restoring parameters from models/news/dnn_estimator_tfidf/model.ckpt-730
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:Assets written to: models/news/dnn_estimator_tfidf/export/estimate/temp-1526314518/assets
INFO:tensorflow:SavedModel written to: models/news/dnn_estimator_tfidf/export/estimate/temp-1526314518/saved_model.pb
.......................................
Experiment finished at 16:15:18

Experiment elapsed time: 117.021302 seconds

7. Evaluate the model


In [11]:
tf.logging.set_verbosity(tf.logging.ERROR)

estimator = create_estimator(hparams, run_config)

train_metrics = estimator.evaluate(
    input_fn = generate_tfrecords_input_fn(
        files_pattern= Params.TRANSFORMED_TRAIN_DATA_FILE_PREFIX+"*", 
        mode= tf.estimator.ModeKeys.EVAL,
        batch_size= TRAIN_SIZE), 
    steps=1
)


print("############################################################################################")
print("# Train Measures: {}".format(train_metrics))
print("############################################################################################")

eval_metrics = estimator.evaluate(
    input_fn=generate_tfrecords_input_fn(
        files_pattern= Params.TRANSFORMED_EVAL_DATA_FILE_PREFIX+"*", 
        mode= tf.estimator.ModeKeys.EVAL,
        batch_size= EVAL_SIZE), 
    steps=1
)
print("")
print("############################################################################################")
print("# Eval Measures: {}".format(eval_metrics))
print("############################################################################################")


############################################################################################
# Train Measures: {'average_loss': 0.0037224626, 'accuracy': 0.99904275, 'global_step': 730, 'loss': 272.20135}
############################################################################################

############################################################################################
# Eval Measures: {'average_loss': 1.3446056, 'accuracy': 0.82416916, 'global_step': 730, 'loss': 31032.152}
############################################################################################

8. Use Saved Model for Predictions


In [12]:
import os

export_dir = os.path.join(model_dir, 'export', 'estimate')
# Pick the most recent (highest-timestamped) export directory
saved_model_dir = os.path.join(export_dir, sorted(os.listdir(export_dir))[-1])

print(saved_model_dir)
print("")

predictor_fn = tf.contrib.predictor.from_saved_model(
    export_dir = saved_model_dir,
    signature_def_key="predict"
)

output = predictor_fn(
    {
        'title':[
            'Microsoft and Google are joining forces for a new AI framework',
            'A new version of Python is mind blowing',
            'EU is investigating new data privacy policies'
        ]
        
    }
)
print(output)


models/news/dnn_estimator_tfidf/export/estimate/1526314518

{u'probabilities': array([[0.96217114, 0.01375495, 0.02407398],
       [0.02322701, 0.39720485, 0.5795681 ],
       [0.03017025, 0.9552083 , 0.01462139]], dtype=float32), u'class_ids': array([[0],
       [2],
       [1]]), u'classes': array([['github'],
       ['techcrunch'],
       ['nytimes']], dtype=object), u'logits': array([[ 2.4457023, -1.8020908, -1.2423583],
       [-2.1229138,  0.7162221,  1.0940531],
       [-0.9709409,  2.4841323, -1.6953117]], dtype=float32)}

In [ ]: