In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import shutil
import math
import multiprocessing
from datetime import datetime
from tensorflow.python.feature_column import feature_column
print(tf.__version__)
In [2]:
MODEL_NAME = 'reg-model-01'
TRAIN_DATA_FILE = 'data/train-data.csv'
VALID_DATA_FILE = 'data/valid-data.csv'
TEST_DATA_FILE = 'data/test-data.csv'
RESUME_TRAINING = False
PROCESS_FEATURES = True
MULTI_THREADING = False
In [3]:
HEADER = ['key','x','y','alpha','beta','target']
HEADER_DEFAULTS = [[0], [0.0], [0.0], ['NA'], ['NA'], [0.0]]
NUMERIC_FEATURE_NAMES = ['x', 'y']
CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY = {'alpha':['ax01', 'ax02'], 'beta':['bx01', 'bx02']}
CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY.keys())
FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES
TARGET_NAME = 'target'
UNUSED_FEATURE_NAMES = list(set(HEADER) - set(FEATURE_NAMES) - {TARGET_NAME})
print("Header: {}".format(HEADER))
print("Numeric Features: {}".format(NUMERIC_FEATURE_NAMES))
print("Categorical Features: {}".format(CATEGORICAL_FEATURE_NAMES))
print("Target: {}".format(TARGET_NAME))
print("Unused Features: {}".format(UNUSED_FEATURE_NAMES))
In [4]:
def process_dataframe(dataset_df):
    # Engineered features: squares, interaction term, and |x - y| distance.
    dataset_df["x_2"] = np.square(dataset_df['x'])
    dataset_df["y_2"] = np.square(dataset_df['y'])
    dataset_df["xy"] = dataset_df['x'] * dataset_df['y']
    dataset_df['dist_xy'] = np.sqrt(np.square(dataset_df['x'] - dataset_df['y']))
    return dataset_df

def generate_pandas_input_fn(file_name, mode=tf.estimator.ModeKeys.EVAL,
                             skip_header_lines=0,
                             num_epochs=1,
                             batch_size=100):
    df_dataset = pd.read_csv(file_name, names=HEADER, skiprows=skip_header_lines)

    x = df_dataset[FEATURE_NAMES].copy()
    if PROCESS_FEATURES:
        x = process_dataframe(x)

    y = df_dataset[TARGET_NAME]

    # Shuffle only during training; keep order for evaluation and prediction.
    shuffle = (mode == tf.estimator.ModeKeys.TRAIN)

    num_threads = 1
    if MULTI_THREADING:
        num_threads = multiprocessing.cpu_count()
        # Each reader thread replays the data, so split the epochs among them.
        num_epochs = int(num_epochs / num_threads) if mode == tf.estimator.ModeKeys.TRAIN else num_epochs

    pandas_input_fn = tf.estimator.inputs.pandas_input_fn(
        batch_size=batch_size,
        num_epochs=num_epochs,
        shuffle=shuffle,
        x=x,
        y=y,
        target_column=TARGET_NAME
    )

    print("")
    print("* data input_fn:")
    print("================")
    print("Input file: {}".format(file_name))
    print("Dataset size: {}".format(len(df_dataset)))
    print("Batch size: {}".format(batch_size))
    print("Epoch Count: {}".format(num_epochs))
    print("Mode: {}".format(mode))
    print("Thread Count: {}".format(num_threads))
    print("Shuffle: {}".format(shuffle))
    print("================")
    print("")

    return pandas_input_fn
In [5]:
features, target = generate_pandas_input_fn(file_name=TRAIN_DATA_FILE)()
print("Feature read from DataFrame: {}".format(list(features.keys())))
print("Target read from DataFrame: {}".format(target))
In [6]:
def get_feature_columns():
    # Copy so the global NUMERIC_FEATURE_NAMES list is not mutated below.
    all_numeric_feature_names = list(NUMERIC_FEATURE_NAMES)

    CONSTRUCTED_NUMERIC_FEATURES_NAMES = ['x_2', 'y_2', 'xy', 'dist_xy']
    if PROCESS_FEATURES:
        all_numeric_feature_names += CONSTRUCTED_NUMERIC_FEATURES_NAMES

    numeric_columns = {feature_name: tf.feature_column.numeric_column(feature_name)
                       for feature_name in all_numeric_feature_names}

    categorical_column_with_vocabulary = \
        {name: tf.feature_column.categorical_column_with_vocabulary_list(name, vocabulary)
         for name, vocabulary in CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY.items()}

    feature_columns = {}
    feature_columns.update(numeric_columns)
    feature_columns.update(categorical_column_with_vocabulary)

    # Add extended features (crossing, bucketization, embedding).
    feature_columns['alpha_X_beta'] = tf.feature_column.crossed_column(
        [feature_columns['alpha'], feature_columns['beta']], 4)

    return feature_columns
feature_columns = get_feature_columns()
print("Feature Columns: {}".format(feature_columns))
In [7]:
def create_estimator(run_config, hparams):
    feature_columns = list(get_feature_columns().values())

    dense_columns = list(
        filter(lambda column: isinstance(column, feature_column._NumericColumn),
               feature_columns)
    )

    categorical_columns = list(
        filter(lambda column: isinstance(column, feature_column._VocabularyListCategoricalColumn) or
                              isinstance(column, feature_column._BucketizedColumn),
               feature_columns)
    )

    # One-hot encode the categorical columns so they can feed a DNN.
    indicator_columns = list(
        map(lambda column: tf.feature_column.indicator_column(column),
            categorical_columns)
    )

    estimator_feature_columns = dense_columns + indicator_columns

    estimator = tf.estimator.DNNRegressor(
        feature_columns=estimator_feature_columns,
        hidden_units=hparams.hidden_units,
        optimizer=tf.train.AdamOptimizer(),
        activation_fn=tf.nn.elu,
        dropout=hparams.dropout_prob,
        config=run_config
    )

    print("")
    print("Estimator Type: {}".format(type(estimator)))
    print("")

    return estimator
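Indicator columns are fine here because each vocabulary has only two values. A hedged alternative sketch for larger vocabularies, using embedding columns instead (dimension=4 is an arbitrary illustrative choice, not something the notebook uses):
embedding_columns = [
    tf.feature_column.embedding_column(column, dimension=4)
    for column in get_feature_columns().values()
    if isinstance(column, feature_column._VocabularyListCategoricalColumn)
]
print(embedding_columns)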
In [8]:
hparams = tf.contrib.training.HParams(
    num_epochs=100,
    batch_size=500,
    hidden_units=[8, 4],
    dropout_prob=0.0)
model_dir = 'trained_models/{}'.format(MODEL_NAME)
run_config = tf.estimator.RunConfig().replace(model_dir=model_dir)
print("Model directory: {}".format(run_config.model_dir))
print("Hyper-parameters: {}".format(hparams))
In [9]:
estimator = create_estimator(run_config, hparams)
In [10]:
train_input_fn = generate_pandas_input_fn(file_name=TRAIN_DATA_FILE,
                                          mode=tf.estimator.ModeKeys.TRAIN,
                                          num_epochs=hparams.num_epochs,
                                          batch_size=hparams.batch_size)

if not RESUME_TRAINING:
    shutil.rmtree(model_dir, ignore_errors=True)
tf.logging.set_verbosity(tf.logging.INFO)
time_start = datetime.utcnow()
print("Estimator training started at {}".format(time_start.strftime("%H:%M:%S")))
print(".......................................")
estimator.train(input_fn = train_input_fn)
time_end = datetime.utcnow()
print(".......................................")
print("Estimator training finished at {}".format(time_end.strftime("%H:%M:%S")))
print("")
time_elapsed = time_end - time_start
print("Estimator training elapsed time: {} seconds".format(time_elapsed.total_seconds()))
In [11]:
TEST_SIZE = 5000
test_input_fn = generate_pandas_input_fn(file_name=TEST_DATA_FILE,
                                         mode=tf.estimator.ModeKeys.EVAL,
                                         batch_size=TEST_SIZE)
results = estimator.evaluate(input_fn=test_input_fn)
print("")
print(results)
rmse = round(math.sqrt(results["average_loss"]),5)
print("")
print("RMSE: {}".format(rmse))
In [12]:
import itertools
predict_input_fn = generate_pandas_input_fn(file_name=TEST_DATA_FILE,
                                            mode=tf.estimator.ModeKeys.PREDICT,
                                            batch_size=5)
predictions = estimator.predict(input_fn=predict_input_fn)
values = [item["predictions"][0] for item in itertools.islice(predictions, 5)]
print()
print("Predicted Values: {}".format(values))
In [1]:
def process_features(features):
    # TensorFlow mirror of process_dataframe, applied inside the serving graph.
    features["x_2"] = tf.square(features['x'])
    features["y_2"] = tf.square(features['y'])
    features["xy"] = tf.multiply(features['x'], features['y'])
    features['dist_xy'] = tf.sqrt(tf.squared_difference(features['x'], features['y']))
    return features

def csv_serving_input_fn():
    SERVING_HEADER = ['x','y','alpha','beta']
    SERVING_HEADER_DEFAULTS = [[0.0], [0.0], ['NA'], ['NA']]

    rows_string_tensor = tf.placeholder(dtype=tf.string,
                                        shape=[None],
                                        name='csv_rows')
    receiver_tensor = {'csv_rows': rows_string_tensor}

    row_columns = tf.expand_dims(rows_string_tensor, -1)
    columns = tf.decode_csv(row_columns, record_defaults=SERVING_HEADER_DEFAULTS)
    features = dict(zip(SERVING_HEADER, columns))

    if PROCESS_FEATURES:
        features = process_features(features)

    return tf.estimator.export.ServingInputReceiver(
        features, receiver_tensor)
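Because csv_serving_input_fn re-applies process_features inside the serving graph, clients send only the raw x, y, alpha, beta values. A hedged alternative sketch that accepts the raw feature tensors directly instead of CSV rows (json_serving_input_fn is a hypothetical name, not used by the export below):
def json_serving_input_fn():
    receiver_tensor = {
        'x': tf.placeholder(dtype=tf.float32, shape=[None]),
        'y': tf.placeholder(dtype=tf.float32, shape=[None]),
        'alpha': tf.placeholder(dtype=tf.string, shape=[None]),
        'beta': tf.placeholder(dtype=tf.string, shape=[None]),
    }
    features = dict(receiver_tensor)
    if PROCESS_FEATURES:
        features = process_features(features)
    return tf.estimator.export.ServingInputReceiver(features, receiver_tensor)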
In [31]:
export_dir = model_dir + "/export"
estimator.export_savedmodel(
    export_dir_base=export_dir,
    serving_input_receiver_fn=csv_serving_input_fn,
    as_text=True
)
Out[31]:
In [35]:
import os
saved_model_dir = export_dir + "/" + os.listdir(path=export_dir)[-1]
print(saved_model_dir)
predictor_fn = tf.contrib.predictor.from_saved_model(
    export_dir=saved_model_dir,
    signature_def_key="predict"
)
output = predictor_fn({'csv_rows': ["0.5,1,ax01,bx02", "-0.5,-1,ax02,bx02"]})
print(output)
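The exported signatures can also be inspected outside Python with the saved_model_cli tool that ships with TensorFlow:
# Shell command, run from a terminal (not inside the notebook):
# saved_model_cli show --dir <saved_model_dir> --all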