Temporal-Comorbidity Adjusted Risk of Emergency Readmission (TCARER)

Wide & Deep Neural Network (WDNN) Model

1. Initialise <br> 2. Read Data & Store CSV <br> 3. Set TensorFlow Settings <br> 4. Model

This Jupyter iPython Notebook applies the Temporal-Comorbidity Adjusted Risk of Emergency Readmission (TCARER).

This Notebook extracts aggregated features from the MySQL database, & then pre-processes, configures & applies a Wide & Deep Neural Network (WDNN) model.

Note that some of the scripts are optional or subject to some pre-configuration. Please refer to the comments & the project documentation for further details.

It is licensed under the Apache License, Version 2.0. You may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

<hr>

1. Initialise



In [ ]:

    
# Reload modules 
# It is an optional step. It is useful to run when external Python modules are being modified
# It is reloading all modules (except those excluded by %aimport) every time before executing the Python code typed.
# Note: It may conflict with serialisation, when external modules are being modified

# %load_ext autoreload 
# %autoreload 2



In [ ]:

    
# Import Python libraries
import logging
import os
import sys
import gc
import pandas as pd
from IPython.display import display, HTML
from collections import OrderedDict



In [ ]:

    
# Import local Python modules
from Configs.CONSTANTS import CONSTANTS
from Configs.Logger import Logger
from Features.Variables import Variables
from ReadersWriters.ReadersWriters import ReadersWriters
from Stats.PreProcess import PreProcess
from Stats.FeatureSelection import FeatureSelection
from Stats.TrainingMethod import TrainingMethod
from Stats.Plots import Plots



In [ ]:

    
# Check the interpreter
# Print the interpreter version & the current working directory for a manual check.
# NOTE(review): the second message mentions sys.path, but the value printed is os.getcwd()
checks = (("\nMake sure the correct Python interpreter is used!", sys.version),
          ("\nMake sure sys.path of the Python interpreter is correct!", os.getcwd()))
for message, value in checks:
    print(message)
    print(value)

Import the Tensorflow libraries & check the version & local devices



In [ ]:

    
# Tensorflow
import tensorflow as tf 
import tempfile
from tensorflow.python.client import device_lib 

# Print the TensorFlow version & the local devices reported by device_lib
print(tf.__version__)
print(device_lib.list_local_devices())

1.1. Initialise General Settings



In [ ]:

    
# General settings of the run.
# Both paths are relative, so os.path.abspath resolves them against the current working directory.
config_path = os.path.abspath("ConfigInputs/CONFIGURATIONS.ini")
io_path = os.path.abspath("../../tmp/TCARER/Basic_prototype")
app_name = "T-CARER"
submodel_name = "hesIp"
submodel_input_name = "tcarer_model_features_ip"

# Echo the settings for a visual check
print("\n The full path of the configuration file: \n\t", config_path,
      "\n The full path of the output folder: \n\t", io_path,
      "\n The application name (the suffix of the outputs file name): \n\t", app_name,
      "\n The sub-model name, to locate the related feature configuration: \n\t", submodel_name,
      "\n The the sub-model's the file name of the input: \n\t", submodel_input_name)



In [ ]:

    
# Initialise the logs
# Make sure the output folder exists before the logger is constructed with that path
if not os.path.exists(io_path):
    os.makedirs(io_path, exist_ok=True)

# Logger is the project class imported from Configs.Logger; it is constructed with the output path,
# the application name & the "log" extension
logger = Logger(path=io_path, app_name=app_name, ext="log")
# Re-bind `logger` to the standard-library logger named after the application
logger = logging.getLogger(app_name)



In [ ]:

    
# Initialise constants
# Pass the output folder & the application name to the project's CONSTANTS;
# the cells below read CONSTANTS.io_path & CONSTANTS.config_features_path
CONSTANTS.set(io_path, app_name)



In [ ]:

    
# Initialise other classes
# Project helpers: readers_writers is used by the cells below to load & save CSV / serialised files
readers_writers = ReadersWriters()
preprocess = PreProcess(CONSTANTS.io_path)
feature_selection = FeatureSelection()
plots = Plots()



In [ ]:

    
# Set print settings
# Widen the pandas output: the display width & the maximum column width are both raised
pd.set_option('display.width', 1600, 'display.max_colwidth', 800)

1.2. Initialise Features Metadata

Read features metadata



In [ ]:

    
# variables settings
# Load the metadata of every feature, then keep only the rows that are flagged as selected
# & that belong to the current sub-model
features_metadata_all = readers_writers.load_csv(path=CONSTANTS.io_path, title=CONSTANTS.config_features_path, dataframing=True)
features_metadata = features_metadata_all.loc[(features_metadata_all["Selected"] == 1) &
                                              (features_metadata_all["Table_Reference_Name"] == submodel_name)]
# reset_index returns a new frame, so the result must be re-bound (the bare call had no effect);
# drop=True renumbers the kept rows from 0 without adding the old index as a column
features_metadata = features_metadata.reset_index(drop=True)

# print
display(features_metadata)

Set features' metadata dictionaries



In [ ]:

    
# Dictionary of features types, dtypes, & max-states
features_types = dict()
features_dtypes = dict()
features_states_values = dict()
features_names_group = dict()

for _, row in features_metadata.iterrows():
    # "Variable_Max_States" is either null or a comma-separated list of integers
    if pd.isnull(row["Variable_Max_States"]):
        states_values = None
    else:
        states_values = [int(state) for state in str(row["Variable_Max_States"]).split(',')]

    if pd.isnull(row["Variable_Aggregation"]):
        # A plain variable: a single entry, keyed by the variable name
        features_types[row["Variable_Name"]] = row["Variable_Type"]
        # NOTE(review): a raw dtype value is stored here, whereas the aggregated branch stores an
        # empty pd.Series of that dtype - confirm that the mix is intended
        features_dtypes[row["Variable_Name"]] = row["Variable_dType"]
        features_states_values[row["Variable_Name"]] = states_values
        features_names_group[row["Variable_Name"]] = row["Variable_Name"]
        # Names of the form <variable>_<state> are mapped back to the variable
        if states_values is not None:
            for postfix in states_values:
                features_names_group[row["Variable_Name"] + "_" + str(postfix)] = row["Variable_Name"]
    else:
        # An aggregated variable: one entry per aggregation postfix; the type & the dtype lists
        # are read position by position
        postfixes = row["Variable_Aggregation"].replace(' ', '').split(',')
        f_types = row["Variable_Type"].replace(' ', '').split(',')
        f_dtypes = row["Variable_dType"].replace(' ', '').split(',')
        for p, postfix in enumerate(postfixes):
            f_name = row["Variable_Name"] + "_" + postfix
            features_types[f_name] = f_types[p]
            features_dtypes[f_name] = pd.Series(dtype=f_dtypes[p])
            features_states_values[f_name] = states_values
            features_names_group[f_name] = f_name

# Collapse the collected entries into the per-column dtypes of a DataFrame
features_dtypes = pd.DataFrame(features_dtypes).dtypes



In [ ]:

    
# Dictionary of features groups
# Map every distinct feature type to the set of the feature names of that type
f_types = set(features_types.values())
features_types_group = OrderedDict((f_type, set()) for f_type in f_types)
for f_name, f_type in features_types.items():
    features_types_group[f_type].add(f_name)

print("Features types: " + ','.join(f_types))

Prerequisites:

Run this notebook after the feature selection (Section 7) of the TCARER_Basic Notebook!
Make sure the following files are present in the input folder:
- Step_05_Features.bz2
- Step_07_Top_Features_...

1.3. Load the Top Features

Configure: the selected features

Load the top features



In [ ]:

    
# Load the ranked feature names; the first element of the loaded file is used,
# & the newline characters are stripped from every name
file_name = "Step_07_Top_Features_rfc_adhoc"

features_names_selected = [
    f.replace("\n", "")
    for f in readers_writers.load_csv(path=CONSTANTS.io_path, title=file_name, dataframing=False)[0]]
display(pd.DataFrame(features_names_selected))

Exclude the encoded categorical & include the raw categorical features



In [ ]:

    
# A selected name that starts with the name of a raw categorical feature is treated as one of
# its encoded columns: such names are dropped, & the raw categorical names are put in front
categorical_prefixes = tuple(features_types_group["CATEGORICAL"])
excludes = {f for f in features_names_selected if f.startswith(categorical_prefixes)}
features_names_selected_raw = list(features_types_group["CATEGORICAL"]) + \
    [f for f in features_names_selected if f not in excludes]

print("Exclude encoded categorical: ", excludes)
print("Include raw categorical: ", features_types_group["CATEGORICAL"])

Select the top N features



In [ ]:

    
# Keep at most the first `top_n_features` names of the list
# (the raw categorical names were placed at the front of it)
top_n_features = 300

features_names_selected_raw = features_names_selected_raw[0:top_n_features]

1.4. Initialise Model Setting

Configure: the model files



In [ ]:

    
# select the target variable
target_feature = "label30" # "label30", "label365"
rank_models = ["rfc"] # ["rfc", "gbrt", "randLogit"]

# The CSV column order: the target first, then the selected features
features_headers = [target_feature] + features_names_selected_raw
train_file_names = ["tensorflow_feature_train"]
test_file_names = ["tensorflow_feature_test"]

# Full paths of the CSV files inside the output folder
train_file_names_full = [os.path.join(CONSTANTS.io_path, name + ".csv") for name in train_file_names]
test_file_names_full = [os.path.join(CONSTANTS.io_path, name + ".csv") for name in test_file_names]

2. Read Data & Store CSV

Read



In [ ]:

    
# Load the serialised features; the loaded object is indexed below by the keys
# "train_indep" & "test_indep" (& by the "*_id" / "*_target" keys in the following cells)
file_name = "Step_05_Features"
features = readers_writers.load_serialised_compressed(path=CONSTANTS.io_path, title=file_name)
  
# Report the size of the .bz2 file on disk, & the size of the samples
print("File size: ", os.stat(os.path.join(CONSTANTS.io_path, file_name + ".bz2")).st_size)
print("Number of columns: ", len(features["train_indep"].columns)) 
print("features: {train: ", len(features["train_indep"]), ", test: ", len(features["test_indep"]), "}")

Visual verification



In [ ]:

    
# Show the first rows of the id, the target & the independent variables side by side,
# for the train sample & then for the test sample
display(pd.concat([features["train_id"].head(), features["train_target"].head(), features["train_indep"].head()], axis=1))
display(pd.concat([features["test_id"].head(), features["test_target"].head(), features["test_indep"].head()], axis=1))

Select features and save to CSV



In [ ]:

    
# save train sample
# The target column comes first, followed by the selected features (the same order as features_headers)
readers_writers.save_csv(
    data=pd.concat([features["train_target"].loc[:, [target_feature]], 
                    features["train_indep"].loc[:, features_names_selected_raw]], axis=1), 
    path=CONSTANTS.io_path, title=train_file_names[0], append=False)
print("File size: ", os.stat(os.path.join(CONSTANTS.io_path, train_file_names[0] + ".csv")).st_size)

# save test sample
# Same layout as the train file
readers_writers.save_csv(
    data=pd.concat([features["test_target"].loc[:, [target_feature]], 
                    features["test_indep"].loc[:, features_names_selected_raw]], axis=1), 
    path=CONSTANTS.io_path, title=test_file_names[0], append=False)
print("File size: ", os.stat(os.path.join(CONSTANTS.io_path, test_file_names[0] + ".csv")).st_size)

Clean-up



In [ ]:

    
# Drop the reference to the loaded features & run a garbage collection
features = None
gc.collect()

3. Set TensorFlow Settings

Configure: the Deep Neural Network nodes

3.1. Prepare Features

Update features by type



In [ ]:

    
# update features
# Remove, from the categorical & the continuous groups, every name that is not a selected feature.
# The removal loop runs over a copy of the group, because the group itself is modified.
selected_names = set(features_names_selected_raw)
for group_key in ("CATEGORICAL", "CONTINUOUS"):
    names = list(features_types_group[group_key])
    for name in names:
        if name not in selected_names:
            features_types_group[group_key].remove(name)

print("Categorical Features: ", features_types_group["CATEGORICAL"]) 
print("Continuous Features: ", features_types_group["CONTINUOUS"])

Additional variables to convert to discrete



In [ ]:

    
# Iterate over a copy, because the CONTINUOUS group is modified inside the loop
names = [i for i in features_types_group["CONTINUOUS"]]
features_types_group["CATEGORICAL_EXTRA"] = list()

# convert gapDays_..., & epidur_... variables
# states = [0, 3, 7, 14, 30, 60]
for name in names:
    # startswith replaces the hand-counted slices: "gapDays_" has 8 characters,
    # so the comparison against name[0:7] could never be true
    if name.startswith(("gapDays_", "epidur_")):
        # move the feature from the continuous group to the categorical group,
        # & record it as an extra categorical feature with the fixed states
        features_types_group["CONTINUOUS"].remove(name)
        features_types_group["CATEGORICAL"].add(name)
        features_types_group["CATEGORICAL_EXTRA"].append(name)
        features_states_values[name] = [0, 3, 7, 14, 30, 60]

3.2. Define Base Features



In [ ]:

    
# Map every feature name to its TensorFlow feature column
feature_columns = dict()

# Categorical base columns.
# The number of hash buckets equals the number of the states of the feature,
# so features_states_values[name] must not be None for a categorical feature
for name in features_types_group["CATEGORICAL"]:
    feature_columns[name] = tf.contrib.layers.sparse_column_with_hash_bucket(
        name, hash_bucket_size=len(features_states_values[name]), combiner="sqrtn")

# Continuous base columns.
# A continuous feature with states is bucketized, using its states as the boundaries;
# otherwise it stays a plain real-valued column
for name in features_types_group["CONTINUOUS"]:
    if features_states_values[name] is not None:
        feature_columns[name] = tf.contrib.layers.real_valued_column(name)
        feature_columns[name] = tf.contrib.layers.bucketized_column(
            feature_columns[name], [int(i) for i in features_states_values[name]])
    else:
        feature_columns[name] = tf.contrib.layers.real_valued_column(name)

3.3. The Wide Model



In [ ]:

    
# The wide part starts with every categorical column
wide_columns = [feature_columns[name] for name in features_types_group["CATEGORICAL"]]

# ... plus the continuous columns that have states (i.e. the bucketized ones)
wide_columns = wide_columns + \
    [feature_columns[name] for name in features_types_group["CONTINUOUS"] if features_states_values[name] is not None]
    
# ... plus three hand-picked crossed columns; 'ethnos', 'gender', 'imd04rk' & 'ageTrigger'
# must be present in feature_columns, otherwise a KeyError is raised
wide_columns = wide_columns + [
    tf.contrib.layers.crossed_column([feature_columns['ethnos'], feature_columns['gender']], 
                                     combiner="sqrtn", hash_bucket_size=int(2)),
    tf.contrib.layers.crossed_column([feature_columns['imd04rk'], feature_columns['ethnos']], 
                                     combiner="sqrtn", hash_bucket_size=int(4)),
    tf.contrib.layers.crossed_column([feature_columns['imd04rk'], feature_columns['ageTrigger']], 
                                     combiner="sqrtn", hash_bucket_size=int(10))]

# for name in features_types_group["CATEGORICAL_EXTRA"]:
#    wide_columns = wide_columns + [
#        tf.contrib.layers.crossed_column([feature_columns['ageTrigger'], feature_columns[name]], 
#                                     combiner="sqrtn", hash_bucket_size=int(6e3))]

print(wide_columns)

3.4. The Deep Model



In [ ]:

    
# Print the categorical names, then the continuous names with states, then those without states
print(features_types_group["CATEGORICAL"])
print([name for name in features_types_group["CONTINUOUS"] if features_states_values[name] is not None])
print([name for name in features_types_group["CONTINUOUS"] if features_states_values[name] is None])



In [ ]:

    
# The deep part starts with every continuous column (bucketized or not)
deep_columns = [feature_columns[name] for name in features_types_group["CONTINUOUS"]]
    
# ... plus the embeddings of four named columns, which must be present in feature_columns
deep_columns = deep_columns + \
    [tf.contrib.layers.embedding_column(feature_columns["gender"], dimension=2),
     tf.contrib.layers.embedding_column(feature_columns["ethnos"], dimension=3),
    tf.contrib.layers.embedding_column(feature_columns["imd04rk"], dimension=5),
    tf.contrib.layers.embedding_column(feature_columns["ageTrigger"], dimension=5)]
    
# ... plus a 3-dimensional embedding for every extra categorical feature
for name in features_types_group["CATEGORICAL_EXTRA"]:
    deep_columns = deep_columns + \
        [tf.contrib.layers.embedding_column(feature_columns[name], dimension=3)]
        
print(deep_columns)

Set the lists of continuous and discrete features



In [ ]:

    
# Freeze the two groups into lists; input_fn reads these module-level names
continuous_features = list(features_types_group["CONTINUOUS"])
discrete_features = list(features_types_group["CATEGORICAL"])

4. Model

Restore the model if it was interrupted



In [ ]:

    
# model_dir = "/tmp/tmpn5lud12q"
# train_steps = 3518



In [ ]:

    
# Restore variables from disk.
# saver = tf.train.Saver()
# sess = tf.Session()
# saver.restore(sess, model_dir)

4.1. Initialise

Configure the size and batches of the Deep Neural Network



In [ ]:

    
# Settings passed to input_fn & to fit for the training
# NOTE(review): num_epochs=None presumably means that the reader has no epoch limit - confirm
train_batch_size = 2000
train_steps = 500 # 40000
train_num_epochs = None
train_randomize_input = True

# Settings used by the evaluate & the predict_proba calls (for the train & the test samples)
test_batch_size = 2000
test_steps = 500 # 300
test_num_epochs = None
test_randomize_input = False

# Settings used by the validation monitor
monitor_batch_size = 2000
monitor_steps = 200
monitor_num_epochs = None
monitor_randomize_input = False

# The sizes of the hidden layers of the deep part
dnn_hidden_units = [24000, 12000, 6000] # [20000, 16000, 10000, 8000, 7000, 6000, 5000, 4000] # [28000, 14000, 7000]  # [24000, 12000, 6000]

Initialise the performance statistics output



In [ ]:

    
# Collects the outputs of the fit, train & test steps; it is serialised at the end of the notebook
summaries = dict()

Set the output directory of the Tensorflow model



In [ ]:

    
# A fresh temporary directory for the model files; mkdtemp does not delete it automatically
model_dir = tempfile.mkdtemp()

Combining Wide and Deep Models into One



In [ ]:

    
# Session configuration; it is used by the tf.Session opened inside input_fn.
# Note that the classifier below is built with config=None, not with this object.
config = tf.ConfigProto(allow_soft_placement=True)

# The combined classifier: the wide columns feed the linear part, the deep columns feed the DNN part.
# Both optimisers are left as None, i.e. the choice is left to the library.
model_dnn = tf.contrib.learn.DNNLinearCombinedClassifier(
                model_dir=model_dir,
                linear_feature_columns=wide_columns,
                config=None, # tf.contrib.learn.RunConfig(save_checkpoints_secs=600)),
                dnn_feature_columns=deep_columns,
                dnn_hidden_units=dnn_hidden_units,
                dnn_optimizer=None, # tf.train.AdagradOptimizer(...)
                linear_optimizer=None, # tf.train.FtrlOptimizer(...)
                dnn_activation_fn=tf.nn.relu,
                enable_centered_bias=False
                #, gradient_clip_norm=1 # helper functions that let you apply L2 norms (tf.clip_by_global_norm)
                )

print(model_dir)

Set the validation monitor



In [ ]:

    
# A monitor that reads the test files every `monitor_steps` steps.
# input_fn is defined in a later cell; the lambda looks the name up only when it is called.
# Note: the monitor is not passed to the fit call below (the `monitors` argument is commented out there).
validation_mointor = tf.contrib.learn.monitors.ValidationMonitor(
    input_fn=lambda: input_fn(test_file_names_full, monitor_batch_size, 
                              monitor_num_epochs, monitor_randomize_input),
    every_n_steps=monitor_steps)

Define a function for reading a sample batch by batch



In [ ]:

    
def read_csv_batches(file_names, batch_size, features_headers, num_epochs, randomize_input):
    """Read CSV files batch by batch & expose every column under its header name.

    :param file_names: the CSV files, passed to `tf.contrib.learn.read_batch_examples`.
    :param batch_size: the batch size, passed to the reader.
    :param features_headers: the column names, in the column order of the CSV files;
        their count defines how many fields are decoded per record.
    :param num_epochs: passed to the reader.
    :param randomize_input: passed to the reader.
    :return: a dictionary that maps each header to the slice `[:, i]` of the batch,
        where `i` is the position of the header in `features_headers`.
    """
    n_columns = len(features_headers)

    def parse_fn(record):
        # One string default per expected column, so every field is decoded as a string
        return tf.decode_csv(record, [tf.constant([''], dtype=tf.string)] * n_columns)

    batch = tf.contrib.learn.read_batch_examples(
        file_names,
        batch_size=batch_size,
        reader=tf.TextLineReader,
        parse_fn=parse_fn,
        num_epochs=num_epochs,
        randomize_input=randomize_input)

    # The position of a header selects its column, so the order of the headers matters
    return {header: batch[:, position] for position, header in enumerate(features_headers)}

Represent the input data as the fundamental unit of TensorFlow computations



In [ ]:

    
def input_fn(file_names, batch_size, num_epochs, randomize_input):
    """Build the feature tensors & the label tensor from batches of the CSV files.

    It reads the module-level `features_headers`, `continuous_features`, `discrete_features`,
    `target_feature` & `config`.

    :param file_names: the CSV files to read.
    :param batch_size: the batch size; the sparse tensors are sized by the static first dimension
        of the columns.
    :param num_epochs: passed to `read_csv_batches`.
    :param randomize_input: passed to `read_csv_batches`.
    :return: a tuple of (dictionary of the feature name to its tensor, label tensor).
    """
    columns = read_csv_batches(file_names, batch_size, features_headers, num_epochs, randomize_input)

    def as_sparse(column):
        # One value per row, stored at the position [row, 0] of an (n_rows, 1) sparse tensor
        n_rows = int(column.get_shape()[0])
        return tf.SparseTensor(
            indices=[[row, 0] for row in range(n_rows)],
            values=column,
            dense_shape=[n_rows, 1])

    # The tensors are created while a session with the module-level `config` is open;
    # `sess` itself is not referenced inside the block
    with tf.Session(config=config) as sess:
        feature_cols = {}

        # Continuous features: the string fields are converted to float32
        for key in continuous_features:
            feature_cols[key] = tf.string_to_number(columns[key], out_type=tf.float32)

        # Categorical features: the string fields are kept, wrapped in a sparse tensor;
        # for a name that is in both lists, the categorical version replaces the continuous one
        for key in discrete_features:
            feature_cols[key] = as_sparse(columns[key])

        # The label: the target column converted to int32
        label = tf.string_to_number(columns[target_feature], out_type=tf.int32)

    return feature_cols, label

4.2. Fit

Train the Deep Neural Network



In [ ]:

    
# Echo the shuffling flag of the training input (a bare expression, shown as the cell output)
train_randomize_input



In [ ]:

    
# Train on the train files for `train_steps` steps; the validation monitor is not attached
model_dnn.fit(input_fn=lambda: input_fn(train_file_names_full, train_batch_size, 
                                        train_num_epochs, train_randomize_input), 
              steps=train_steps) # , monitors=[validation_mointor]

Save the output summaries



In [ ]:

    
# Store the description of the fitted model.
# NOTE(review): the first four entries stringify the bound methods themselves (the methods are not called),
# so they hold the text representation of the method objects - confirm that this is intended
summaries["fit"] = dict()    
summaries["fit"]["get_variable_names"] = str(model_dnn.get_variable_names)
summaries["fit"]["get_variable_value"] = str(model_dnn.get_variable_value)
summaries["fit"]["get_params"] = str(model_dnn.get_params)
summaries["fit"]["export"] = str(model_dnn.export)
# Here the method is called, & its result is stored
summaries["fit"]["get_variable_names()"] = model_dnn.get_variable_names()
summaries["fit"]["params"] = str(model_dnn.params)

4.3. Predict - Train Sample

Test the Deep Neural Network, using the train sample



In [ ]:

    
# Evaluate on the train files, using the test batch settings, & print the metrics sorted by name
results = model_dnn.evaluate(input_fn=lambda: input_fn(train_file_names_full, test_batch_size, 
                                                       test_num_epochs, test_randomize_input), 
                             steps=test_steps)
for key in sorted(results):
    print("%s: %s" % (key, results[key]))

Save the output summaries



In [ ]:

    
# Store the evaluation metrics & the predicted probabilities of the train sample.
# The predict_proba result is later passed to generator_to_list, i.e. it is consumed as an iterable.
summaries["train"] = dict()
summaries["train"]["results"] = results 
summaries["train"]["predict_proba"] = model_dnn.predict_proba(
    input_fn=lambda: input_fn(train_file_names_full, test_batch_size, 
                              test_num_epochs, test_randomize_input))

4.4. Predict - Test Sample

Test the Deep Neural Network, using the test sample



In [ ]:

    
# Evaluate on the test files & print the metrics sorted by name
results = model_dnn.evaluate(input_fn=lambda: input_fn(test_file_names_full, test_batch_size, 
                                                       test_num_epochs, test_randomize_input), 
                             steps=test_steps)
for key in sorted(results):
    print("%s: %s" % (key, results[key]))

Save the output summaries



In [ ]:

    
# Store the evaluation metrics & the predicted probabilities of the test sample.
# The predict_proba result is later passed to generator_to_list, i.e. it is consumed as an iterable.
summaries["test"] = dict()
summaries["test"]["results"] = results 
summaries["test"]["predict_proba"] = model_dnn.predict_proba(
    input_fn=lambda: input_fn(test_file_names_full, test_batch_size, 
                              test_num_epochs, test_randomize_input))

4.5. Save

Save the output summaries, including the predicted probabilities

4.5.1. Stats



In [ ]:

    
def generator_to_list(generator, max_size):
    """Collect at most `max_size` items of an iterable into a list of a fixed length.

    :param generator: any iterable; it may be endless, because no more than `max_size`
        items are taken from it.
    :param max_size: the length of the returned list; zero or a negative value gives an empty list.
    :return: a list of exactly `max_size` entries (an empty list when `max_size` <= 0);
        the positions for which the iterable supplied no item hold None.
    """
    from itertools import islice

    # A non-positive size previously raised IndexError on the first item; nothing is consumed now
    if max_size <= 0:
        return []

    temp = list(islice(generator, max_size))
    # Keep the fixed-length contract: pad a short result with None
    temp.extend([None] * (max_size - len(temp)))
    return temp



In [ ]:

    
# Replace the stored predict_proba results with lists, capped at test_batch_size * test_steps entries
summaries["train"]["predict_proba"] = \
    generator_to_list(summaries["train"]["predict_proba"], test_batch_size  * test_steps)
summaries["test"]["predict_proba"] = \
    generator_to_list(summaries["test"]["predict_proba"], test_batch_size  * test_steps)



In [ ]:

    
# Serialise the summaries into the output folder; the file title ends with the target name
file_name = "model_tensorflow_summaries_" + target_feature
readers_writers.save_serialised_compressed(path=CONSTANTS.io_path, title=file_name, objects=summaries)

Test the saved file for corruption



In [ ]:

    
# Remind the user where the model files are (the temporary directory created with tempfile.mkdtemp)
print("The model temp. directory to back up:")
print(model_dir)

Fin!