Temporal-Comorbidity Adjusted Risk of Emergency Readmission (TCARER)

Summary Reports

1. Initialise


In [1]:
# Reload all project modules automatically before each cell executes
# (except those excluded by %aimport), so local code edits are picked
# up without restarting the kernel.
%load_ext autoreload 
%autoreload 2

In [2]:
# import libraries
import logging
import os
import sys
import gc
import pandas as pd
import numpy as np
import random
import statistics
from datetime import datetime
from collections import OrderedDict
from sklearn import preprocessing
from scipy.stats import stats
from IPython.display import display, HTML
from pprint import pprint
from pivottablejs import pivot_ui
from IPython.display import clear_output
import imblearn.over_sampling as oversampling
import matplotlib.pyplot as plt

In [4]:
# import local classes
from Configs.CONSTANTS import CONSTANTS
from Configs.Logger import Logger
from Features.Variables import Variables
from ReadersWriters.ReadersWriters import ReadersWriters
from Stats.PreProcess import PreProcess
from Stats.FeatureSelection import FeatureSelection
from Stats.TrainingMethod import TrainingMethod
from Stats.Plots import Plots
from Stats.Stats import Stats

In [5]:
# sanity-check the active interpreter and the working directory
print("\nMake sure the correct Python interpreter is used!", sys.version, sep="\n")
print("\nMake sure sys.path of the Python interpreter is correct!", os.getcwd(), sep="\n")


Make sure the correct Python interpreter is used!
3.5.3 (v3.5.3:1880cb95a742, Jan 16 2017, 16:02:32) [MSC v.1900 64 bit (AMD64)]

Make sure sys.path of the Python interpreter is correct!
C:\Users\eagle\Documents\GitHub\Analytics_UoW\TCARER

1.1. Initialise General Settings


In [6]:
# input configuration file, output directory, database schema, and application name
config_path = os.path.abspath(os.path.join("ConfigInputs", "CONFIGURATIONS.ini"))
io_path = os.path.abspath(os.path.join("..", "..", "tmp", "TCARER", "Basic_prototype"))
schema = "parr_sample_prototype"
app_name = "T-CARER"

print("Output path:", io_path)


Output path: C:\Users\eagle\Documents\GitHub\tmp\TCARER\Basic_prototype

In [7]:
# initialise logging: ensure the output directory exists, create the log file,
# then bind the stdlib logger configured under the application name
os.makedirs(io_path, exist_ok=True)  # exist_ok=True already tolerates an existing dir; the original pre-check was redundant

logger = Logger(path=io_path, app_name=app_name, ext="log")  # project Logger creates the .log file (see logged output)
logger = logging.getLogger(app_name)  # rebind to the configured stdlib logger used by later cells


2017-10-29 13:03:26,935 - T-CARER - INFO - Creating 'C:\Users\eagle\Documents\GitHub\tmp\TCARER\Basic_prototype\T-CARER.log' File.

In [8]:
# initialise application-wide constants (paths, configuration) for this run
CONSTANTS.set(io_path, app_name)

In [9]:
# initialise shared helper objects: file/database I/O and plotting utilities
readers_writers = ReadersWriters()
plots = Plots()

In [10]:
# sub-model identifiers: the HES inpatient feature set and its input table name
submodel_name = "hesIp"
submodel_input_name = "tcarer_model_features_ip"

In [11]:
# widen pandas console output so the wide feature frames render without truncation
pd.set_option('display.width', 1600)
pd.set_option('display.max_colwidth', 800)

Common variables:

  • Readmission
    • 'label30', 'label365'
  • Admissions Methods:
    • 'admimeth_0t30d_prevalence_1_cnt', ...
  • Prior Spells:
    • 'prior_spells'
  • Male:
    • 'gender_1'
  • LoS:
    • 'trigger_los'
  • Age:
    • 'trigger_age'
  • Charlson Score:
    • 'trigger_charlsonFoster'
  • predictions score
    • score
  • Most prevalent diagnoses groups (0-30-day, 0-730-day):
    • 0-30-day: 'diagCCS_0t30d_prevalence_1_cnt', ...
    • 0-730-day: 'diagCCS_0t30d_prevalence_1_cnt' + 'diagCCS_30t90d_prevalence_1_cnt' + 'diagCCS_90t180d_prevalence_1_cnt' + 'diagCCS_180t365d_prevalence_1_cnt' + 'diagCCS_365t730d_prevalence_1_cnt', ...
  • Comorbidity diagnoses groups (0-730-day):
    • 'prior_admiOther', 'prior_admiAcute', 'prior_spells', 'prior_asthma', 'prior_copd', 'prior_depression', 'prior_diabetes', 'prior_hypertension', 'prior_cancer', 'prior_chd', 'prior_chf'
  • Charlson diagnoses groups (trigger):
    • 'diagCci_01_myocardial_freq__trigger',...



2. Load the Saved Model Outputs

Note: Make sure the following files are located at the input path

  • Step_07_Features.bz2
  • Step_07_Top_Features_...
  • Step_07_Model_Train_model_rank_summaries_...
  • Step_09_Model_...

Note: Create features extra (Run only once)


In [ ]:
# MySQL table names: the main features table and the benchmark "extra" features table
feature_table = 'tcarer_features'
featureExtra_table = 'tcarer_featuresExtra'

In [ ]:
result = readers_writers.load_mysql_procedure("tcarer_set_featuresExtra", [feature_table, featureExtra_table], schema)



2.1. Initialise


In [16]:
# select the target label, the fitted model to load, and the ranking models to report
target_feature = "label365"  # readmission label: "label365" or "label30"
method_name = "rfc"  # fitted model: "rfc", "gbrt", "randLogit", "wdnn"
rank_models = ["rfc"]  # subset of ["rfc", "gbrt", "randLogit"]



2.2. Load Features

Load pre-processed features


In [17]:
# load the pre-processed features dictionary (train/test ids, targets, independents)
file_name = "Step_07_Features"
features = readers_writers.load_serialised_compressed(path=CONSTANTS.io_path, title=file_name)

# report the on-disk size and the loaded dimensions
print("File size: ", os.stat(os.path.join(CONSTANTS.io_path, file_name + ".bz2")).st_size)
print("Number of columns: ", len(features["train_indep"].columns)) 
print("features: {train: ", len(features["train_indep"]), ", test: ", len(features["test_indep"]), "}")


File size:  97692
Number of columns:  458
features: {train:  2500 , test:  2499 }

2.3. Load Features Names


In [18]:
# load the ad-hoc selected top-feature names and strip embedded newlines
file_name = "Step_07_Top_Features_rfc_adhoc"

raw_rows = readers_writers.load_csv(path=CONSTANTS.io_path, title=file_name, dataframing=False)[0]
features_names_selected = [row.replace("\n", "") for row in raw_rows]
display(pd.DataFrame(features_names_selected))


0
0 epidur_0t30d_avg
1 epidur_365t730d_avg
2 preopdur_0t30d_avg
3 gapDays_0t30d_others_cnt
4 epidur_365t730d_others_cnt
5 preopdur_30t90d_others_cnt
6 epidur_0t30d_others_cnt
7 preopdur_0t30d_others_cnt
8 epidur_30t90d_others_cnt
9 preopdur_90t180d_others_cnt
10 epidur_180t365d_avg
11 preopdur_30t90d_avg
12 preopdur_90t180d_avg
13 gapDays_365t730d_avg
14 operOPCSL1_0t30d_prevalence_1_cnt
15 operOPCSL1_0t30d_others_cnt
16 posopdur_30t90d_others_cnt
17 epidur_180t365d_others_cnt
18 posopdur_0t30d_avg
19 preopdur_180t365d_others_cnt
20 operOPCSL1_0t30d_prevalence_3_cnt
21 operOPCSL1_0t30d_prevalence_4_cnt
22 posopdur_365t730d_avg
23 operOPCSL1_0t30d_prevalence_2_cnt
24 epidur_30t90d_avg
25 posopdur_30t90d_avg
26 operOPCSL1_0t30d_prevalence_5_cnt
27 preopdur_180t365d_avg
28 preopdur_365t730d_others_cnt
29 preopdur_365t730d_avg
... ...
370 operOPCSL1_180t365d_prevalence_30_cnt
371 diagCCS_0t30d_prevalence_25_cnt
372 diagCCS_30t90d_prevalence_1_cnt
373 diagCCS_0t30d_prevalence_23_cnt
374 diagCCS_90t180d_prevalence_5_cnt
375 diagCCS_90t180d_prevalence_4_cnt
376 diagCCS_0t30d_prevalence_19_cnt
377 operOPCSL1_365t730d_prevalence_4_cnt
378 diagCCS_90t180d_prevalence_2_cnt
379 diagCCS_90t180d_prevalence_3_cnt
380 diagCCS_0t30d_prevalence_21_cnt
381 diagCCS_30t90d_prevalence_3_cnt
382 diagCCS_30t90d_prevalence_4_cnt
383 diagCCS_30t90d_prevalence_2_cnt
384 operOPCSL1_365t730d_prevalence_5_cnt
385 diagCCS_30t90d_prevalence_5_cnt
386 diagCCS_0t30d_prevalence_20_cnt
387 diagCCS_30t90d_prevalence_6_cnt
388 diagCCS_30t90d_prevalence_7_cnt
389 diagCCS_30t90d_prevalence_8_cnt
390 operOPCSL1_365t730d_prevalence_6_cnt
391 diagCCS_90t180d_prevalence_1_cnt
392 diagCCS_90t180d_others_cnt
393 operOPCSL1_365t730d_prevalence_7_cnt
394 diagCCS_0t30d_prevalence_17_cnt
395 diagCCS_0t30d_prevalence_18_cnt
396 diagCCS_0t30d_prevalence_16_cnt
397 operOPCSL1_365t730d_prevalence_8_cnt
398 diagCCS_30t90d_prevalence_30_cnt
399 diagCCS_30t90d_prevalence_29_cnt

400 rows × 1 columns

2.4. Load the fitted model

2.4.1. Basic Models

Initialise


In [19]:
# initialise the training-method wrapper for the selected algorithm
training_method = TrainingMethod(method_name)

# file name of the serialised fitted model
file_name = "Step_09_Model_" + method_name + "_" + target_feature


2017-10-29 13:04:19,778 - T-CARER - INFO - Running Random Forest Classifier

Load the model


In [20]:
training_method.load(path=CONSTANTS.io_path, title=file_name)


2017-10-29 13:04:19,866 - T-CARER - INFO - Running Random Forest Classifier

2.4.2. TensorFlow Models


In [21]:
class TrainingMethodTensorflow:
    """Adapter exposing pre-computed TensorFlow run summaries through the
    ``TrainingMethod``-style API, so ``predict_summaries`` can be called the
    same way as for the scikit-learn based models."""

    def __init__(self, summaries, features_names, num_features, cut_off, train_size, test_size):
        """
        :param summaries: dict with "train"/"test" -> {"predict_proba": [[p0, p1], ...]}
            (and, per the serialised layout, optionally "fit" -> {"dnn_weights_": ...}).
        :param features_names: selected feature names (kept for interface parity; unused here).
        :param num_features: number of features (kept for interface parity; unused here).
        :param cut_off: positive-class probability threshold for the hard predictions.
        :param train_size: number of training rows to keep.
        :param test_size: number of test rows to keep.
        """
        self.model_predict = {"train": {'score': [], 'model_labels': []},
                              "test": {'score': [], 'model_labels': []}}
        self.__stats = Stats()

        # Bug fix: train_summaries() referenced self.__weights without it ever being
        # assigned, which raised AttributeError. Capture the fitted-network weights here.
        # NOTE(review): key names taken from the serialised summaries layout
        # (summaries["fit"]["dnn_weights_"]) -- confirm against the producer.
        fit_summary = summaries.get("fit", {}) if hasattr(summaries, "get") else {}
        self.__weights = fit_summary.get("dnn_weights_") if hasattr(fit_summary, "get") else None

        for sample_name, size in (("train", train_size), ("test", test_size)):
            probabilities = [p for p in summaries[sample_name]["predict_proba"]][0:size]
            scores = np.asarray([p[1] for p in probabilities])
            # Bug fix: the original hard-coded a 0.5 threshold and ignored `cut_off`.
            self.model_predict[sample_name]['pred'] = np.asarray([1 if s >= cut_off else 0 for s in scores])
            self.model_predict[sample_name]['score'] = scores
            self.model_predict[sample_name]['score_0'] = np.asarray([p[0] for p in probabilities])

    def train_summaries(self):
        """Return fitting summaries; ``feature_importances_`` holds the network
        weights captured in ``__init__`` (may be None if absent from the summaries)."""
        return {"feature_importances_": self.__weights}

    def predict_summaries(self, feature_target, sample_name):
        """Compute prediction performance summaries for the given sample
        ("train" or "test") against the supplied target series."""
        return self.__stats.predict_summaries(self.model_predict[sample_name], feature_target)

In [22]:
# load the serialised TensorFlow run summaries for the selected target
# NOTE(review): the loader exits (SystemExit) if the .bz2 file is absent -- see the logged error below
file_name = "model_tensorflow_summaries_" + target_feature
summaries = readers_writers.load_serialised_compressed(path=CONSTANTS.io_path, title=file_name)


2017-10-29 13:04:20,044 - T-CARER - ERROR - ReadersWriters._PickleSerialised - Can not open the file: 
C:\Users\eagle\Documents\GitHub\tmp\TCARER\Basic_prototype\model_tensorflow_summaries_label365.bz2

[Errno 2] No such file or directory: 'C:\\Users\\eagle\\Documents\\GitHub\\tmp\\TCARER\\Basic_prototype\\model_tensorflow_summaries_label365.bz2'
An exception has occurred, use %tb to see the full traceback.

SystemExit
c:\users\eagle\appdata\local\programs\python\python35\lib\site-packages\IPython\core\interactiveshell.py:2889: UserWarning: To exit: use 'exit', 'quit', or Ctrl-D.
  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)

In [ ]:
num_features = 300  # number of top features the TensorFlow model was trained with
cut_off = 0.5  # positive-class probability threshold

# wrap the pre-computed TensorFlow outputs in the TrainingMethod-style adapter
training_method = TrainingMethodTensorflow(summaries, features_names_selected, num_features, cut_off,
                                           len(features["train_indep"].index), len(features["test_indep"].index))



Performance


In [23]:
# training-sample performance indicators
o_summaries = training_method.predict_summaries(features["train_target"][target_feature], "train")
for measure, value in o_summaries.items():
    print(measure, value)

print("\n")
# test-sample performance indicators
o_summaries = training_method.predict_summaries(features["test_target"][target_feature], "test")
for measure, value in o_summaries.items():
    print(measure, value)


accuracy_score 0.6592
average_precision_score 0.47970482114
brier_score_loss 0.218148003816
classification_report              precision    recall  f1-score   support

          0       0.88      0.63      0.74      1885
          1       0.40      0.74      0.52       615

avg / total       0.76      0.66      0.68      2500

confusion_matrix [[1194  691]
 [ 161  454]]
f1_score 0.515909090909
fbeta_score 0.436958614052
hamming_loss 0.3408
jaccard_similarity_score 0.6592
log_loss 11.7710360041
matthews_corrcoef 0.32124418171
precision_recall_fscore_support (array([ 0.88118081,  0.39650655]), array([ 0.63342175,  0.73821138]), array([ 0.73703704,  0.51590909]), array([1885,  615], dtype=int64))
precision_score 0.396506550218
recall_score 0.738211382114
roc_auc_score 0.742965646633
zero_one_loss 0.3408


accuracy_score 0.654661864746
average_precision_score 0.423030738094
brier_score_loss 0.22022287805
classification_report              precision    recall  f1-score   support

          0       0.90      0.63      0.74      1959
          1       0.36      0.75      0.48       540

avg / total       0.78      0.65      0.69      2499

confusion_matrix [[1230  729]
 [ 134  406]]
f1_score 0.484776119403
fbeta_score 0.399606299213
hamming_loss 0.345338135254
jaccard_similarity_score 0.654661864746
log_loss 11.9277898901
matthews_corrcoef 0.313889024973
precision_recall_fscore_support (array([ 0.90175953,  0.35770925]), array([ 0.62787136,  0.75185185]), array([ 0.74029491,  0.48477612]), array([1959,  540], dtype=int64))
precision_score 0.357709251101
recall_score 0.751851851852
roc_auc_score 0.73645898323
zero_one_loss 0.345338135254



2.5. Load the Extra Features for Benchmarking

Read the extra features


In [24]:
table = 'tcarer_featuresExtra'
features_extra_dtypes = {'patientID': 'U32', 'trigger_charlsonFoster': 'i4', 'trigger_los': 'i4', 'trigger_age': 'i4', 'prior_admiOther': 'i4', 'prior_admiAcute': 'i4', 
                         'prior_spells': 'i4', 'prior_asthma': 'i4', 'prior_copd': 'i4', 'prior_depression': 'i4', 'prior_diabetes': 'i4', 'prior_hypertension': 'i4', 'prior_cancer': 'i4', 'prior_chd': 'i4', 'prior_chf': 'i4', 
                         'diagCci_01_myocardial_freq': 'i4', 'diagCci_02_chf_freq': 'i4', 'diagCci_03_pvd_freq': 'i4', 'diagCci_04_cerebrovascular_freq': 'i4', 'diagCci_05_dementia_freq': 'i4', 'diagCci_06_cpd_freq': 'i4', 'diagCci_07_rheumatic_freq': 'i4', 'diagCci_08_ulcer_freq': 'i4', 'diagCci_09_liverMild_freq': 'i4', 'diagCci_10_diabetesNotChronic_freq': 'i4', 'diagCci_11_diabetesChronic_freq': 'i4', 'diagCci_12_hemiplegia_freq': 'i4', 'diagCci_13_renal_freq': 'i4', 'diagCci_14_malignancy_freq': 'i4', 'diagCci_15_liverSevere_freq': 'i4', 'diagCci_16_tumorSec_freq': 'i4', 'diagCci_17_aids_freq': 'i4', 'diagCci_18_depression_freq': 'i4', 'diagCci_19_cardiac_freq': 'i4', 'diagCci_20_valvular_freq': 'i4', 'diagCci_21_pulmonary_freq': 'i4', 'diagCci_22_vascular_freq': 'i4', 'diagCci_23_hypertensionNotComplicated_freq': 'i4', 'diagCci_24_hypertensionComplicated_freq': 'i4', 'diagCci_25_paralysis_freq': 'i4', 'diagCci_26_neuroOther_freq': 'i4', 'diagCci_27_pulmonaryChronic_freq': 'i4', 'diagCci_28_diabetesNotComplicated_freq': 'i4', 'diagCci_29_diabetesComplicated_freq': 'i4', 'diagCci_30_hypothyroidism_freq': 'i4', 'diagCci_31_renal_freq': 'i4', 'diagCci_32_liver_freq': 'i4', 'diagCci_33_ulcerNotBleeding_freq': 'i4', 'diagCci_34_psychoses_freq': 'i4', 'diagCci_35_lymphoma_freq': 'i4', 'diagCci_36_cancerSec_freq': 'i4', 'diagCci_37_tumorNotSec_freq': 'i4', 'diagCci_38_rheumatoid_freq': 'i4', 'diagCci_39_coagulopathy_freq': 'i4', 'diagCci_40_obesity_freq': 'i4', 'diagCci_41_weightLoss_freq': 'i4', 'diagCci_42_fluidDisorder_freq': 'i4', 'diagCci_43_bloodLoss_freq': 'i4', 'diagCci_44_anemia_freq': 'i4', 'diagCci_45_alcohol_freq': 'i4', 'diagCci_46_drug_freq': 'i4'}
features_extra_name = features_extra_dtypes.keys()

In [25]:
# Read the extra (benchmark) features from MySQL
features_extra = dict()
features_extra['train'] = readers_writers.load_mysql_table(schema, table, dataframing=True)
# Bug fix: DataFrame.astype returns a new frame -- the original discarded the result,
# leaving the columns with their as-read dtypes. Assign it back.
features_extra['train'] = features_extra['train'].astype(dtype=features_extra_dtypes)
# NOTE(review): 'test' aliases the same DataFrame object as 'train' (no copy);
# in-place edits through one key are visible through the other -- confirm intended.
features_extra['test'] = features_extra['train']

print("Number of columns: ", len(features_extra['train'].columns), "; Total records: ", len(features_extra['train'].index))


---------------------------------------------------------------------------
ProgrammingError                          Traceback (most recent call last)
c:\users\eagle\appdata\local\programs\python\python35\lib\site-packages\sqlalchemy\engine\base.py in _execute_context(self, dialect, constructor, statement, parameters, *args)
   1181                         parameters,
-> 1182                         context)
   1183         except BaseException as e:

c:\users\eagle\appdata\local\programs\python\python35\lib\site-packages\sqlalchemy\engine\default.py in do_execute(self, cursor, statement, parameters, context)
    469     def do_execute(self, cursor, statement, parameters, context=None):
--> 470         cursor.execute(statement, parameters)
    471 

c:\users\eagle\appdata\local\programs\python\python35\lib\site-packages\MySQLdb\cursors.py in execute(self, query, args)
    249             exc, value = sys.exc_info()[:2]
--> 250             self.errorhandler(self, exc, value)
    251         self._executed = query

c:\users\eagle\appdata\local\programs\python\python35\lib\site-packages\MySQLdb\connections.py in defaulterrorhandler(***failed resolving arguments***)
     41     if isinstance(errorvalue, BaseException):
---> 42         raise errorvalue
     43     if errorclass is not None:

c:\users\eagle\appdata\local\programs\python\python35\lib\site-packages\MySQLdb\cursors.py in execute(self, query, args)
    246         try:
--> 247             res = self._query(query)
    248         except Exception:

c:\users\eagle\appdata\local\programs\python\python35\lib\site-packages\MySQLdb\cursors.py in _query(self, q)
    410     def _query(self, q):
--> 411         rowcount = self._do_query(q)
    412         self._post_get_result()

c:\users\eagle\appdata\local\programs\python\python35\lib\site-packages\MySQLdb\cursors.py in _do_query(self, q)
    373         self._last_executed = q
--> 374         db.query(q)
    375         self._do_get_result()

c:\users\eagle\appdata\local\programs\python\python35\lib\site-packages\MySQLdb\connections.py in query(self, query)
    269         else:
--> 270             _mysql.connection.query(self, query)
    271 

ProgrammingError: (1146, "Table 'parr_sample_prototype.tcarer_featuresextra' doesn't exist")

The above exception was the direct cause of the following exception:

ProgrammingError                          Traceback (most recent call last)
<ipython-input-25-086a730ff34b> in <module>()
      1 # Read features from the MySQL
      2 features_extra = dict()
----> 3 features_extra['train'] = readers_writers.load_mysql_table(schema, table, dataframing=True)
      4 features_extra['train'].astype(dtype=features_extra_dtypes)
      5 features_extra['test'] = features_extra['train']

C:\Users\eagle\Documents\GitHub\Analytics_UoW\TCARER\ReadersWriters\ReadersWriters.py in load_mysql_table(db_schema, db_table, dataframing)
    323         """
    324         query = "SELECT * FROM " + db_table
--> 325         return ReadersWriters.load_mysql_query(query, db_schema, dataframing)
    326 
    327     @staticmethod

C:\Users\eagle\Documents\GitHub\Analytics_UoW\TCARER\ReadersWriters\ReadersWriters.py in load_mysql_query(query, db_schema, dataframing, batch, float_round_vars, float_round)
    345         engine = db.open()
    346         dbc = MysqlCommand(engine, db.db_session_vars)
--> 347         output = dbc.read(query, dataframing, batch, float_round_vars, float_round)
    348         db.close()
    349         return output

C:\Users\eagle\Documents\GitHub\Analytics_UoW\TCARER\ReadersWriters\_MysqlCommand.py in read(self, query, dataframing, batch, float_round_vars, float_round)
     71         self.__logger.debug("Reading from MySQL database.")
     72         if dataframing:
---> 73             result = self.__read_df(query, batch, float_round_vars, float_round)
     74         else:
     75             result = self.__read_arr(query)

C:\Users\eagle\Documents\GitHub\Analytics_UoW\TCARER\ReadersWriters\_MysqlCommand.py in __read_df(self, query, batch, float_round_vars, float_round)
    104 
    105                 if batch is None:
--> 106                     result = pds.read_sql(sql=query, con=conn, coerce_float=False, chunksize=batch)
    107                 else:
    108                     for df in pds.read_sql(sql=query, con=conn, coerce_float=False, chunksize=batch):

c:\users\eagle\appdata\local\programs\python\python35\lib\site-packages\pandas\io\sql.py in read_sql(sql, con, index_col, coerce_float, params, parse_dates, columns, chunksize)
    413             sql, index_col=index_col, params=params,
    414             coerce_float=coerce_float, parse_dates=parse_dates,
--> 415             chunksize=chunksize)
    416 
    417 

c:\users\eagle\appdata\local\programs\python\python35\lib\site-packages\pandas\io\sql.py in read_query(self, sql, index_col, coerce_float, parse_dates, params, chunksize)
   1082         args = _convert_params(sql, params)
   1083 
-> 1084         result = self.execute(*args)
   1085         columns = result.keys()
   1086 

c:\users\eagle\appdata\local\programs\python\python35\lib\site-packages\pandas\io\sql.py in execute(self, *args, **kwargs)
    973     def execute(self, *args, **kwargs):
    974         """Simple passthrough to SQLAlchemy connectable"""
--> 975         return self.connectable.execute(*args, **kwargs)
    976 
    977     def read_table(self, table_name, index_col=None, coerce_float=True,

c:\users\eagle\appdata\local\programs\python\python35\lib\site-packages\sqlalchemy\engine\base.py in execute(self, object, *multiparams, **params)
    937         """
    938         if isinstance(object, util.string_types[0]):
--> 939             return self._execute_text(object, multiparams, params)
    940         try:
    941             meth = object._execute_on_connection

c:\users\eagle\appdata\local\programs\python\python35\lib\site-packages\sqlalchemy\engine\base.py in _execute_text(self, statement, multiparams, params)
   1095             statement,
   1096             parameters,
-> 1097             statement, parameters
   1098         )
   1099         if self._has_events or self.engine._has_events:

c:\users\eagle\appdata\local\programs\python\python35\lib\site-packages\sqlalchemy\engine\base.py in _execute_context(self, dialect, constructor, statement, parameters, *args)
   1187                 parameters,
   1188                 cursor,
-> 1189                 context)
   1190 
   1191         if self._has_events or self.engine._has_events:

c:\users\eagle\appdata\local\programs\python\python35\lib\site-packages\sqlalchemy\engine\base.py in _handle_dbapi_exception(self, e, statement, parameters, cursor, context)
   1391                 util.raise_from_cause(
   1392                     sqlalchemy_exception,
-> 1393                     exc_info
   1394                 )
   1395             else:

c:\users\eagle\appdata\local\programs\python\python35\lib\site-packages\sqlalchemy\util\compat.py in raise_from_cause(exception, exc_info)
    201     exc_type, exc_value, exc_tb = exc_info
    202     cause = exc_value if exc_value is not exception else None
--> 203     reraise(type(exception), exception, tb=exc_tb, cause=cause)
    204 
    205 if py3k:

c:\users\eagle\appdata\local\programs\python\python35\lib\site-packages\sqlalchemy\util\compat.py in reraise(tp, value, tb, cause)
    184             value.__cause__ = cause
    185         if value.__traceback__ is not tb:
--> 186             raise value.with_traceback(tb)
    187         raise value
    188 

c:\users\eagle\appdata\local\programs\python\python35\lib\site-packages\sqlalchemy\engine\base.py in _execute_context(self, dialect, constructor, statement, parameters, *args)
   1180                         statement,
   1181                         parameters,
-> 1182                         context)
   1183         except BaseException as e:
   1184             self._handle_dbapi_exception(

c:\users\eagle\appdata\local\programs\python\python35\lib\site-packages\sqlalchemy\engine\default.py in do_execute(self, cursor, statement, parameters, context)
    468 
    469     def do_execute(self, cursor, statement, parameters, context=None):
--> 470         cursor.execute(statement, parameters)
    471 
    472     def do_execute_no_params(self, cursor, statement, context=None):

c:\users\eagle\appdata\local\programs\python\python35\lib\site-packages\MySQLdb\cursors.py in execute(self, query, args)
    248         except Exception:
    249             exc, value = sys.exc_info()[:2]
--> 250             self.errorhandler(self, exc, value)
    251         self._executed = query
    252         if not self._defer_warnings:

c:\users\eagle\appdata\local\programs\python\python35\lib\site-packages\MySQLdb\connections.py in defaulterrorhandler(***failed resolving arguments***)
     40     del connection
     41     if isinstance(errorvalue, BaseException):
---> 42         raise errorvalue
     43     if errorclass is not None:
     44         raise errorclass(errorvalue)

c:\users\eagle\appdata\local\programs\python\python35\lib\site-packages\MySQLdb\cursors.py in execute(self, query, args)
    245         res = None
    246         try:
--> 247             res = self._query(query)
    248         except Exception:
    249             exc, value = sys.exc_info()[:2]

c:\users\eagle\appdata\local\programs\python\python35\lib\site-packages\MySQLdb\cursors.py in _query(self, q)
    409 
    410     def _query(self, q):
--> 411         rowcount = self._do_query(q)
    412         self._post_get_result()
    413         return rowcount

c:\users\eagle\appdata\local\programs\python\python35\lib\site-packages\MySQLdb\cursors.py in _do_query(self, q)
    372         db = self._get_db()
    373         self._last_executed = q
--> 374         db.query(q)
    375         self._do_get_result()
    376         return self.rowcount

c:\users\eagle\appdata\local\programs\python\python35\lib\site-packages\MySQLdb\connections.py in query(self, query)
    268             self.read_query_result()
    269         else:
--> 270             _mysql.connection.query(self, query)
    271 
    272     def __enter__(self):

ProgrammingError: (_mysql_exceptions.ProgrammingError) (1146, "Table 'parr_sample_prototype.tcarer_featuresextra' doesn't exist") [SQL: 'SELECT * FROM tcarer_featuresExtra']

Replace NaN values that appear in the Charlson-Index feature


In [ ]:
# trigger_charlsonFoster may contain NaN; coerce those entries to zero in both samples
for sample in ("train", "test"):
    features_extra[sample].loc[:, "trigger_charlsonFoster"] = np.nan_to_num(features_extra[sample]["trigger_charlsonFoster"])

Combine (join by PatientID)


In [ ]:
# join the model scores and model features onto the extra features by patientID
for sample in ("train", "test"):
    combined = pd.concat([features[sample + '_id'], features[sample + '_target'],
                          pd.DataFrame({'score': training_method.model_predict[sample]['score']}),
                          features[sample + '_indep']], axis=1)
    features_extra[sample] = features_extra[sample].merge(combined, how="inner", on="patientID")

Clean-up


In [ ]:
# release the large features dictionary; downstream cells use features_extra only
features = None
gc.collect()



3. Charlson Index Model

3.1. Algorithm

Algorithm 1: Random Forest


In [ ]:
# Random Forest hyper-parameters for the Charlson benchmark model
charlson_method_name = "rfc"
kwargs = dict(n_estimators=20, criterion='gini', max_depth=None, min_samples_split=100,
              min_samples_leaf=50, min_weight_fraction_leaf=0.0, max_features='auto',
              max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=-1, random_state=None,
              verbose=0, warm_start=False, class_weight="balanced_subsample")

Algorithm 2: Logistic Regression


In [ ]:
# Logistic Regression hyper-parameters for the Charlson benchmark model
charlson_method_name = "lr"
kwargs = dict(penalty='l2', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1,
              class_weight=None, random_state=None, solver='liblinear', max_iter=100, multi_class='ovr',
              verbose=0, warm_start=False, n_jobs=-1)



3.2. Initialise


In [ ]:
# the benchmark model uses the Charlson comorbidity score as its only feature
charlson_features_names = ['trigger_charlsonFoster']

In [ ]:
# select the target variable for the Charlson benchmark
# NOTE(review): later fit/evaluation cells index with the global `target_feature`
# ("label365"), while this value only appears in the output file name -- confirm which is intended
charlson_target_feature = "label30"  # "label30" or "label365"

# output file name for the benchmark model
file_name = "report_Model_Charlson_" + charlson_method_name + "_" + charlson_target_feature

# initialise the training-method wrapper for the benchmark algorithm
charlson_training_method = TrainingMethod(charlson_method_name)

3.3. Fit

Fit Model


In [ ]:
o_summaries = dict()
# Fit the benchmark model on the Charlson feature alone.
# NOTE(review): the label used here is the global `target_feature` ("label365"),
# not `charlson_target_feature` ("label30") which only names the file -- confirm intended.
model = charlson_training_method.train(features_extra["train"][charlson_features_names], features_extra["train"][target_feature], **kwargs)
charlson_training_method.save_model(path=CONSTANTS.io_path, title=file_name)

In [ ]:
# load model
# charlson_training_method.load(path=CONSTANTS.io_path, title=file_name)

In [ ]:
# short summary of the fitted model (e.g. feature importances / coefficients)
o_summaries = charlson_training_method.train_summaries()

Fit Performance


In [ ]:
o_summaries = dict()
# produce predictions for the training sample (stored inside the training method)
model = charlson_training_method.predict(features_extra["train"][charlson_features_names], "train")

In [ ]:
# short summary
o_summaries = charlson_training_method.predict_summaries(pd.Series(features_extra["train"][target_feature]), "train")
print("ROC AUC:", o_summaries['roc_auc_score_1'], "\n", o_summaries['classification_report'])
for k in o_summaries.keys():
    print(k,  o_summaries[k])

3.4. Predict


In [ ]:
o_summaries = dict()
# produce predictions for the test sample (stored inside the training method)
model = charlson_training_method.predict(features_extra["test"][charlson_features_names], "test")

In [ ]:
# short summary
o_summaries = charlson_training_method.predict_summaries(pd.Series(features_extra["test"][target_feature]), "test")
print("ROC AUC:", o_summaries['roc_auc_score_1'], "\n", o_summaries['classification_report'])
for k in o_summaries.keys():
    print(k,  o_summaries[k])

3.5. Cross-Validate


In [ ]:
o_summaries = dict()
# NOTE(review): cross-validation is run on the *test* sample -- confirm this is
# intended rather than the training sample.
score = charlson_training_method.cross_validate(features_extra["test"][charlson_features_names], features_extra["test"][target_feature], 
                                             scoring="neg_mean_squared_error", cv=10)

In [ ]:
# per-fold cross-validation scores
o_summaries = charlson_training_method.cross_validate_summaries()
print("Scores: ", o_summaries)

3.6. Save


In [ ]:
charlson_training_method.save_model(path=CONSTANTS.io_path, title=file_name)



4. Features Statistics

4.1. Features Rank

It is produced during modelling

4.2. Descriptive Statistics

It is produced during modelling

4.3. Features Weights


In [ ]:
def features_importance_rank(fitting_method, ranking_file_name=None, rank_models=("rfc", "gbrt", "randLogit")):
    """Summarise feature importance: fitted weights plus optional per-trial rank scores.

    :param fitting_method: trained model wrapper exposing ``model_labels`` and
        ``train_summaries()["feature_importances_"]``.
    :param ranking_file_name: file-name prefix of the serialised ranking summaries;
        when None, only the fitting weights are reported.
    :param rank_models: iterable of ranking-model names to merge in
        (default is now a tuple -- a mutable list default is a Python anti-pattern).
    :return: DataFrame with one row per feature, sorted by fitting weight (descending),
        plus one importance and one order column per ranking model and trial.
    """
    # Fitting weights, highest first
    o_summaries = pd.DataFrame({"Name": fitting_method.model_labels,
                                "Fitting Weight": fitting_method.train_summaries()["feature_importances_"]},
                               index=fitting_method.model_labels)
    o_summaries = o_summaries.sort_values("Fitting Weight", ascending=False)
    o_summaries = o_summaries.reset_index(drop=True)

    # Merge per-trial importance/order columns for each ranking model
    if ranking_file_name is not None:
        for rank_model in rank_models:
            o_summaries_ranks = readers_writers.load_serialised_compressed(
                path=CONSTANTS.io_path, title=ranking_file_name + rank_model)
            for trial in range(len(o_summaries_ranks)):
                o_summaries_rank = pd.DataFrame(o_summaries_ranks[trial])
                o_summaries_rank.columns = ["Name", "Importance - " + rank_model + " - Trial_" + str(trial),
                                            "Order - " + rank_model + " - Trial_" + str(trial)]
                o_summaries = o_summaries.merge(o_summaries_rank, how="outer", on="Name")

    return o_summaries

In [ ]:
# prefix of the serialised per-model ranking summaries
file_name = "Step_07_Model_Train_model_rank_summaries_"

o_summaries = features_importance_rank(fitting_method=training_method, ranking_file_name=file_name, rank_models=rank_models)

# save the combined weights/ranks table and preview the top rows
file_name = "report_weights_ranks"
readers_writers.save_csv(path=CONSTANTS.io_path, title=file_name, data=o_summaries, append=False, extension="csv", header=o_summaries.columns)

display(o_summaries.head())



5. Model Performance

5.1. Performance Indicators


In [ ]:
# performance indicators to report for both samples
measures = ["accuracy_score", "precision_score", "recall_score",
            "roc_auc_score_1", "f1_score", "fbeta_score", "average_precision_score",  
            "log_loss", "zero_one_loss", "hamming_loss", "jaccard_similarity_score", "matthews_corrcoef"]

In [ ]:
# train-sample indicators
# NOTE(review): packing (measure, value) pairs through np.array coerces the values
# to strings; acceptable for display/CSV, but numeric dtype is lost
o_summaries = training_method.predict_summaries(features_extra["train"][target_feature], "train")
o_summaries = np.array([(m, o_summaries[m]) for m in measures])
report_performance = pd.DataFrame({"Measure": o_summaries[:, 0], 
                                   "Sample Train": o_summaries[:, 1], 
                                   "Sample Test": [None] * len(measures)})

# test-sample indicators
o_summaries = training_method.predict_summaries(features_extra["test"][target_feature], "test")
o_summaries = np.array([(m, o_summaries[m]) for m in measures])
report_performance["Sample Test"] = o_summaries[:, 1]

In [ ]:
# display the performance table and persist it as CSV
file_name = "report_performance_" + method_name + "_" + target_feature
display(report_performance)
readers_writers.save_csv(path=CONSTANTS.io_path, title=file_name, data=report_performance, append=False)

5.2. Population Statistics


In [ ]:
def population_statistics(df, diagnoses, cutpoints=(0.50, 0.60, 0.70, 0.80, 0.90), target=None):
    """Summarise the study population for a set of diagnosis/indicator columns.

    For every column name in ``diagnoses`` that is present in ``df``, count the
    label-positive sub-populations, summarise age & length-of-stay, report the
    confusion-matrix cells of the risk ``score`` at each cut-point, and break
    the population down by Charlson (Foster) comorbidity level.

    :param df: sample DataFrame; must contain the outcome column, ``score``,
        ``trigger_age``, ``trigger_los``, ``trigger_charlsonFoster`` and the
        prevalence/gender helper columns referenced below.
    :param diagnoses: list of column names to profile (absent ones are skipped).
    :param cutpoints: risk-score cut-points for the confusion-matrix counts.
        (Changed from a mutable list default to a tuple; iteration-only use,
        so behaviour is unchanged.)
    :param target: name of the binary outcome column; defaults to the
        notebook-level ``target_feature`` global for backward compatibility.
    :return: one-row-per-diagnosis summary DataFrame.
    """
    if target is None:
        target = target_feature  # fall back to the notebook-level global

    o_summaries = pd.DataFrame(columns=['Name'], index=diagnoses)
    o_summaries['Name'] = diagnoses

    def _count(mask):
        # Number of rows satisfying the boolean mask.
        return len(df.loc[mask].index)

    for diagnose in diagnoses:
        o_summaries.loc[diagnose, 'Total'] = len(df.index)
        if diagnose not in df:
            continue

        has_diag = df[diagnose] > 0    # rows with this diagnosis/indicator
        is_pos = df[target] > 0        # rows with a positive outcome label

        o_summaries.loc[diagnose, 'Total - diagnose'] = _count(has_diag)
        o_summaries.loc[diagnose, 'Total - diagnose - label_1'] = _count(has_diag & is_pos)
        for cnt in [1, 2, 3]:
            o_summaries.loc[diagnose, 'Emergency Readmission Rate - cnt ' + str(cnt)] = \
                _count(has_diag & (df['admimeth_0t30d_prevalence_' + str(cnt) + '_cnt'] > 0) & is_pos)
        o_summaries.loc[diagnose, 'Prior Spells'] = _count(has_diag & (df['prior_spells'] > 0) & is_pos)
        # NOTE(review): despite the column name this is a count, not a
        # percentage - preserved for backward compatibility of the reports.
        o_summaries.loc[diagnose, 'Male - perc'] = _count(has_diag & (df['gender_1'] > 0) & is_pos)

        # Age / LoS five-number summaries of the label-positive sub-population.
        for label, column in [('Age', 'trigger_age'), ('LoS', 'trigger_los')]:
            described = df.loc[has_diag & is_pos][column].describe(percentiles=[.25, .5, .75])
            for suffix, stat in [('IQR_min', 'min'), ('IQR_25', '25%'), ('IQR_50', '50%'),
                                 ('IQR_75', '75%'), ('IQR_max', 'max')]:
                o_summaries.loc[diagnose, label + ' - ' + suffix] = described[stat]

        # Confusion-matrix cells of the risk score at each cut-point.
        for cutpoint in cutpoints:
            above = df['score'] > cutpoint
            below = df['score'] <= cutpoint
            is_neg = df[target] == 0
            o_summaries.loc[diagnose, 'score - ' + str(cutpoint)] = _count(has_diag & above)
            o_summaries.loc[diagnose, 'TP - ' + str(cutpoint)] = _count(has_diag & is_pos & above)
            o_summaries.loc[diagnose, 'FP - ' + str(cutpoint)] = _count(has_diag & is_neg & above)
            o_summaries.loc[diagnose, 'FN - ' + str(cutpoint)] = _count(has_diag & is_pos & below)
            o_summaries.loc[diagnose, 'TN - ' + str(cutpoint)] = _count(has_diag & is_neg & below)

        # Charlson (Foster) comorbidity-level break-down.
        charlson = df["trigger_charlsonFoster"]
        for level, level_mask in [('0', charlson == 0), ('1', charlson == 1), ('2', charlson == 2),
                                  ('3', charlson == 3), ('4+', charlson >= 4)]:
            o_summaries.loc[diagnose, 'Charlson - ' + level] = _count(has_diag & level_mask)
            o_summaries.loc[diagnose, 'Charlson - ' + level + ' - label_1'] = \
                _count(has_diag & level_mask & is_pos)

        # True positives among Charlson-0, label-positive patients per cut-point.
        for cutpoint in cutpoints:
            o_summaries.loc[diagnose, 'Charlson - 0 - label_1 - TP - ' + str(cutpoint)] = \
                _count(has_diag & (charlson == 0) & is_pos & (df['score'] > cutpoint))

    return o_summaries

5.2.1. Most Prevalent Diagnoses Groups

Most prevalent diagnoses groups (30-day, 1-year readmission):

  • Total, Admissions, Emergency Readmission Rate, Prior Spells, Male (%), Age (IQR), LoS (IQR), TP, FP, FN, TN

In [ ]:
# Most prevalent CCS diagnosis-group counters: for each temporal window an
# "others" counter followed by 30 prevalence-ranked counters (same order as
# the original hand-written list).
diagnoses = []
for _window in ["0t30d", "30t90d", "90t180d", "180t365d", "365t730d"]:
    diagnoses.append("diagCCS_" + _window + "_others_cnt")
    diagnoses.extend("diagCCS_" + _window + "_prevalence_" + str(_rank) + "_cnt"
                     for _rank in range(1, 31))
# File-name prefix: report_population_prevalent_diagnoses_<method>_<target>_.
file_name = "report_population_prevalent_diagnoses_" + method_name + "_" + target_feature + "_"

In [ ]:
# Profile the prevalent CCS diagnosis groups on the train sample and save.
o_summaries = population_statistics(features_extra['train'], diagnoses)
readers_writers.save_csv(path=CONSTANTS.io_path, title=file_name + "train", data=o_summaries, append=False, extension="csv", header=o_summaries.columns)

# Same profile on the held-out test sample.
o_summaries = population_statistics(features_extra['test'], diagnoses)
readers_writers.save_csv(path=CONSTANTS.io_path, title=file_name + "test", data=o_summaries, append=False, extension="csv", header=o_summaries.columns)

5.2.2. Major Comorbidity Groups

Comorbidity diagnoses groups (30-day, 1-year readmission):

  • Total, Admissions, Emergency Readmission Rate, Prior Spells, Male (%), Age (IQR), LoS (IQR), TP, FP, FN, TN

In [ ]:
# Major prior-comorbidity indicator columns to profile.
diagnoses = ['prior_admiOther', 'prior_admiAcute', 'prior_spells', 'prior_asthma', 'prior_copd', 'prior_depression', 'prior_diabetes', 'prior_hypertension', 'prior_cancer', 'prior_chd', 'prior_chf']
# File-name prefix: report_population_comorbidity_diagnoses_<method>_<target>_.
file_name = "report_population_comorbidity_diagnoses_" + method_name + "_" + target_feature + "_"

In [ ]:
# Profile the comorbidity groups on the train sample and save.
o_summaries = population_statistics(features_extra['train'], diagnoses)
readers_writers.save_csv(path=CONSTANTS.io_path, title=file_name + "train", data=o_summaries, append=False, extension="csv", header=o_summaries.columns)

# Same profile on the held-out test sample.
o_summaries = population_statistics(features_extra['test'], diagnoses)
readers_writers.save_csv(path=CONSTANTS.io_path, title=file_name + "test", data=o_summaries, append=False, extension="csv", header=o_summaries.columns)

5.2.3. Charlson Comorbidity Groups

Charlson diagnoses groups (30-day, 1-year readmission):

  • Total, Admissions, Emergency Readmission Rate, Prior Spells, Male (%), Age (IQR), LoS (IQR), TP, FP, FN, TN

In [ ]:
# Charlson comorbidity-index frequency columns to profile. The names are
# irregular (numbered condition groups), so the list is kept explicit.
diagnoses = ['diagCci_01_myocardial_freq', 'diagCci_02_chf_freq', 'diagCci_03_pvd_freq', 'diagCci_04_cerebrovascular_freq', 'diagCci_05_dementia_freq', 'diagCci_06_cpd_freq', 'diagCci_07_rheumatic_freq', 'diagCci_08_ulcer_freq', 'diagCci_09_liverMild_freq', 'diagCci_10_diabetesNotChronic_freq', 'diagCci_11_diabetesChronic_freq', 'diagCci_12_hemiplegia_freq', 'diagCci_13_renal_freq', 'diagCci_14_malignancy_freq', 'diagCci_15_liverSevere_freq', 'diagCci_16_tumorSec_freq', 'diagCci_17_aids_freq', 'diagCci_18_depression_freq', 'diagCci_19_cardiac_freq', 'diagCci_20_valvular_freq', 'diagCci_21_pulmonary_freq', 'diagCci_22_vascular_freq', 'diagCci_23_hypertensionNotComplicated_freq', 'diagCci_24_hypertensionComplicated_freq', 'diagCci_25_paralysis_freq', 'diagCci_26_neuroOther_freq', 'diagCci_27_pulmonaryChronic_freq', 'diagCci_28_diabetesNotComplicated_freq', 'diagCci_29_diabetesComplicated_freq', 'diagCci_30_hypothyroidism_freq', 'diagCci_31_renal_freq', 'diagCci_32_liver_freq', 'diagCci_33_ulcerNotBleeding_freq', 'diagCci_34_psychoses_freq', 'diagCci_35_lymphoma_freq', 'diagCci_36_cancerSec_freq', 'diagCci_37_tumorNotSec_freq', 'diagCci_38_rheumatoid_freq', 'diagCci_39_coagulopathy_freq', 'diagCci_40_obesity_freq', 'diagCci_41_weightLoss_freq', 'diagCci_42_fluidDisorder_freq', 'diagCci_43_bloodLoss_freq', 'diagCci_44_anemia_freq', 'diagCci_45_alcohol_freq', 'diagCci_46_drug_freq']
# File-name prefix: report_population_charlson_diagnoses_<method>_<target>_.
file_name = "report_population_charlson_diagnoses_" + method_name + "_" + target_feature + "_"

In [ ]:
# Profile the Charlson groups on the train sample and save.
o_summaries = population_statistics(features_extra['train'], diagnoses)
readers_writers.save_csv(path=CONSTANTS.io_path, title=file_name + "train", data=o_summaries, append=False, extension="csv", header=o_summaries.columns)

# Same profile on the held-out test sample.
o_summaries = population_statistics(features_extra['test'], diagnoses)
readers_writers.save_csv(path=CONSTANTS.io_path, title=file_name + "test", data=o_summaries, append=False, extension="csv", header=o_summaries.columns)

5.2.4. Most Prevalent Operations

Most prevalent operations variables (30-day, 1-year readmission):

  • Total, Admissions, Emergency Readmission Rate, Prior Spells, Male (%), Age (IQR), LoS (IQR), TP, FP, FN, TN

In [ ]:
# Most prevalent OPCS level-1 operation counters: for each temporal window an
# "others" counter followed by 30 prevalence-ranked counters (same order as
# the original hand-written list).
diagnoses = []
for _window in ["0t30d", "30t90d", "90t180d", "180t365d", "365t730d"]:
    diagnoses.append("operOPCSL1_" + _window + "_others_cnt")
    diagnoses.extend("operOPCSL1_" + _window + "_prevalence_" + str(_rank) + "_cnt"
                     for _rank in range(1, 31))
# File-name prefix: report_population_operations_<method>_<target>_.
file_name = "report_population_operations_" + method_name + "_" + target_feature + "_"

In [ ]:
# Profile the prevalent operation groups on the train sample and save.
o_summaries = population_statistics(features_extra['train'], diagnoses)
readers_writers.save_csv(path=CONSTANTS.io_path, title=file_name + "train", data=o_summaries, append=False, extension="csv", header=o_summaries.columns)

# Same profile on the held-out test sample.
o_summaries = population_statistics(features_extra['test'], diagnoses)
readers_writers.save_csv(path=CONSTANTS.io_path, title=file_name + "test", data=o_summaries, append=False, extension="csv", header=o_summaries.columns)

5.2.5. Most Prevalent Main Speciality

Most prevalent main speciality variables (30-day, 1-year readmission):

  • Total, Admissions, Emergency Readmission Rate, Prior Spells, Male (%), Age (IQR), LoS (IQR), TP, FP, FN, TN

In [ ]:
# Most prevalent main-speciality counters: for each temporal window an
# "others" counter followed by 10 prevalence-ranked counters (same order as
# the original hand-written list).
diagnoses = []
for _window in ["0t30d", "30t90d", "90t180d", "180t365d", "365t730d"]:
    diagnoses.append("mainspef_" + _window + "_others_cnt")
    diagnoses.extend("mainspef_" + _window + "_prevalence_" + str(_rank) + "_cnt"
                     for _rank in range(1, 11))
file_name = "report_population_operations_" + method_name + "_" + target_feature + "_"

In [ ]:
# Profile the prevalent main-speciality groups on the train sample and save.
o_summaries = population_statistics(features_extra['train'], diagnoses)
readers_writers.save_csv(path=CONSTANTS.io_path, title=file_name + "train", data=o_summaries, append=False, extension="csv", header=o_summaries.columns)

# Same profile on the held-out test sample.
o_summaries = population_statistics(features_extra['test'], diagnoses)
readers_writers.save_csv(path=CONSTANTS.io_path, title=file_name + "test", data=o_summaries, append=False, extension="csv", header=o_summaries.columns)

5.2.6. Other Variables

Other variables (30-day, 1-year readmission):

  • Total, Admissions, Emergency Readmission Rate, Prior Spells, Male (%), Age (IQR), LoS (IQR), TP, FP, FN, TN

In [ ]:
# Average gap/duration variables, one entry per (measure, temporal window)
# pair, measure-major order (same order as the original hand-written list).
diagnoses = ["%s_%s_avg" % (_measure, _window)
             for _measure in ["gapDays", "epidur", "preopdur", "posopdur"]
             for _window in ["0t30d", "30t90d", "90t180d", "180t365d", "365t730d"]]
# File-name prefix: report_population_other_variables_<method>_<target>_.
file_name = "report_population_other_variables_" + method_name + "_" + target_feature + "_"

In [ ]:
# Profile the duration/gap averages on the train sample and save.
o_summaries = population_statistics(features_extra['train'], diagnoses)
readers_writers.save_csv(path=CONSTANTS.io_path, title=file_name + "train", data=o_summaries, append=False, extension="csv", header=o_summaries.columns)

# Same profile on the held-out test sample.
o_summaries = population_statistics(features_extra['test'], diagnoses)
readers_writers.save_csv(path=CONSTANTS.io_path, title=file_name + "test", data=o_summaries, append=False, extension="csv", header=o_summaries.columns)



5.3. Plots


In [ ]:
file_name = "report_population_" + method_name + "_" + target_feature + "_"

5.3.1. ROC


In [ ]:
# Plot the ROC curve of the test-sample predictions (Plots.roc returns the
# figure handle plus its summary statistics).
fig, summaries = plots.roc(training_method.model_predict["test"], features_extra["test"][target_feature], 
                           title="ROC Curve", lw=2)
display(fig)

In [ ]:
# save
# Save the ROC figure via its own handle: plt.savefig() acts on the *current*
# figure, which is not guaranteed to be `fig` when saving from a separate cell.
# The papertype/frameon arguments were no-op defaults (None) and have been
# removed from recent matplotlib releases, so they are dropped here.
fig.savefig(os.path.join(CONSTANTS.io_path, file_name + "_roc" + ".pdf"), 
            dpi=300, facecolor='w', edgecolor='w', orientation='portrait', format="pdf",
            transparent=False, bbox_inches=None, pad_inches=0.1)

5.3.2. Precision Recall


In [ ]:
# Plot the precision-recall curve of the test-sample predictions.
fig, summaries = plots.precision_recall(training_method.model_predict["test"], 
                                        features_extra["test"][target_feature], 
                                        title="Precision-Recall Curve", lw=2)
display(fig)

In [ ]:
# save
# Save the precision-recall figure via its own handle (plt.savefig targets the
# *current* figure, which may differ in a separate cell); the no-op
# papertype/frameon arguments were removed from recent matplotlib.
fig.savefig(os.path.join(CONSTANTS.io_path, file_name + "_precision_recall" + ".pdf"), 
            dpi=300, facecolor='w', edgecolor='w', orientation='portrait', format="pdf",
            transparent=False, bbox_inches=None, pad_inches=0.1)

5.3.3. Learning Curve


In [ ]:
# Plot the learning curve of the trained model on the test sample.
# cv=None falls back to the plot helper's default cross-validation split;
# n_jobs=-1 uses all available cores.
fig, summaries = plots.learning_curve(training_method.model_train, 
                                      features_extra["test"][features_names_selected], 
                                      features_extra["test"][target_feature],
                                      title="Learning Curve", ylim=None, cv=None, n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5))
display(fig)

In [ ]:
# save
# Save the learning-curve figure via its own handle (plt.savefig targets the
# *current* figure, which may differ in a separate cell); the no-op
# papertype/frameon arguments were removed from recent matplotlib.
fig.savefig(os.path.join(CONSTANTS.io_path, file_name + "_learning_curve" + ".pdf"), 
            dpi=300, facecolor='w', edgecolor='w', orientation='portrait', format="pdf",
            transparent=False, bbox_inches=None, pad_inches=0.1)

5.3.4. Validation Curve

Set the model's metadata


In [ ]:
# method metadata: hyper-parameter to sweep in the validation curve, per model.
if method_name == "lr":
    param_name = "clf__C"
    param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
elif method_name == "rfc":
    param_name = "max_features"
    param_range = range(1, 4, 1)  # reduced from range(1, 20, 1) for speed
elif method_name == "nn":
    param_name = "alpha"
    # Bug fix: range() only accepts integers; the previous float arguments
    # (1e4, 1e6, 9e4) raised a TypeError at runtime.
    param_range = range(10000, 1000000, 90000)
else:
    # Fail fast rather than silently leaving param_name/param_range undefined
    # (or stale from an earlier kernel run).
    raise ValueError("Unsupported method_name: " + str(method_name))

In [ ]:
# Plot the validation curve for the selected hyper-parameter sweep.
# Bug fix: the title previously read "Learning Curve" (copy-paste from the
# learning-curve cell above).
fig, summaries = plots.validation_curve(training_method.model_train, 
                                        features_extra["test"][features_names_selected], 
                                        features_extra["test"][target_feature],
                                        param_name, param_range, 
                                        title="Validation Curve", ylim=None, cv=None, lw=2, n_jobs=-1)
display(fig)

In [ ]:
# save
# Save the validation-curve figure via its own handle (plt.savefig targets the
# *current* figure, which may differ in a separate cell); the no-op
# papertype/frameon arguments were removed from recent matplotlib.
fig.savefig(os.path.join(CONSTANTS.io_path, file_name + "_validation_curve" + ".pdf"), 
            dpi=300, facecolor='w', edgecolor='w', orientation='portrait', format="pdf",
            transparent=False, bbox_inches=None, pad_inches=0.1)



Fin!