In [1]:
# reload modules
# Reload all modules (except those excluded by %aimport) every time before executing the Python code typed.
%load_ext autoreload
%autoreload 2
In [2]:
# import libraries
import logging
import os
import sys
import gc
import pandas as pd
import numpy as np
import random
import statistics
from datetime import datetime
from collections import OrderedDict
from sklearn import preprocessing
from scipy import stats
from IPython.display import display, HTML
from pprint import pprint
from pivottablejs import pivot_ui
from IPython.display import clear_output
import imblearn.over_sampling as oversampling
import matplotlib.pyplot as plt
In [4]:
# import local classes
from Configs.CONSTANTS import CONSTANTS
from Configs.Logger import Logger
from Features.Variables import Variables
from ReadersWriters.ReadersWriters import ReadersWriters
from Stats.PreProcess import PreProcess
from Stats.FeatureSelection import FeatureSelection
from Stats.TrainingMethod import TrainingMethod
from Stats.Plots import Plots
from Stats.Stats import Stats
In [5]:
# Check the interpreter
print("\nMake sure the correct Python interpreter is used!")
print(sys.version)
print("\nMake sure sys.path of the Python interpreter is correct!")
print(os.getcwd())
In [6]:
# init paths & directories
config_path = os.path.abspath("ConfigInputs/CONFIGURATIONS.ini")
io_path = os.path.abspath("../../tmp/TCARER/Basic_prototype")
schema = "parr_sample_prototype"
app_name = "T-CARER"
print("Output path:", io_path)
In [7]:
# init logs
if not os.path.exists(io_path):
    os.makedirs(io_path, exist_ok=True)
logger = Logger(path=io_path, app_name=app_name, ext="log")  # configure the application's log handlers
logger = logging.getLogger(app_name)  # then retrieve the configured logger by name
In [8]:
# init constants
CONSTANTS.set(io_path, app_name)
In [9]:
# initialise other classes
readers_writers = ReadersWriters()
plots = Plots()
In [10]:
# other Constant variables
submodel_name = "hesIp"
submodel_input_name = "tcarer_model_features_ip"
In [11]:
# set print settings
pd.set_option('display.width', 1600, 'display.max_colwidth', 800)
Common variables
Note: Make sure the following files are located at the input path.
Note: Create the extra features (run only once).
In [ ]:
# settings
feature_table = 'tcarer_features'
featureExtra_table = 'tcarer_featuresExtra'
In [ ]:
result = readers_writers.load_mysql_procedure("tcarer_set_featuresExtra", [feature_table, featureExtra_table], schema)
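A quick, optional sanity check before loading: the file names below are assumptions, taken from the load steps that follow.
In [ ]:
# hedged sanity check: confirm the pre-processed inputs exist at the input path
for f in ["Step_07_Features.bz2", "Step_07_Top_Features_rfc_adhoc.csv"]:
    print(f, "exists:", os.path.isfile(os.path.join(CONSTANTS.io_path, f)))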
In [16]:
# select the target variable
target_feature = "label365" # "label365", "label30"
method_name = "rfc" # "rfc", "gbrt", "randLogit", "wdnn"
rank_models = ["rfc"] # ["rfc", "gbrt", "randLogit"]
Load pre-processed features
In [17]:
file_name = "Step_07_Features"
features = readers_writers.load_serialised_compressed(path=CONSTANTS.io_path, title=file_name)
# print
print("File size: ", os.stat(os.path.join(CONSTANTS.io_path, file_name + ".bz2")).st_size)
print("Number of columns: ", len(features["train_indep"].columns))
print("features: {train: ", len(features["train_indep"]), ", test: ", len(features["test_indep"]), "}")
In [18]:
file_name = "Step_07_Top_Features_rfc_adhoc"
features_names_selected = readers_writers.load_csv(path=CONSTANTS.io_path, title=file_name, dataframing=False)[0]
features_names_selected = [f.replace("\n", "") for f in features_names_selected]
display(pd.DataFrame(features_names_selected))
Initialise
In [19]:
training_method = TrainingMethod(method_name)
# file name
file_name = "Step_09_Model_" + method_name + "_" + target_feature
Load the model
In [20]:
training_method.load(path=CONSTANTS.io_path, title=file_name)
In [21]:
class TrainingMethodTensorflow:
    # A thin wrapper that mimics the TrainingMethod prediction interface,
    # backed by pre-computed TensorFlow summaries.
    def __init__(self, summaries, features_names, num_features, cut_off, train_size, test_size):
        self.model_predict = {"train": {'score': [], 'model_labels': []},
                              "test": {'score': [], 'model_labels': []}}
        self.__stats = Stats()
        self.model_labels = features_names
        self.num_features = num_features
        # Other keys available in the summaries dictionary:
        # summaries["fit"]: "get_variable_names", "get_variable_value", "get_params", "export",
        #                   "params", "dnn_bias_", "dnn_weights_"
        # summaries["train"]["results"], summaries["test"]["results"]
        # Assumption: the fitted weights back train_summaries(); __weights was previously undefined.
        self.__weights = summaries["fit"]["dnn_weights_"]
        self.model_predict["train"]['pred'] = np.asarray([1 if i[1] >= cut_off else 0 for i in summaries["train"]["predict_proba"]][0:train_size])
        self.model_predict["test"]['pred'] = np.asarray([1 if i[1] >= cut_off else 0 for i in summaries["test"]["predict_proba"]][0:test_size])
        self.model_predict["train"]['score'] = np.asarray([i[1] for i in summaries["train"]["predict_proba"]][0:train_size])
        self.model_predict["test"]['score'] = np.asarray([i[1] for i in summaries["test"]["predict_proba"]][0:test_size])
        self.model_predict["train"]['score_0'] = np.asarray([i[0] for i in summaries["train"]["predict_proba"]][0:train_size])
        self.model_predict["test"]['score_0'] = np.asarray([i[0] for i in summaries["test"]["predict_proba"]][0:test_size])

    def train_summaries(self):
        return {"feature_importances_": self.__weights}

    def predict_summaries(self, feature_target, sample_name):
        return self.__stats.predict_summaries(self.model_predict[sample_name], feature_target)
In [22]:
file_name = "model_tensorflow_summaries_" + target_feature
summaries = readers_writers.load_serialised_compressed(path=CONSTANTS.io_path, title=file_name)
In [ ]:
num_features = 300
cut_off = 0.5
training_method = TrainingMethodTensorflow(summaries, features_names_selected, num_features, cut_off,
len(features["train_indep"].index), len(features["test_indep"].index))
Performance
In [23]:
# train
o_summaries = training_method.predict_summaries(features["train_target"][target_feature], "train")
for k in o_summaries.keys():
    print(k, o_summaries[k])
print("\n")
# test
o_summaries = training_method.predict_summaries(features["test_target"][target_feature], "test")
for k in o_summaries.keys():
    print(k, o_summaries[k])
Read the extra features
In [24]:
table = 'tcarer_featuresExtra'
features_extra_dtypes = {'patientID': 'U32', 'trigger_charlsonFoster': 'i4', 'trigger_los': 'i4', 'trigger_age': 'i4', 'prior_admiOther': 'i4', 'prior_admiAcute': 'i4',
'prior_spells': 'i4', 'prior_asthma': 'i4', 'prior_copd': 'i4', 'prior_depression': 'i4', 'prior_diabetes': 'i4', 'prior_hypertension': 'i4', 'prior_cancer': 'i4', 'prior_chd': 'i4', 'prior_chf': 'i4',
'diagCci_01_myocardial_freq': 'i4', 'diagCci_02_chf_freq': 'i4', 'diagCci_03_pvd_freq': 'i4', 'diagCci_04_cerebrovascular_freq': 'i4', 'diagCci_05_dementia_freq': 'i4', 'diagCci_06_cpd_freq': 'i4', 'diagCci_07_rheumatic_freq': 'i4', 'diagCci_08_ulcer_freq': 'i4', 'diagCci_09_liverMild_freq': 'i4', 'diagCci_10_diabetesNotChronic_freq': 'i4', 'diagCci_11_diabetesChronic_freq': 'i4', 'diagCci_12_hemiplegia_freq': 'i4', 'diagCci_13_renal_freq': 'i4', 'diagCci_14_malignancy_freq': 'i4', 'diagCci_15_liverSevere_freq': 'i4', 'diagCci_16_tumorSec_freq': 'i4', 'diagCci_17_aids_freq': 'i4', 'diagCci_18_depression_freq': 'i4', 'diagCci_19_cardiac_freq': 'i4', 'diagCci_20_valvular_freq': 'i4', 'diagCci_21_pulmonary_freq': 'i4', 'diagCci_22_vascular_freq': 'i4', 'diagCci_23_hypertensionNotComplicated_freq': 'i4', 'diagCci_24_hypertensionComplicated_freq': 'i4', 'diagCci_25_paralysis_freq': 'i4', 'diagCci_26_neuroOther_freq': 'i4', 'diagCci_27_pulmonaryChronic_freq': 'i4', 'diagCci_28_diabetesNotComplicated_freq': 'i4', 'diagCci_29_diabetesComplicated_freq': 'i4', 'diagCci_30_hypothyroidism_freq': 'i4', 'diagCci_31_renal_freq': 'i4', 'diagCci_32_liver_freq': 'i4', 'diagCci_33_ulcerNotBleeding_freq': 'i4', 'diagCci_34_psychoses_freq': 'i4', 'diagCci_35_lymphoma_freq': 'i4', 'diagCci_36_cancerSec_freq': 'i4', 'diagCci_37_tumorNotSec_freq': 'i4', 'diagCci_38_rheumatoid_freq': 'i4', 'diagCci_39_coagulopathy_freq': 'i4', 'diagCci_40_obesity_freq': 'i4', 'diagCci_41_weightLoss_freq': 'i4', 'diagCci_42_fluidDisorder_freq': 'i4', 'diagCci_43_bloodLoss_freq': 'i4', 'diagCci_44_anemia_freq': 'i4', 'diagCci_45_alcohol_freq': 'i4', 'diagCci_46_drug_freq': 'i4'}
features_extra_name = features_extra_dtypes.keys()
In [25]:
# Read features from the MySQL
features_extra = dict()
features_extra['train'] = readers_writers.load_mysql_table(schema, table, dataframing=True)
features_extra['train'] = features_extra['train'].astype(dtype=features_extra_dtypes)  # astype returns a copy
features_extra['test'] = features_extra['train'].copy()  # same source table; the train/test split happens in the patientID join below
print("Number of columns: ", len(features_extra['train'].columns), "; Total records: ", len(features_extra['train'].index))
Replace NaNs in the Charlson-Index feature
In [ ]:
features_extra['train'].loc[:, "trigger_charlsonFoster"] = np.nan_to_num(features_extra['train']["trigger_charlsonFoster"])
features_extra['test'].loc[:, "trigger_charlsonFoster"] = np.nan_to_num(features_extra['test']["trigger_charlsonFoster"])
Combine (join by PatientID)
In [ ]:
features_extra['train'] = features_extra['train'].merge(
pd.concat([features['train_id'], features['train_target'],
pd.DataFrame({'score': training_method.model_predict["train"]['score']}), features['train_indep']], axis=1),
how="inner", on="patientID")
features_extra['test'] = features_extra['test'].merge(
pd.concat([features['test_id'], features['test_target'],
pd.DataFrame({'score': training_method.model_predict["test"]['score']}), features['test_indep']], axis=1),
how="inner", on="patientID")
Clean-up
In [ ]:
features = None
gc.collect()
Algorithm 1: Random Forest
In [ ]:
charlson_method_name = "rfc"
kwargs = {"n_estimators": 20, "criterion": 'gini', "max_depth": None, "min_samples_split": 100,
"min_samples_leaf": 50, "min_weight_fraction_leaf": 0.0, "max_features": 'auto',
"max_leaf_nodes": None, "bootstrap": True, "oob_score": False, "n_jobs": -1, "random_state": None,
"verbose": 0, "warm_start": False, "class_weight": "balanced_subsample"}
Algorithm 2: Logistic Regression (alternative to Algorithm 1; run only one of the two settings cells, as each overwrites charlson_method_name and kwargs)
In [ ]:
charlson_method_name = "lr"
kwargs = {"penalty": 'l2', "dual": False, "tol": 0.0001, "C": 1, "fit_intercept": True, "intercept_scaling": 1,
"class_weight": None, "random_state": None, "solver": 'liblinear', "max_iter": 100, "multi_class": 'ovr',
"verbose": 0, "warm_start": False, "n_jobs": -1}
In [ ]:
# set features
charlson_features_names = ['trigger_charlsonFoster']
In [ ]:
# select the target variable
charlson_target_feature = "label30" # "label30", "label365"
# file name
file_name = "report_Model_Charlson_" + charlson_method_name + "_" + charlson_target_feature
# initialise
charlson_training_method = TrainingMethod(charlson_method_name)
Fit Model
In [ ]:
o_summaries = dict()
# Fit
model = charlson_training_method.train(features_extra["train"][charlson_features_names], features_extra["train"][charlson_target_feature], **kwargs)
charlson_training_method.save_model(path=CONSTANTS.io_path, title=file_name)
In [ ]:
# load model
# charlson_training_method.load(path=CONSTANTS.io_path, title=file_name)
In [ ]:
# short summary
o_summaries = charlson_training_method.train_summaries()
Fit Performance
In [ ]:
o_summaries = dict()
model = charlson_training_method.predict(features_extra["train"][charlson_features_names], "train")
In [ ]:
# short summary
o_summaries = charlson_training_method.predict_summaries(pd.Series(features_extra["train"][charlson_target_feature]), "train")
print("ROC AUC:", o_summaries['roc_auc_score_1'], "\n", o_summaries['classification_report'])
for k in o_summaries.keys():
    print(k, o_summaries[k])
In [ ]:
o_summaries = dict()
model = charlson_training_method.predict(features_extra["test"][charlson_features_names], "test")
In [ ]:
# short summary
o_summaries = charlson_training_method.predict_summaries(pd.Series(features_extra["test"][charlson_target_feature]), "test")
print("ROC AUC:", o_summaries['roc_auc_score_1'], "\n", o_summaries['classification_report'])
for k in o_summaries.keys():
    print(k, o_summaries[k])
In [ ]:
o_summaries = dict()
score = charlson_training_method.cross_validate(features_extra["test"][charlson_features_names], features_extra["test"][charlson_target_feature],
                                                scoring="neg_mean_squared_error", cv=10)
In [ ]:
# short summary
o_summaries = charlson_training_method.cross_validate_summaries()
print("Scores: ", o_summaries)
In [ ]:
charlson_training_method.save_model(path=CONSTANTS.io_path, title=file_name)
Feature Importance Ranking
Note: The feature-ranking summaries used below were produced during modelling.
In [ ]:
def features_importance_rank(fitting_method, ranking_file_name=None, rank_models=["rfc", "gbrt", "randLogit"]):
    # Fitting weight
    o_summaries = pd.DataFrame({"Name": fitting_method.model_labels,
                                "Fitting Weight": fitting_method.train_summaries()["feature_importances_"]},
                               index=fitting_method.model_labels)
    o_summaries = o_summaries.sort_values("Fitting Weight", ascending=False)
    o_summaries = o_summaries.reset_index(drop=True)
    # Ranking scores
    if ranking_file_name is not None:
        for rank_model in rank_models:
            o_summaries_ranks = readers_writers.load_serialised_compressed(
                path=CONSTANTS.io_path, title=ranking_file_name + rank_model)
            for trial in range(len(o_summaries_ranks)):
                o_summaries_rank = pd.DataFrame(o_summaries_ranks[trial])
                o_summaries_rank.columns = ["Name", "Importance - " + rank_model + " - Trial_" + str(trial),
                                            "Order - " + rank_model + " - Trial_" + str(trial)]
                o_summaries = o_summaries.merge(o_summaries_rank, how="outer", on="Name")
    return o_summaries
In [ ]:
file_name = "Step_07_Model_Train_model_rank_summaries_"
o_summaries = features_importance_rank(fitting_method=training_method, ranking_file_name=file_name, rank_models=rank_models)
file_name = "report_weights_ranks"
readers_writers.save_csv(path=CONSTANTS.io_path, title=file_name, data=o_summaries, append=False, extension="csv", header=o_summaries.columns)
display(o_summaries.head())
In [ ]:
measures = ["accuracy_score", "precision_score", "recall_score",
"roc_auc_score_1", "f1_score", "fbeta_score", "average_precision_score",
"log_loss", "zero_one_loss", "hamming_loss", "jaccard_similarity_score", "matthews_corrcoef"]
In [ ]:
# train
o_summaries = training_method.predict_summaries(features_extra["train"][target_feature], "train")
o_summaries = np.array([(m, o_summaries[m]) for m in measures])
report_performance = pd.DataFrame({"Measure": o_summaries[:, 0],
"Sample Train": o_summaries[:, 1],
"Sample Test": [None] * len(measures)})
# test
o_summaries = training_method.predict_summaries(features_extra["test"][target_feature], "test")
o_summaries = np.array([(m, o_summaries[m]) for m in measures])
report_performance["Sample Test"] = o_summaries[:, 1]
In [ ]:
# print
file_name = "report_performance_" + method_name + "_" + target_feature
display(report_performance)
readers_writers.save_csv(path=CONSTANTS.io_path, title=file_name, data=report_performance, append=False)
In [ ]:
def population_statistics(df, diagnoses, cutpoints=[0.50, 0.60, 0.70, 0.80, 0.90]):
    # note: relies on the global 'target_feature'
    o_summaries = pd.DataFrame(columns=['Name'], index=diagnoses)
    o_summaries['Name'] = diagnoses
    for diagnose in diagnoses:
        o_summaries.loc[diagnose, 'Total'] = len(df.index)
        if diagnose not in df:
            continue
        o_summaries.loc[diagnose, 'Total - diagnose'] = len(df.loc[(df[diagnose] > 0)].index)
        o_summaries.loc[diagnose, 'Total - diagnose - label_1'] = len(df.loc[(df[diagnose] > 0) & (df[target_feature] > 0)].index)
        o_summaries.loc[diagnose, 'Emergency Readmission Rate - cnt 1'] = len(df.loc[(df[diagnose] > 0) & (df['admimeth_0t30d_prevalence_1_cnt'] > 0) & (df[target_feature] > 0)].index)
        o_summaries.loc[diagnose, 'Emergency Readmission Rate - cnt 2'] = len(df.loc[(df[diagnose] > 0) & (df['admimeth_0t30d_prevalence_2_cnt'] > 0) & (df[target_feature] > 0)].index)
        o_summaries.loc[diagnose, 'Emergency Readmission Rate - cnt 3'] = len(df.loc[(df[diagnose] > 0) & (df['admimeth_0t30d_prevalence_3_cnt'] > 0) & (df[target_feature] > 0)].index)
        o_summaries.loc[diagnose, 'Prior Spells'] = len(df.loc[(df[diagnose] > 0) & (df['prior_spells'] > 0) & (df[target_feature] > 0)].index)
        # note: despite the 'perc' label, this is a raw count of males among the readmitted
        o_summaries.loc[diagnose, 'Male - perc'] = len(df.loc[(df[diagnose] > 0) & (df['gender_1'] > 0) & (df[target_feature] > 0)].index)
        age = df.loc[(df[diagnose] > 0) & (df[target_feature] > 0)]['trigger_age'].describe(percentiles=[.25, .5, .75])
        o_summaries.loc[diagnose, 'Age - IQR_min'] = age['min']
        o_summaries.loc[diagnose, 'Age - IQR_25'] = age['25%']
        o_summaries.loc[diagnose, 'Age - IQR_50'] = age['50%']
        o_summaries.loc[diagnose, 'Age - IQR_75'] = age['75%']
        o_summaries.loc[diagnose, 'Age - IQR_max'] = age['max']
        los = df.loc[(df[diagnose] > 0) & (df[target_feature] > 0)]['trigger_los'].describe(percentiles=[.25, .5, .75])
        o_summaries.loc[diagnose, 'LoS - IQR_min'] = los['min']
        o_summaries.loc[diagnose, 'LoS - IQR_25'] = los['25%']
        o_summaries.loc[diagnose, 'LoS - IQR_50'] = los['50%']
        o_summaries.loc[diagnose, 'LoS - IQR_75'] = los['75%']
        o_summaries.loc[diagnose, 'LoS - IQR_max'] = los['max']
        # confusion-matrix counts at each risk-score cutpoint
        for cutpoint in cutpoints:
            o_summaries.loc[diagnose, 'score - ' + str(cutpoint)] = len(df.loc[(df[diagnose] > 0) & (df['score'] > cutpoint)].index)
            o_summaries.loc[diagnose, 'TP - ' + str(cutpoint)] = len(df.loc[(df[diagnose] > 0) & (df[target_feature] > 0) & (df['score'] > cutpoint)].index)
            o_summaries.loc[diagnose, 'FP - ' + str(cutpoint)] = len(df.loc[(df[diagnose] > 0) & (df[target_feature] == 0) & (df['score'] > cutpoint)].index)
            o_summaries.loc[diagnose, 'FN - ' + str(cutpoint)] = len(df.loc[(df[diagnose] > 0) & (df[target_feature] > 0) & (df['score'] <= cutpoint)].index)
            o_summaries.loc[diagnose, 'TN - ' + str(cutpoint)] = len(df.loc[(df[diagnose] > 0) & (df[target_feature] == 0) & (df['score'] <= cutpoint)].index)
        o_summaries.loc[diagnose, 'Charlson - 0'] = len(df.loc[(df[diagnose] > 0) & (df["trigger_charlsonFoster"] == 0)].index)
        o_summaries.loc[diagnose, 'Charlson - 0 - label_1'] = len(df.loc[(df[diagnose] > 0) & (df["trigger_charlsonFoster"] == 0) & (df[target_feature] > 0)].index)
        o_summaries.loc[diagnose, 'Charlson - 1'] = len(df.loc[(df[diagnose] > 0) & (df["trigger_charlsonFoster"] == 1)].index)
        o_summaries.loc[diagnose, 'Charlson - 1 - label_1'] = len(df.loc[(df[diagnose] > 0) & (df["trigger_charlsonFoster"] == 1) & (df[target_feature] > 0)].index)
        o_summaries.loc[diagnose, 'Charlson - 2'] = len(df.loc[(df[diagnose] > 0) & (df["trigger_charlsonFoster"] == 2)].index)
        o_summaries.loc[diagnose, 'Charlson - 2 - label_1'] = len(df.loc[(df[diagnose] > 0) & (df["trigger_charlsonFoster"] == 2) & (df[target_feature] > 0)].index)
        o_summaries.loc[diagnose, 'Charlson - 3'] = len(df.loc[(df[diagnose] > 0) & (df["trigger_charlsonFoster"] == 3)].index)
        o_summaries.loc[diagnose, 'Charlson - 3 - label_1'] = len(df.loc[(df[diagnose] > 0) & (df["trigger_charlsonFoster"] == 3) & (df[target_feature] > 0)].index)
        o_summaries.loc[diagnose, 'Charlson - 4+'] = len(df.loc[(df[diagnose] > 0) & (df["trigger_charlsonFoster"] >= 4)].index)
        o_summaries.loc[diagnose, 'Charlson - 4+ - label_1'] = len(df.loc[(df[diagnose] > 0) & (df["trigger_charlsonFoster"] >= 4) & (df[target_feature] > 0)].index)
        for cutpoint in cutpoints:
            o_summaries.loc[diagnose, 'Charlson - 0 - label_1 - TP - ' + str(cutpoint)] = \
                len(df.loc[(df[diagnose] > 0) & (df["trigger_charlsonFoster"] == 0) & (df[target_feature] > 0) & (df['score'] > cutpoint)].index)
    return o_summaries
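A minimal toy example (not part of the pipeline; the 'toy_diag' column and all values below are made up) to illustrate the cutpoint logic: at cutpoint 0.5, a patient with the diagnosis counts as TP when the label is positive and the score exceeds the cutpoint, FP when the label is zero, and so on.
In [ ]:
# hedged toy demonstration of population_statistics on synthetic data
toy = pd.DataFrame({
    target_feature: [1, 0, 1, 0],
    'score': [0.9, 0.8, 0.3, 0.1],
    'toy_diag': [1, 1, 1, 0],
    'admimeth_0t30d_prevalence_1_cnt': [1, 0, 0, 0],
    'admimeth_0t30d_prevalence_2_cnt': [0, 0, 0, 0],
    'admimeth_0t30d_prevalence_3_cnt': [0, 0, 0, 0],
    'prior_spells': [2, 0, 1, 0],
    'gender_1': [1, 0, 0, 1],
    'trigger_age': [70, 55, 80, 60],
    'trigger_los': [3, 1, 10, 2],
    'trigger_charlsonFoster': [0, 1, 4, 2]})
# expected at cutpoint 0.5: TP = 1 (row 0), FP = 1 (row 1), FN = 1 (row 2), TN = 0
display(population_statistics(toy, ['toy_diag'], cutpoints=[0.5]))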
In [ ]:
diagnoses = ['diagCCS_0t30d_others_cnt', 'diagCCS_0t30d_prevalence_1_cnt', 'diagCCS_0t30d_prevalence_2_cnt', 'diagCCS_0t30d_prevalence_3_cnt', 'diagCCS_0t30d_prevalence_4_cnt', 'diagCCS_0t30d_prevalence_5_cnt', 'diagCCS_0t30d_prevalence_6_cnt', 'diagCCS_0t30d_prevalence_7_cnt', 'diagCCS_0t30d_prevalence_8_cnt', 'diagCCS_0t30d_prevalence_9_cnt', 'diagCCS_0t30d_prevalence_10_cnt', 'diagCCS_0t30d_prevalence_11_cnt', 'diagCCS_0t30d_prevalence_12_cnt', 'diagCCS_0t30d_prevalence_13_cnt', 'diagCCS_0t30d_prevalence_14_cnt', 'diagCCS_0t30d_prevalence_15_cnt', 'diagCCS_0t30d_prevalence_16_cnt', 'diagCCS_0t30d_prevalence_17_cnt', 'diagCCS_0t30d_prevalence_18_cnt', 'diagCCS_0t30d_prevalence_19_cnt', 'diagCCS_0t30d_prevalence_20_cnt', 'diagCCS_0t30d_prevalence_21_cnt', 'diagCCS_0t30d_prevalence_22_cnt', 'diagCCS_0t30d_prevalence_23_cnt', 'diagCCS_0t30d_prevalence_24_cnt', 'diagCCS_0t30d_prevalence_25_cnt', 'diagCCS_0t30d_prevalence_26_cnt', 'diagCCS_0t30d_prevalence_27_cnt', 'diagCCS_0t30d_prevalence_28_cnt', 'diagCCS_0t30d_prevalence_29_cnt', 'diagCCS_0t30d_prevalence_30_cnt'
, 'diagCCS_30t90d_others_cnt', 'diagCCS_30t90d_prevalence_1_cnt', 'diagCCS_30t90d_prevalence_2_cnt', 'diagCCS_30t90d_prevalence_3_cnt', 'diagCCS_30t90d_prevalence_4_cnt', 'diagCCS_30t90d_prevalence_5_cnt', 'diagCCS_30t90d_prevalence_6_cnt', 'diagCCS_30t90d_prevalence_7_cnt', 'diagCCS_30t90d_prevalence_8_cnt', 'diagCCS_30t90d_prevalence_9_cnt', 'diagCCS_30t90d_prevalence_10_cnt', 'diagCCS_30t90d_prevalence_11_cnt', 'diagCCS_30t90d_prevalence_12_cnt', 'diagCCS_30t90d_prevalence_13_cnt', 'diagCCS_30t90d_prevalence_14_cnt', 'diagCCS_30t90d_prevalence_15_cnt', 'diagCCS_30t90d_prevalence_16_cnt', 'diagCCS_30t90d_prevalence_17_cnt', 'diagCCS_30t90d_prevalence_18_cnt', 'diagCCS_30t90d_prevalence_19_cnt', 'diagCCS_30t90d_prevalence_20_cnt', 'diagCCS_30t90d_prevalence_21_cnt', 'diagCCS_30t90d_prevalence_22_cnt', 'diagCCS_30t90d_prevalence_23_cnt', 'diagCCS_30t90d_prevalence_24_cnt', 'diagCCS_30t90d_prevalence_25_cnt', 'diagCCS_30t90d_prevalence_26_cnt', 'diagCCS_30t90d_prevalence_27_cnt', 'diagCCS_30t90d_prevalence_28_cnt', 'diagCCS_30t90d_prevalence_29_cnt', 'diagCCS_30t90d_prevalence_30_cnt'
, 'diagCCS_90t180d_others_cnt', 'diagCCS_90t180d_prevalence_1_cnt', 'diagCCS_90t180d_prevalence_2_cnt', 'diagCCS_90t180d_prevalence_3_cnt', 'diagCCS_90t180d_prevalence_4_cnt', 'diagCCS_90t180d_prevalence_5_cnt', 'diagCCS_90t180d_prevalence_6_cnt', 'diagCCS_90t180d_prevalence_7_cnt', 'diagCCS_90t180d_prevalence_8_cnt', 'diagCCS_90t180d_prevalence_9_cnt', 'diagCCS_90t180d_prevalence_10_cnt', 'diagCCS_90t180d_prevalence_11_cnt', 'diagCCS_90t180d_prevalence_12_cnt', 'diagCCS_90t180d_prevalence_13_cnt', 'diagCCS_90t180d_prevalence_14_cnt', 'diagCCS_90t180d_prevalence_15_cnt', 'diagCCS_90t180d_prevalence_16_cnt', 'diagCCS_90t180d_prevalence_17_cnt', 'diagCCS_90t180d_prevalence_18_cnt', 'diagCCS_90t180d_prevalence_19_cnt', 'diagCCS_90t180d_prevalence_20_cnt', 'diagCCS_90t180d_prevalence_21_cnt', 'diagCCS_90t180d_prevalence_22_cnt', 'diagCCS_90t180d_prevalence_23_cnt', 'diagCCS_90t180d_prevalence_24_cnt', 'diagCCS_90t180d_prevalence_25_cnt', 'diagCCS_90t180d_prevalence_26_cnt', 'diagCCS_90t180d_prevalence_27_cnt', 'diagCCS_90t180d_prevalence_28_cnt', 'diagCCS_90t180d_prevalence_29_cnt', 'diagCCS_90t180d_prevalence_30_cnt'
, 'diagCCS_180t365d_others_cnt', 'diagCCS_180t365d_prevalence_1_cnt', 'diagCCS_180t365d_prevalence_2_cnt', 'diagCCS_180t365d_prevalence_3_cnt', 'diagCCS_180t365d_prevalence_4_cnt', 'diagCCS_180t365d_prevalence_5_cnt', 'diagCCS_180t365d_prevalence_6_cnt', 'diagCCS_180t365d_prevalence_7_cnt', 'diagCCS_180t365d_prevalence_8_cnt', 'diagCCS_180t365d_prevalence_9_cnt', 'diagCCS_180t365d_prevalence_10_cnt', 'diagCCS_180t365d_prevalence_11_cnt', 'diagCCS_180t365d_prevalence_12_cnt', 'diagCCS_180t365d_prevalence_13_cnt', 'diagCCS_180t365d_prevalence_14_cnt', 'diagCCS_180t365d_prevalence_15_cnt', 'diagCCS_180t365d_prevalence_16_cnt', 'diagCCS_180t365d_prevalence_17_cnt', 'diagCCS_180t365d_prevalence_18_cnt', 'diagCCS_180t365d_prevalence_19_cnt', 'diagCCS_180t365d_prevalence_20_cnt', 'diagCCS_180t365d_prevalence_21_cnt', 'diagCCS_180t365d_prevalence_22_cnt', 'diagCCS_180t365d_prevalence_23_cnt', 'diagCCS_180t365d_prevalence_24_cnt', 'diagCCS_180t365d_prevalence_25_cnt', 'diagCCS_180t365d_prevalence_26_cnt', 'diagCCS_180t365d_prevalence_27_cnt', 'diagCCS_180t365d_prevalence_28_cnt', 'diagCCS_180t365d_prevalence_29_cnt', 'diagCCS_180t365d_prevalence_30_cnt'
, 'diagCCS_365t730d_others_cnt', 'diagCCS_365t730d_prevalence_1_cnt', 'diagCCS_365t730d_prevalence_2_cnt', 'diagCCS_365t730d_prevalence_3_cnt', 'diagCCS_365t730d_prevalence_4_cnt', 'diagCCS_365t730d_prevalence_5_cnt', 'diagCCS_365t730d_prevalence_6_cnt', 'diagCCS_365t730d_prevalence_7_cnt', 'diagCCS_365t730d_prevalence_8_cnt', 'diagCCS_365t730d_prevalence_9_cnt', 'diagCCS_365t730d_prevalence_10_cnt', 'diagCCS_365t730d_prevalence_11_cnt', 'diagCCS_365t730d_prevalence_12_cnt', 'diagCCS_365t730d_prevalence_13_cnt', 'diagCCS_365t730d_prevalence_14_cnt', 'diagCCS_365t730d_prevalence_15_cnt', 'diagCCS_365t730d_prevalence_16_cnt', 'diagCCS_365t730d_prevalence_17_cnt', 'diagCCS_365t730d_prevalence_18_cnt', 'diagCCS_365t730d_prevalence_19_cnt', 'diagCCS_365t730d_prevalence_20_cnt', 'diagCCS_365t730d_prevalence_21_cnt', 'diagCCS_365t730d_prevalence_22_cnt', 'diagCCS_365t730d_prevalence_23_cnt', 'diagCCS_365t730d_prevalence_24_cnt', 'diagCCS_365t730d_prevalence_25_cnt', 'diagCCS_365t730d_prevalence_26_cnt', 'diagCCS_365t730d_prevalence_27_cnt', 'diagCCS_365t730d_prevalence_28_cnt', 'diagCCS_365t730d_prevalence_29_cnt', 'diagCCS_365t730d_prevalence_30_cnt']
file_name = "report_population_prevalent_diagnoses_" + method_name + "_" + target_feature + "_"
In [ ]:
o_summaries = population_statistics(features_extra['train'], diagnoses)
readers_writers.save_csv(path=CONSTANTS.io_path, title=file_name + "train", data=o_summaries, append=False, extension="csv", header=o_summaries.columns)
o_summaries = population_statistics(features_extra['test'], diagnoses)
readers_writers.save_csv(path=CONSTANTS.io_path, title=file_name + "test", data=o_summaries, append=False, extension="csv", header=o_summaries.columns)
In [ ]:
diagnoses = ['prior_admiOther', 'prior_admiAcute', 'prior_spells', 'prior_asthma', 'prior_copd', 'prior_depression', 'prior_diabetes', 'prior_hypertension', 'prior_cancer', 'prior_chd', 'prior_chf']
file_name = "report_population_comorbidity_diagnoses_" + method_name + "_" + target_feature + "_"
In [ ]:
o_summaries = population_statistics(features_extra['train'], diagnoses)
readers_writers.save_csv(path=CONSTANTS.io_path, title=file_name + "train", data=o_summaries, append=False, extension="csv", header=o_summaries.columns)
o_summaries = population_statistics(features_extra['test'], diagnoses)
readers_writers.save_csv(path=CONSTANTS.io_path, title=file_name + "test", data=o_summaries, append=False, extension="csv", header=o_summaries.columns)
In [ ]:
diagnoses = ['diagCci_01_myocardial_freq', 'diagCci_02_chf_freq', 'diagCci_03_pvd_freq', 'diagCci_04_cerebrovascular_freq', 'diagCci_05_dementia_freq', 'diagCci_06_cpd_freq', 'diagCci_07_rheumatic_freq', 'diagCci_08_ulcer_freq', 'diagCci_09_liverMild_freq', 'diagCci_10_diabetesNotChronic_freq', 'diagCci_11_diabetesChronic_freq', 'diagCci_12_hemiplegia_freq', 'diagCci_13_renal_freq', 'diagCci_14_malignancy_freq', 'diagCci_15_liverSevere_freq', 'diagCci_16_tumorSec_freq', 'diagCci_17_aids_freq', 'diagCci_18_depression_freq', 'diagCci_19_cardiac_freq', 'diagCci_20_valvular_freq', 'diagCci_21_pulmonary_freq', 'diagCci_22_vascular_freq', 'diagCci_23_hypertensionNotComplicated_freq', 'diagCci_24_hypertensionComplicated_freq', 'diagCci_25_paralysis_freq', 'diagCci_26_neuroOther_freq', 'diagCci_27_pulmonaryChronic_freq', 'diagCci_28_diabetesNotComplicated_freq', 'diagCci_29_diabetesComplicated_freq', 'diagCci_30_hypothyroidism_freq', 'diagCci_31_renal_freq', 'diagCci_32_liver_freq', 'diagCci_33_ulcerNotBleeding_freq', 'diagCci_34_psychoses_freq', 'diagCci_35_lymphoma_freq', 'diagCci_36_cancerSec_freq', 'diagCci_37_tumorNotSec_freq', 'diagCci_38_rheumatoid_freq', 'diagCci_39_coagulopathy_freq', 'diagCci_40_obesity_freq', 'diagCci_41_weightLoss_freq', 'diagCci_42_fluidDisorder_freq', 'diagCci_43_bloodLoss_freq', 'diagCci_44_anemia_freq', 'diagCci_45_alcohol_freq', 'diagCci_46_drug_freq']
file_name = "report_population_charlson_diagnoses_" + method_name + "_" + target_feature + "_"
In [ ]:
o_summaries = population_statistics(features_extra['train'], diagnoses)
readers_writers.save_csv(path=CONSTANTS.io_path, title=file_name + "train", data=o_summaries, append=False, extension="csv", header=o_summaries.columns)
o_summaries = population_statistics(features_extra['test'], diagnoses)
readers_writers.save_csv(path=CONSTANTS.io_path, title=file_name + "test", data=o_summaries, append=False, extension="csv", header=o_summaries.columns)
In [ ]:
diagnoses = ['operOPCSL1_0t30d_others_cnt', 'operOPCSL1_0t30d_prevalence_1_cnt', 'operOPCSL1_0t30d_prevalence_2_cnt', 'operOPCSL1_0t30d_prevalence_3_cnt', 'operOPCSL1_0t30d_prevalence_4_cnt', 'operOPCSL1_0t30d_prevalence_5_cnt', 'operOPCSL1_0t30d_prevalence_6_cnt', 'operOPCSL1_0t30d_prevalence_7_cnt', 'operOPCSL1_0t30d_prevalence_8_cnt', 'operOPCSL1_0t30d_prevalence_9_cnt', 'operOPCSL1_0t30d_prevalence_10_cnt', 'operOPCSL1_0t30d_prevalence_11_cnt', 'operOPCSL1_0t30d_prevalence_12_cnt', 'operOPCSL1_0t30d_prevalence_13_cnt', 'operOPCSL1_0t30d_prevalence_14_cnt', 'operOPCSL1_0t30d_prevalence_15_cnt', 'operOPCSL1_0t30d_prevalence_16_cnt', 'operOPCSL1_0t30d_prevalence_17_cnt', 'operOPCSL1_0t30d_prevalence_18_cnt', 'operOPCSL1_0t30d_prevalence_19_cnt', 'operOPCSL1_0t30d_prevalence_20_cnt', 'operOPCSL1_0t30d_prevalence_21_cnt', 'operOPCSL1_0t30d_prevalence_22_cnt', 'operOPCSL1_0t30d_prevalence_23_cnt', 'operOPCSL1_0t30d_prevalence_24_cnt', 'operOPCSL1_0t30d_prevalence_25_cnt', 'operOPCSL1_0t30d_prevalence_26_cnt', 'operOPCSL1_0t30d_prevalence_27_cnt', 'operOPCSL1_0t30d_prevalence_28_cnt', 'operOPCSL1_0t30d_prevalence_29_cnt', 'operOPCSL1_0t30d_prevalence_30_cnt'
, 'operOPCSL1_30t90d_others_cnt', 'operOPCSL1_30t90d_prevalence_1_cnt', 'operOPCSL1_30t90d_prevalence_2_cnt', 'operOPCSL1_30t90d_prevalence_3_cnt', 'operOPCSL1_30t90d_prevalence_4_cnt', 'operOPCSL1_30t90d_prevalence_5_cnt', 'operOPCSL1_30t90d_prevalence_6_cnt', 'operOPCSL1_30t90d_prevalence_7_cnt', 'operOPCSL1_30t90d_prevalence_8_cnt', 'operOPCSL1_30t90d_prevalence_9_cnt', 'operOPCSL1_30t90d_prevalence_10_cnt', 'operOPCSL1_30t90d_prevalence_11_cnt', 'operOPCSL1_30t90d_prevalence_12_cnt', 'operOPCSL1_30t90d_prevalence_13_cnt', 'operOPCSL1_30t90d_prevalence_14_cnt', 'operOPCSL1_30t90d_prevalence_15_cnt', 'operOPCSL1_30t90d_prevalence_16_cnt', 'operOPCSL1_30t90d_prevalence_17_cnt', 'operOPCSL1_30t90d_prevalence_18_cnt', 'operOPCSL1_30t90d_prevalence_19_cnt', 'operOPCSL1_30t90d_prevalence_20_cnt', 'operOPCSL1_30t90d_prevalence_21_cnt', 'operOPCSL1_30t90d_prevalence_22_cnt', 'operOPCSL1_30t90d_prevalence_23_cnt', 'operOPCSL1_30t90d_prevalence_24_cnt', 'operOPCSL1_30t90d_prevalence_25_cnt', 'operOPCSL1_30t90d_prevalence_26_cnt', 'operOPCSL1_30t90d_prevalence_27_cnt', 'operOPCSL1_30t90d_prevalence_28_cnt', 'operOPCSL1_30t90d_prevalence_29_cnt', 'operOPCSL1_30t90d_prevalence_30_cnt'
, 'operOPCSL1_90t180d_others_cnt', 'operOPCSL1_90t180d_prevalence_1_cnt', 'operOPCSL1_90t180d_prevalence_2_cnt', 'operOPCSL1_90t180d_prevalence_3_cnt', 'operOPCSL1_90t180d_prevalence_4_cnt', 'operOPCSL1_90t180d_prevalence_5_cnt', 'operOPCSL1_90t180d_prevalence_6_cnt', 'operOPCSL1_90t180d_prevalence_7_cnt', 'operOPCSL1_90t180d_prevalence_8_cnt', 'operOPCSL1_90t180d_prevalence_9_cnt', 'operOPCSL1_90t180d_prevalence_10_cnt', 'operOPCSL1_90t180d_prevalence_11_cnt', 'operOPCSL1_90t180d_prevalence_12_cnt', 'operOPCSL1_90t180d_prevalence_13_cnt', 'operOPCSL1_90t180d_prevalence_14_cnt', 'operOPCSL1_90t180d_prevalence_15_cnt', 'operOPCSL1_90t180d_prevalence_16_cnt', 'operOPCSL1_90t180d_prevalence_17_cnt', 'operOPCSL1_90t180d_prevalence_18_cnt', 'operOPCSL1_90t180d_prevalence_19_cnt', 'operOPCSL1_90t180d_prevalence_20_cnt', 'operOPCSL1_90t180d_prevalence_21_cnt', 'operOPCSL1_90t180d_prevalence_22_cnt', 'operOPCSL1_90t180d_prevalence_23_cnt', 'operOPCSL1_90t180d_prevalence_24_cnt', 'operOPCSL1_90t180d_prevalence_25_cnt', 'operOPCSL1_90t180d_prevalence_26_cnt', 'operOPCSL1_90t180d_prevalence_27_cnt', 'operOPCSL1_90t180d_prevalence_28_cnt', 'operOPCSL1_90t180d_prevalence_29_cnt', 'operOPCSL1_90t180d_prevalence_30_cnt'
, 'operOPCSL1_180t365d_others_cnt', 'operOPCSL1_180t365d_prevalence_1_cnt', 'operOPCSL1_180t365d_prevalence_2_cnt', 'operOPCSL1_180t365d_prevalence_3_cnt', 'operOPCSL1_180t365d_prevalence_4_cnt', 'operOPCSL1_180t365d_prevalence_5_cnt', 'operOPCSL1_180t365d_prevalence_6_cnt', 'operOPCSL1_180t365d_prevalence_7_cnt', 'operOPCSL1_180t365d_prevalence_8_cnt', 'operOPCSL1_180t365d_prevalence_9_cnt', 'operOPCSL1_180t365d_prevalence_10_cnt', 'operOPCSL1_180t365d_prevalence_11_cnt', 'operOPCSL1_180t365d_prevalence_12_cnt', 'operOPCSL1_180t365d_prevalence_13_cnt', 'operOPCSL1_180t365d_prevalence_14_cnt', 'operOPCSL1_180t365d_prevalence_15_cnt', 'operOPCSL1_180t365d_prevalence_16_cnt', 'operOPCSL1_180t365d_prevalence_17_cnt', 'operOPCSL1_180t365d_prevalence_18_cnt', 'operOPCSL1_180t365d_prevalence_19_cnt', 'operOPCSL1_180t365d_prevalence_20_cnt', 'operOPCSL1_180t365d_prevalence_21_cnt', 'operOPCSL1_180t365d_prevalence_22_cnt', 'operOPCSL1_180t365d_prevalence_23_cnt', 'operOPCSL1_180t365d_prevalence_24_cnt', 'operOPCSL1_180t365d_prevalence_25_cnt', 'operOPCSL1_180t365d_prevalence_26_cnt', 'operOPCSL1_180t365d_prevalence_27_cnt', 'operOPCSL1_180t365d_prevalence_28_cnt', 'operOPCSL1_180t365d_prevalence_29_cnt', 'operOPCSL1_180t365d_prevalence_30_cnt'
, 'operOPCSL1_365t730d_others_cnt', 'operOPCSL1_365t730d_prevalence_1_cnt', 'operOPCSL1_365t730d_prevalence_2_cnt', 'operOPCSL1_365t730d_prevalence_3_cnt', 'operOPCSL1_365t730d_prevalence_4_cnt', 'operOPCSL1_365t730d_prevalence_5_cnt', 'operOPCSL1_365t730d_prevalence_6_cnt', 'operOPCSL1_365t730d_prevalence_7_cnt', 'operOPCSL1_365t730d_prevalence_8_cnt', 'operOPCSL1_365t730d_prevalence_9_cnt', 'operOPCSL1_365t730d_prevalence_10_cnt', 'operOPCSL1_365t730d_prevalence_11_cnt', 'operOPCSL1_365t730d_prevalence_12_cnt', 'operOPCSL1_365t730d_prevalence_13_cnt', 'operOPCSL1_365t730d_prevalence_14_cnt', 'operOPCSL1_365t730d_prevalence_15_cnt', 'operOPCSL1_365t730d_prevalence_16_cnt', 'operOPCSL1_365t730d_prevalence_17_cnt', 'operOPCSL1_365t730d_prevalence_18_cnt', 'operOPCSL1_365t730d_prevalence_19_cnt', 'operOPCSL1_365t730d_prevalence_20_cnt', 'operOPCSL1_365t730d_prevalence_21_cnt', 'operOPCSL1_365t730d_prevalence_22_cnt', 'operOPCSL1_365t730d_prevalence_23_cnt', 'operOPCSL1_365t730d_prevalence_24_cnt', 'operOPCSL1_365t730d_prevalence_25_cnt', 'operOPCSL1_365t730d_prevalence_26_cnt', 'operOPCSL1_365t730d_prevalence_27_cnt', 'operOPCSL1_365t730d_prevalence_28_cnt', 'operOPCSL1_365t730d_prevalence_29_cnt', 'operOPCSL1_365t730d_prevalence_30_cnt']
file_name = "report_population_operations_" + method_name + "_" + target_feature + "_"
In [ ]:
o_summaries = population_statistics(features_extra['train'], diagnoses)
readers_writers.save_csv(path=CONSTANTS.io_path, title=file_name + "train", data=o_summaries, append=False, extension="csv", header=o_summaries.columns)
o_summaries = population_statistics(features_extra['test'], diagnoses)
readers_writers.save_csv(path=CONSTANTS.io_path, title=file_name + "test", data=o_summaries, append=False, extension="csv", header=o_summaries.columns)
In [ ]:
diagnoses = ['mainspef_0t30d_others_cnt', 'mainspef_0t30d_prevalence_1_cnt', 'mainspef_0t30d_prevalence_2_cnt', 'mainspef_0t30d_prevalence_3_cnt', 'mainspef_0t30d_prevalence_4_cnt', 'mainspef_0t30d_prevalence_5_cnt', 'mainspef_0t30d_prevalence_6_cnt', 'mainspef_0t30d_prevalence_7_cnt', 'mainspef_0t30d_prevalence_8_cnt', 'mainspef_0t30d_prevalence_9_cnt', 'mainspef_0t30d_prevalence_10_cnt'
, 'mainspef_30t90d_others_cnt', 'mainspef_30t90d_prevalence_1_cnt', 'mainspef_30t90d_prevalence_2_cnt', 'mainspef_30t90d_prevalence_3_cnt', 'mainspef_30t90d_prevalence_4_cnt', 'mainspef_30t90d_prevalence_5_cnt', 'mainspef_30t90d_prevalence_6_cnt', 'mainspef_30t90d_prevalence_7_cnt', 'mainspef_30t90d_prevalence_8_cnt', 'mainspef_30t90d_prevalence_9_cnt', 'mainspef_30t90d_prevalence_10_cnt'
, 'mainspef_90t180d_others_cnt', 'mainspef_90t180d_prevalence_1_cnt', 'mainspef_90t180d_prevalence_2_cnt', 'mainspef_90t180d_prevalence_3_cnt', 'mainspef_90t180d_prevalence_4_cnt', 'mainspef_90t180d_prevalence_5_cnt', 'mainspef_90t180d_prevalence_6_cnt', 'mainspef_90t180d_prevalence_7_cnt', 'mainspef_90t180d_prevalence_8_cnt', 'mainspef_90t180d_prevalence_9_cnt', 'mainspef_90t180d_prevalence_10_cnt'
, 'mainspef_180t365d_others_cnt', 'mainspef_180t365d_prevalence_1_cnt', 'mainspef_180t365d_prevalence_2_cnt', 'mainspef_180t365d_prevalence_3_cnt', 'mainspef_180t365d_prevalence_4_cnt', 'mainspef_180t365d_prevalence_5_cnt', 'mainspef_180t365d_prevalence_6_cnt', 'mainspef_180t365d_prevalence_7_cnt', 'mainspef_180t365d_prevalence_8_cnt', 'mainspef_180t365d_prevalence_9_cnt', 'mainspef_180t365d_prevalence_10_cnt'
, 'mainspef_365t730d_others_cnt', 'mainspef_365t730d_prevalence_1_cnt', 'mainspef_365t730d_prevalence_2_cnt', 'mainspef_365t730d_prevalence_3_cnt', 'mainspef_365t730d_prevalence_4_cnt', 'mainspef_365t730d_prevalence_5_cnt', 'mainspef_365t730d_prevalence_6_cnt', 'mainspef_365t730d_prevalence_7_cnt', 'mainspef_365t730d_prevalence_8_cnt', 'mainspef_365t730d_prevalence_9_cnt', 'mainspef_365t730d_prevalence_10_cnt']
file_name = "report_population_operations_" + method_name + "_" + target_feature + "_"
In [ ]:
o_summaries = population_statistics(features_extra['train'], diagnoses)
readers_writers.save_csv(path=CONSTANTS.io_path, title=file_name + "train", data=o_summaries, append=False, extension="csv", header=o_summaries.columns)
o_summaries = population_statistics(features_extra['test'], diagnoses)
readers_writers.save_csv(path=CONSTANTS.io_path, title=file_name + "test", data=o_summaries, append=False, extension="csv", header=o_summaries.columns)
In [ ]:
diagnoses = ['gapDays_0t30d_avg', 'gapDays_30t90d_avg', 'gapDays_90t180d_avg', 'gapDays_180t365d_avg', 'gapDays_365t730d_avg',
'epidur_0t30d_avg', 'epidur_30t90d_avg', 'epidur_90t180d_avg', 'epidur_180t365d_avg', 'epidur_365t730d_avg',
'preopdur_0t30d_avg', 'preopdur_30t90d_avg', 'preopdur_90t180d_avg', 'preopdur_180t365d_avg', 'preopdur_365t730d_avg',
'posopdur_0t30d_avg', 'posopdur_30t90d_avg', 'posopdur_90t180d_avg', 'posopdur_180t365d_avg', 'posopdur_365t730d_avg']
file_name = "report_population_other_variables_" + method_name + "_" + target_feature + "_"
In [ ]:
o_summaries = population_statistics(features_extra['train'], diagnoses)
readers_writers.save_csv(path=CONSTANTS.io_path, title=file_name + "train", data=o_summaries, append=False, extension="csv", header=o_summaries.columns)
o_summaries = population_statistics(features_extra['test'], diagnoses)
readers_writers.save_csv(path=CONSTANTS.io_path, title=file_name + "test", data=o_summaries, append=False, extension="csv", header=o_summaries.columns)
In [ ]:
file_name = "report_population_" + method_name + "_" + target_feature + "_"
In [ ]:
fig, summaries = plots.roc(training_method.model_predict["test"], features_extra["test"][target_feature],
title="ROC Curve", lw=2)
display(fig)
In [ ]:
# save
# use fig.savefig to save the displayed figure; the deprecated 'papertype'/'frameon' kwargs are dropped
fig.savefig(os.path.join(CONSTANTS.io_path, file_name + "_roc" + ".pdf"),
            dpi=300, facecolor='w', edgecolor='w', orientation='portrait', format="pdf",
            transparent=False, bbox_inches=None, pad_inches=0.1)
In [ ]:
fig, summaries = plots.precision_recall(training_method.model_predict["test"],
features_extra["test"][target_feature],
title="Precision-Recall Curve", lw=2)
display(fig)
In [ ]:
# save
fig.savefig(os.path.join(CONSTANTS.io_path, file_name + "_precision_recall" + ".pdf"),
            dpi=300, facecolor='w', edgecolor='w', orientation='portrait', format="pdf",
            transparent=False, bbox_inches=None, pad_inches=0.1)
In [ ]:
fig, summaries = plots.learning_curve(training_method.model_train,
features_extra["test"][features_names_selected],
features_extra["test"][target_feature],
title="Learning Curve", ylim=None, cv=None, n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5))
display(fig)
In [ ]:
# save
fig.savefig(os.path.join(CONSTANTS.io_path, file_name + "_learning_curve" + ".pdf"),
            dpi=300, facecolor='w', edgecolor='w', orientation='portrait', format="pdf",
            transparent=False, bbox_inches=None, pad_inches=0.1)
Set the model's metadata
In [ ]:
# method metadata
if method_name == "lr":
    param_name = "clf__C"
    param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
elif method_name == "rfc":
    param_name = "max_features"
    param_range = range(1, 4, 1)  # range(1, 20, 1)
elif method_name == "nn":
    param_name = "alpha"
    param_range = np.arange(1e4, 1e6, 9e4)  # range() rejects floats; use np.arange
In [ ]:
fig, summaries = plots.validation_curve(training_method.model_train,
features_extra["test"][features_names_selected],
features_extra["test"][target_feature],
param_name, param_range,
title="Learning Curve", ylim=None, cv=None, lw=2, n_jobs=-1)
display(fig)
In [ ]:
# save
fig.savefig(os.path.join(CONSTANTS.io_path, file_name + "_validation_curve" + ".pdf"),
            dpi=300, facecolor='w', edgecolor='w', orientation='portrait', format="pdf",
            transparent=False, bbox_inches=None, pad_inches=0.1)
Fin!