In [673]:
# TO RE-RUN: clear the interactive namespace before executing the notebook
# again; -f forces the reset without asking for confirmation.
%reset -f
In [674]:
from sklearn import preprocessing
from time import time
import numpy as np
import csv
from sklearn import metrics
from sklearn.preprocessing import scale
from sklearn.feature_selection import VarianceThreshold
from sklearn.cross_validation import StratifiedShuffleSplit, cross_val_score
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.grid_search import GridSearchCV, ParameterGrid
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE,ADASYN, RandomOverSampler
from imblearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline
from operator import truediv
from sklearn import metrics
import pandas as pd
import time
import os
from pylab import *
import seaborn as sns
import matplotlib.pyplot as plt
np.set_printoptions(suppress=True)
pd.options.display.float_format = '{:,.2f}'.format
plt.style.use('classic')
%matplotlib inline
import sys
sys.path.insert(1, "../../src/")
from TypeFeatImputer import TypeFeatImputer
from UnivCombineFilter import UnivCombineFilter
In [675]:
# Encounter variant: selects the input pickle (clean_data_<typeEncounter>.pkl)
# and is embedded in the names of the files written further down.
typeEncounter = "last" #["last","first"]
In [676]:
#df_all=pd.read_csv(os.path.join('resources','diabetic_data_processed_withweight.csv'),';')
df_all=pd.read_pickle(os.path.join('resources','clean_data_' + typeEncounter + '.pkl'))
print df_all.shape
print df_all.columns.tolist()
print df_all.readmitted.value_counts()
print df_all.readmitted.value_counts()/float(df_all.shape[0])
In [677]:
# Recode A1Cresult into the new column HbA1c:
#   A1Cresult == 2 -> 3
#   A1Cresult == 1 -> 2
#   A1Cresult == 0 -> 1
#   A1Cresult null -> 0
#   anything else  -> -1 (the initial value is left in place)
# Assignments go through .loc[mask, column] so they always write into df_all
# itself; the chained form df_all[col][mask] = v may write to a temporary copy
# and triggers SettingWithCopyWarning.
df_all["HbA1c"] = -1
df_all.loc[df_all.A1Cresult == 2, "HbA1c"] = 3
df_all.loc[df_all.A1Cresult == 1, "HbA1c"] = 2
df_all.loc[df_all.A1Cresult == 0, "HbA1c"] = 1
df_all.loc[df_all.A1Cresult.isnull(), "HbA1c"] = 0
pd.value_counts(df_all.HbA1c)
Out[677]:
In [678]:
# Not useful (author's note).
# ComplexHbA1c combines the A1Cresult and Change columns:
#   A1Cresult == 2 and Change == 1 -> 2
#   A1Cresult == 2 and Change == 0 -> 1
#   every other row                -> 0
# .loc[mask, column] is used instead of chained indexing so the write is
# guaranteed to land in df_all (no SettingWithCopyWarning).
df_all["ComplexHbA1c"] = 0
a1c_is_2 = df_all.A1Cresult == 2
df_all.loc[a1c_is_2 & (df_all.Change == 1), "ComplexHbA1c"] = 2
df_all.loc[a1c_is_2 & (df_all.Change == 0), "ComplexHbA1c"] = 1
pd.value_counts(df_all.ComplexHbA1c)
Out[678]:
In [679]:
#Not useful
df_all["age_cat"] = -1
df_all["age_cat"][df_all.age.isin([0,1,2])] = 0
df_all["age_cat"][df_all.age.isin([3,4,5,6])] = 1
df_all["age_cat"][df_all.age.isin([7,8,9,10])] = 2
print pd.value_counts(df_all["age_cat"])
In [680]:
#Not useful
df_all["adm_src_ref"] = np.sum(df_all[["adm_src_1","adm_src_2","adm_src_3"]].values,axis=1)>0
df_all["adm_src_em"] = df_all["adm_src_7"].values
print pd.value_counts(df_all["adm_src_ref"])
print pd.value_counts(df_all["adm_src_em"])
In [681]:
# medSpec_cardio is True when the row sum over the cardiology speciality
# columns is positive.
cardio_cols = ['medSpec_Cardiology', 'medSpec_Cardiology-Pediatric']
df_all["medSpec_cardio"] = df_all[cardio_cols].sum(axis=1) > 0
pd.value_counts(df_all["medSpec_cardio"])
Out[681]:
In [682]:
# medSpec_surgery is True when the row sum over the surgical speciality
# columns listed below is positive.
surgery_cols = [
    'medSpec_Surgeon',
    'medSpec_Surgery-Cardiovascular',
    'medSpec_Surgery-Cardiovascular/Thoracic',
    'medSpec_Surgery-Colon&Rectal',
    'medSpec_Surgery-General',
    'medSpec_Surgery-Maxillofacial',
    'medSpec_Surgery-Neuro',
    'medSpec_Surgery-Pediatric',
    'medSpec_Surgery-Plastic',
    'medSpec_Surgery-Thoracic',
    'medSpec_Surgery-Vascular',
    'medSpec_SurgicalSpecialty',
    'medSpec_Surgery-PlasticwithinHeadandNeck',
]
df_all["medSpec_surgery"] = df_all[surgery_cols].sum(axis=1) > 0
pd.value_counts(df_all["medSpec_surgery"])
Out[682]:
In [683]:
df_all["number_treatment"] = -1
df_all["number_treatment"] = np.sum(df_all[[ u'metformin', u'repaglinide', u'nateglinide',
u'chlorpropamide', u'glimepiride', u'acetohexamide', u'glipizide',
u'glyburide', u'tolbutamide', u'pioglitazone', u'rosiglitazone',
u'acarbose', u'miglitol', u'troglitazone', u'tolazamide', u'examide',
u'citoglipton', u'insulin', u'glyburide-metformin',
u'glipizide-metformin', u'glimepiride-pioglitazone',
u'metformin-rosiglitazone', u'metformin-pioglitazone']], axis=1)
print np.unique(df_all["number_treatment"])
print pd.value_counts(df_all["number_treatment"])
print pd.value_counts(df_all["number_treatment"])/df_all.shape[0]
In [684]:
df_all["number_treatment_0"] = -1
df_all["number_treatment_0"] = np.sum(df_all[[ u'metformin', u'repaglinide', u'nateglinide',
u'chlorpropamide', u'glimepiride', u'acetohexamide', u'glipizide',
u'glyburide', u'tolbutamide', u'pioglitazone', u'rosiglitazone',
u'acarbose', u'miglitol', u'troglitazone', u'tolazamide', u'examide',
u'citoglipton', u'insulin', u'glyburide-metformin',
u'glipizide-metformin', u'glimepiride-pioglitazone',
u'metformin-rosiglitazone', u'metformin-pioglitazone']]==0, axis=1)
print np.unique(df_all["number_treatment_0"])
print pd.value_counts(df_all["number_treatment_0"])
print pd.value_counts(df_all["number_treatment_0"])/df_all.shape[0]
df_all["number_treatment_1"] = -1
df_all["number_treatment_1"] = np.sum(df_all[[ u'metformin', u'repaglinide', u'nateglinide',
u'chlorpropamide', u'glimepiride', u'acetohexamide', u'glipizide',
u'glyburide', u'tolbutamide', u'pioglitazone', u'rosiglitazone',
u'acarbose', u'miglitol', u'troglitazone', u'tolazamide', u'examide',
u'citoglipton', u'insulin', u'glyburide-metformin',
u'glipizide-metformin', u'glimepiride-pioglitazone',
u'metformin-rosiglitazone', u'metformin-pioglitazone']]==1, axis=1)
print np.unique(df_all["number_treatment_1"])
print pd.value_counts(df_all["number_treatment_1"])
print pd.value_counts(df_all["number_treatment_1"])/df_all.shape[0]
df_all["number_treatment_2"] = -1
df_all["number_treatment_2"] = np.sum(df_all[[ u'metformin', u'repaglinide', u'nateglinide',
u'chlorpropamide', u'glimepiride', u'acetohexamide', u'glipizide',
u'glyburide', u'tolbutamide', u'pioglitazone', u'rosiglitazone',
u'acarbose', u'miglitol', u'troglitazone', u'tolazamide', u'examide',
u'citoglipton', u'insulin', u'glyburide-metformin',
u'glipizide-metformin', u'glimepiride-pioglitazone',
u'metformin-rosiglitazone', u'metformin-pioglitazone']]==2, axis=1)
print np.unique(df_all["number_treatment_2"])
print pd.value_counts(df_all["number_treatment_2"])
print pd.value_counts(df_all["number_treatment_2"])/df_all.shape[0]
df_all["number_treatment_3"] = -1
df_all["number_treatment_3"] = np.sum(df_all[[ u'metformin', u'repaglinide', u'nateglinide',
u'chlorpropamide', u'glimepiride', u'acetohexamide', u'glipizide',
u'glyburide', u'tolbutamide', u'pioglitazone', u'rosiglitazone',
u'acarbose', u'miglitol', u'troglitazone', u'tolazamide', u'examide',
u'citoglipton', u'insulin', u'glyburide-metformin',
u'glipizide-metformin', u'glimepiride-pioglitazone',
u'metformin-rosiglitazone', u'metformin-pioglitazone']]==3, axis=1)
print np.unique(df_all["number_treatment_3"])
print pd.value_counts(df_all["number_treatment_3"])
print pd.value_counts(df_all["number_treatment_3"])/df_all.shape[0]
In [685]:
print df_all.shape
df_all["diss_home"] = -1
df_all["diss_home"] = np.sum(df_all[['diss_1','diss_13']], axis=1)
print pd.value_counts(df_all["diss_home"])
In [686]:
#Extra columns
print df_all.shape
df_all["add_in_out"] = -1
df_all["add_in_out"] = np.sum(df_all[['number_inpatient','number_outpatient']], axis=1)
df_all["add_procs_meds"] = -1
df_all["add_procs_meds"] = np.sum(df_all[['num_lab_procedures','num_procedures', 'num_medications']], axis=1).astype(float)
df_all["div_visits_time"] = -1
df_all["div_visits_time"] = np.sum(df_all[['number_inpatient','number_outpatient']], axis=1)/df_all["time_in_hospital"].astype(float)
df_all["div_em_time"] = -1
df_all["div_em_time"] = np.sum(df_all[['number_emergency']], axis=1)/df_all["time_in_hospital"].astype(float)
df_all["div_visit_med"] = -1
df_all["div_visit_med"] = np.sum(df_all[['number_inpatient','number_outpatient']], axis=1)/df_all["num_medications"].astype(float)
df_all["div_em_med"] = -1
df_all["div_em_med"] = np.sum(df_all[['number_emergency']], axis=1)/df_all["num_medications"].astype(float)
df_all["sum_ch_med"] = -1
df_all["sum_ch_med"] = np.sum(df_all[['diabetesMed','Change']], axis=1)
df_all["kk"] = 1
print pd.pivot_table(df_all, values="kk", index ="readmitted" ,columns="sum_ch_med", aggfunc=np.sum)
print df_all[df_all["readmitted"]>0][['readmitted','sum_ch_med','time_in_hospital','number_emergency',
'num_lab_procedures','num_medications',
"add_procs_meds","div_visits_time","div_em_time","div_visit_med","div_em_med"]].head(10)
In [687]:
# Three nested feature sets, each a separate list that ends with the target
# column "readmitted":
#   colsFinal              - reduced set
#   colsFinalExtended      - reduced set + individual medication columns
#   colsFinalExtendedExtra - extended set + engineered columns
colsFinal = [
    'gender', 'age', "race_AfricanAmerican", "race_Caucasian", "race_Other",
    'HbA1c',
    "Change",
    'time_in_hospital',
    'diabetesMed',
    "diss_home",
    "medSpec_cardio", "medSpec_Family/GeneralPractice", "medSpec_InternalMedicine", "medSpec_surgery",
    'adm_src_1', 'adm_src_2', 'adm_src_3', 'adm_src_4', 'adm_src_5', 'adm_src_6', 'adm_src_7', 'adm_src_8',
    'adm_src_10', 'adm_src_11', 'adm_src_13', 'adm_src_14', 'adm_src_22', 'adm_src_25',
    u'adm_1', u'adm_2', u'adm_3', u'adm_4', u'adm_7',
    "number_treatment",
    'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient',
    'number_emergency', 'number_inpatient', 'number_diagnoses',
]

# List concatenation builds a new list each time, so the three sets never
# share storage.
colsFinalExtended = colsFinal + [
    "insulin", "metformin", "pioglitazone", "glimepiride", "glipizide", "repaglinide", "nateglinide",
]
colsFinalExtendedExtra = colsFinalExtended + [
    "ComplexHbA1c",
    "add_in_out", "add_procs_meds", "div_visits_time", "div_em_time", "div_visit_med", "div_em_med",
    "sum_ch_med",
    "number_treatment_0", "number_treatment_1", "number_treatment_2", "number_treatment_3",
]

# The target goes last in every set.
colsFinal.append("readmitted")
colsFinalExtended.append("readmitted")
colsFinalExtendedExtra.append("readmitted")

# Sizes of the three sets, space separated on one line (single-argument print
# form: same output as the comma-separated print statement).
print(" ".join(str(len(cols)) for cols in (colsFinal, colsFinalExtended, colsFinalExtendedExtra)))
In [695]:
typeCols = ["_reduced","_extended", "_extended_extra"]
for typeC in typeCols:
if typeC == "_reduced":
colsDiag = colsFinal
if typeC == "_extended":
colsDiag = colsFinalExtended
if typeC == "_extended_extra":
colsDiag = colsFinalExtendedExtra
print len(colsDiag)
print colsDiag
print typeEncounter, df_all[colsDiag].shape
df_all[colsDiag].to_pickle(os.path.join('resources','prepared_clean_data_' + typeEncounter + typeC + '.pkl'))
print "file:", os.path.join('resources','prepared_clean_data_' + typeEncounter + typeC + '.pkl')
In [699]:
# Diagnosis variant: decides which *_1 or *_3 diagnosis columns are appended to
# the feature sets below and is embedded in the output file names.
typeDiagnosis = "diag_3" #["diag_1", "diag_3"]
In [700]:
typeCols = ["_extended", "_extended_extra"]
for typeC in typeCols:
if typeC == "_reduced":
colsDiag = colsFinal[:-1]
if typeC == "_extended":
colsDiag = colsFinalExtended[:-1]
if typeC == "_extended_extra":
colsDiag = colsFinalExtendedExtra[:-1]
if typeDiagnosis == "diag_1":
auxCols = [
"Diabetis_1","Circulatory_1",'Digestive_1','Genitourinary_1','Poisoning_1','Muscoskeletal_1',
'Neoplasms_1','Respiratory_1'
]
if typeDiagnosis == "diag_3":
auxCols = [
"Diabetis_3","Circulatory_3",'Digestive_3','Genitourinary_3','Poisoning_3','Muscoskeletal_3',
'Neoplasms_3','Respiratory_3'
]
colsDiag.extend(auxCols)
colsDiag.extend(["readmitted"])
print len(colsDiag)
print colsDiag
print typeEncounter, df_all[colsDiag].shape
df_all[colsDiag].to_pickle(os.path.join('resources','prepared_clean_data_' + typeEncounter + typeC + "_" + typeDiagnosis + '.pkl'))
print "file:", os.path.join('resources','prepared_clean_data_' + typeEncounter + typeC + "_" + typeDiagnosis + '.pkl')
In [659]:
def train_test_partition(dd, ts_thr=0.30, random_state=32):
    """Split a frame into one stratified train/test partition.

    Parameters
    ----------
    dd : pandas.DataFrame
        Must contain a ``readmitted`` column, which is used as the target;
        every other column is used as a feature.
    ts_thr : float, optional
        Passed to StratifiedShuffleSplit as ``test_size`` (default 0.30).
    random_state : int, optional
        Seed passed to StratifiedShuffleSplit (default 32, the value that was
        previously hard-coded).

    Returns
    -------
    X_train, X_test, y_train, y_test : numpy arrays
        Feature and target arrays for the train and test parts.
    """
    y = dd["readmitted"].values
    # Select the features by name rather than with iloc[:, :-1]: identical
    # when the target is the last column, and still correct when it is not.
    X = dd.drop("readmitted", axis=1).values

    # Legacy sklearn.cross_validation signature: labels first, then the number
    # of re-shuffling iterations. Only one split is requested, so the first
    # (and only) pair of index arrays is taken.
    sss = StratifiedShuffleSplit(y, 1, test_size=ts_thr, random_state=random_state)
    train_index, test_index = next(iter(sss))

    return X[train_index], X[test_index], y[train_index], y[test_index]
In [660]:
import cPickle as pickle
path = './resources/partition_prepared_clean_data_' + typeEncounter + "_" + typeDiagnosis
for cc in ["colsFinal", "colsFinalExtended", "colsFinalExtendedExtra","diag_1"]:
if cc == "colsFinal":
X_train, X_test, y_train, y_test = train_test_partition(df_all[colsFinal])
print cc, len(colsFinal), X_train.shape, X_test.shape, y_train.shape, y_test.shape
if cc == "colsFinalExtended":
X_train, X_test, y_train, y_test = train_test_partition(df_all[colsFinalExtended])
print cc, len(colsFinalExtended), X_train.shape, X_test.shape, y_train.shape, y_test.shape
if cc == "colsFinalExtendedExtra":
X_train, X_test, y_train, y_test = train_test_partition(df_all[colsFinalExtendedExtra])
print cc, len(colsFinalExtendedExtra), X_train.shape, X_test.shape, y_train.shape, y_test.shape
if cc == "diag_1":
X_train, X_test, y_train, y_test = train_test_partition(df_all[colsDiag])
print cc, len(colsDiag), X_train.shape, X_test.shape, y_train.shape, y_test.shape
f = open(path + "_" + cc + ".npy", "wb")
pickle.dump(X_train, f)
pickle.dump(X_test, f)
pickle.dump(y_train, f)
pickle.dump(y_test, f)
f.close()
In [661]:
for cc in ["colsFinal", "colsFinalExtended", "colsFinalExtendedExtra","diag_1"]:
f1 = open(path + "_" + cc + ".npy", "rb")
print path + "_" + cc + ".npy"
X_train = pickle.load(f1)
X_test = pickle.load(f1)
y_train = pickle.load(f1)
y_test = pickle.load(f1)
f1.close()
print cc, X_train.shape, X_test.shape, y_train.shape, y_test.shape
In [ ]: