In [673]:
#TO RE-RUN
%reset -f

In [674]:
from sklearn import preprocessing
from time import time
import numpy as np
import csv
from sklearn import metrics
from sklearn.preprocessing import scale
from sklearn.feature_selection import VarianceThreshold
from sklearn.cross_validation import StratifiedShuffleSplit, cross_val_score

from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB

from sklearn.grid_search import GridSearchCV, ParameterGrid
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import SMOTE,ADASYN, RandomOverSampler
from imblearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline

from operator import truediv
from sklearn import metrics
import pandas as pd
import time
import os

from pylab import *
import seaborn as sns
import matplotlib.pyplot as plt


np.set_printoptions(suppress=True)
pd.options.display.float_format = '{:,.2f}'.format
plt.style.use('classic')

%matplotlib inline

import sys
sys.path.insert(1, "../../src/")
from TypeFeatImputer import TypeFeatImputer
from UnivCombineFilter import UnivCombineFilter

In [675]:
typeEncounter = "last" #["last","first"]

In [676]:
#df_all=pd.read_csv(os.path.join('resources','diabetic_data_processed_withweight.csv'),';')
df_all=pd.read_pickle(os.path.join('resources','clean_data_' + typeEncounter + '.pkl'))
print df_all.shape
print df_all.columns.tolist()
print df_all.readmitted.value_counts()
print df_all.readmitted.value_counts()/float(df_all.shape[0])


(67182, 194)
['gender', 'age', 'weight', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'Change', 'diabetesMed', 'Diabetis_3', 'Infectious and parasitic diseases_3', 'Neoplasms_3', 'Endocrine_3', 'Blood_3', 'Mental_3', 'Nervous_3', 'Organs_3', 'Circulatory_3', 'Respiratory_3', 'Digestive_3', 'Genitourinary_3', 'Pregnancy_3', 'Skin_3', 'Muscoskeletal_3', 'Congenital_3', 'Perinatal_3', 'Ill-defined_3', 'Poisoning_3', 'Diabetis_1', 'Infectious and parasitic diseases_1', 'Neoplasms_1', 'Endocrine_1', 'Blood_1', 'Mental_1', 'Nervous_1', 'Organs_1', 'Circulatory_1', 'Respiratory_1', 'Digestive_1', 'Genitourinary_1', 'Pregnancy_1', 'Skin_1', 'Muscoskeletal_1', 'Congenital_1', 'Perinatal_1', 'Ill-defined_1', 'medSpec_AllergyandImmunology', 'medSpec_Anesthesiology', 'medSpec_Anesthesiology-Pediatric', 'medSpec_Cardiology', 'medSpec_Cardiology-Pediatric', 'medSpec_DCPTEAM', 'medSpec_Dentistry', 'medSpec_Dermatology', 'medSpec_Emergency/Trauma', 'medSpec_Endocrinology', 'medSpec_Endocrinology-Metabolism', 'medSpec_Family/GeneralPractice', 'medSpec_Gastroenterology', 'medSpec_Gynecology', 'medSpec_Hematology', 'medSpec_Hematology/Oncology', 'medSpec_Hospitalist', 'medSpec_InfectiousDiseases', 'medSpec_InternalMedicine', 'medSpec_Nephrology', 'medSpec_Neurology', 'medSpec_Neurophysiology', 'medSpec_Obsterics&Gynecology-GynecologicOnco', 'medSpec_Obstetrics', 'medSpec_ObstetricsandGynecology', 'medSpec_Oncology', 'medSpec_Ophthalmology', 'medSpec_Orthopedics', 'medSpec_Orthopedics-Reconstructive', 'medSpec_Osteopath', 'medSpec_Otolaryngology', 'medSpec_OutreachServices', 'medSpec_Pathology', 'medSpec_Pediatrics', 'medSpec_Pediatrics-CriticalCare', 'medSpec_Pediatrics-EmergencyMedicine', 'medSpec_Pediatrics-Endocrinology', 'medSpec_Pediatrics-Hematology-Oncology', 'medSpec_Pediatrics-Neurology', 'medSpec_Pediatrics-Pulmonology', 'medSpec_Perinatology', 'medSpec_PhysicalMedicineandRehabilitation', 'medSpec_PhysicianNotFound', 'medSpec_Podiatry', 'medSpec_Proctology', 'medSpec_Psychiatry', 'medSpec_Psychiatry-Addictive', 'medSpec_Psychiatry-Child/Adolescent', 'medSpec_Psychology', 'medSpec_Pulmonology', 'medSpec_Radiologist', 'medSpec_Radiology', 'medSpec_Resident', 'medSpec_Rheumatology', 'medSpec_Speech', 'medSpec_SportsMedicine', 'medSpec_Surgeon', 'medSpec_Surgery-Cardiovascular', 'medSpec_Surgery-Cardiovascular/Thoracic', 'medSpec_Surgery-Colon&Rectal', 'medSpec_Surgery-General', 'medSpec_Surgery-Maxillofacial', 'medSpec_Surgery-Neuro', 'medSpec_Surgery-Pediatric', 'medSpec_Surgery-Plastic', 'medSpec_Surgery-PlasticwithinHeadandNeck', 'medSpec_Surgery-Thoracic', 'medSpec_Surgery-Vascular', 'medSpec_SurgicalSpecialty', 'medSpec_Urology', 'race_AfricanAmerican', 'race_Asian', 'race_Caucasian', 'race_Hispanic', 'race_Other', 'adm_1', 'adm_2', 'adm_3', 'adm_4', 'adm_7', 'adm_src_1', 'adm_src_2', 'adm_src_3', 'adm_src_4', 'adm_src_5', 'adm_src_6', 'adm_src_7', 'adm_src_8', 'adm_src_10', 'adm_src_11', 'adm_src_13', 'adm_src_14', 'adm_src_22', 'adm_src_25', 'diss_1', 'diss_2', 'diss_3', 'diss_4', 'diss_5', 'diss_6', 'diss_7', 'diss_8', 'diss_9', 'diss_10', 'diss_12', 'diss_13', 'diss_14', 'diss_15', 'diss_16', 'diss_17', 'diss_22', 'diss_23', 'diss_24', 'diss_27', 'diss_28', 'Poisoning_1', 'External_causes_1', 'External_causes_3', 'readmitted']
0    39785
2    21403
1     5994
Name: readmitted, dtype: int64
0   0.59
2   0.32
1   0.09
Name: readmitted, dtype: float64

Features preparation


In [677]:
df_all["HbA1c"] = -1
df_all["HbA1c"][df_all.A1Cresult == 2] = 3
df_all["HbA1c"][df_all.A1Cresult == 1] = 2
df_all["HbA1c"][df_all.A1Cresult == 0] = 1
df_all["HbA1c"][df_all.A1Cresult.isnull()] = 0

pd.value_counts(df_all.HbA1c)


/home/ilmira/.conda/envs/readmision/lib/python2.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
/home/ilmira/.conda/envs/readmision/lib/python2.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
/home/ilmira/.conda/envs/readmision/lib/python2.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
/home/ilmira/.conda/envs/readmision/lib/python2.7/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
Out[677]:
0    52657
3     6833
1     4397
2     3295
Name: HbA1c, dtype: int64

In [678]:
#Not useful
df_all["ComplexHbA1c"] = 0
df_all["ComplexHbA1c"][np.logical_and(df_all.A1Cresult == 2, df_all.Change == 1)] = 2
df_all["ComplexHbA1c"][np.logical_and(df_all.A1Cresult == 2, df_all.Change == 0)] = 1

pd.value_counts(df_all.ComplexHbA1c)


/home/ilmira/.conda/envs/readmision/lib/python2.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
/home/ilmira/.conda/envs/readmision/lib/python2.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
Out[678]:
0    60349
2     4395
1     2438
Name: ComplexHbA1c, dtype: int64

In [679]:
#Not useful
df_all["age_cat"] = -1
df_all["age_cat"][df_all.age.isin([0,1,2])] = 0
df_all["age_cat"][df_all.age.isin([3,4,5,6])] = 1
df_all["age_cat"][df_all.age.isin([7,8,9,10])] = 2

print pd.value_counts(df_all["age_cat"])


1    35869
2    29571
0     1742
Name: age_cat, dtype: int64
/home/ilmira/.conda/envs/readmision/lib/python2.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
/home/ilmira/.conda/envs/readmision/lib/python2.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
/home/ilmira/.conda/envs/readmision/lib/python2.7/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """

In [680]:
#Not useful
df_all["adm_src_ref"] = np.sum(df_all[["adm_src_1","adm_src_2","adm_src_3"]].values,axis=1)>0
df_all["adm_src_em"] = df_all["adm_src_7"].values

print pd.value_counts(df_all["adm_src_ref"])
print pd.value_counts(df_all["adm_src_em"])


False    45345
True     21837
Name: adm_src_ref, dtype: int64
1    36050
0    31132
Name: adm_src_em, dtype: int64

In [681]:
df_all["medSpec_cardio"] = np.sum(
    df_all[['medSpec_Cardiology', 'medSpec_Cardiology-Pediatric']], axis=1)>0

pd.value_counts(df_all["medSpec_cardio"])


Out[681]:
False    62857
True      4325
Name: medSpec_cardio, dtype: int64

In [682]:
df_all["medSpec_surgery"] = np.sum(
    df_all[['medSpec_Surgeon', 'medSpec_Surgery-Cardiovascular', 'medSpec_Surgery-Cardiovascular/Thoracic', 
            'medSpec_Surgery-Colon&Rectal', 'medSpec_Surgery-General', 'medSpec_Surgery-Maxillofacial', 
            'medSpec_Surgery-Neuro', 'medSpec_Surgery-Pediatric', 'medSpec_Surgery-Plastic', 
            'medSpec_Surgery-Thoracic', 'medSpec_Surgery-Vascular', 
            'medSpec_SurgicalSpecialty', 'medSpec_Surgery-PlasticwithinHeadandNeck']], axis=1)>0

pd.value_counts(df_all["medSpec_surgery"])


Out[682]:
False    63383
True      3799
Name: medSpec_surgery, dtype: int64

In [683]:
df_all["number_treatment"] = -1
df_all["number_treatment"] = np.sum(df_all[[ u'metformin', u'repaglinide', u'nateglinide',
       u'chlorpropamide', u'glimepiride', u'acetohexamide', u'glipizide',
       u'glyburide', u'tolbutamide', u'pioglitazone', u'rosiglitazone',
       u'acarbose', u'miglitol', u'troglitazone', u'tolazamide', u'examide',
       u'citoglipton', u'insulin', u'glyburide-metformin',
       u'glipizide-metformin', u'glimepiride-pioglitazone',
       u'metformin-rosiglitazone', u'metformin-pioglitazone']], axis=1)

print np.unique(df_all["number_treatment"])
print pd.value_counts(df_all["number_treatment"])
print pd.value_counts(df_all["number_treatment"])/df_all.shape[0]


[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13]
2     20971
0     15699
4     10374
3      6594
1      4655
6      3494
5      3122
7      1268
8       678
9       230
10       68
11       21
12        7
13        1
Name: number_treatment, dtype: int64
2    0.31
0    0.23
4    0.15
3    0.10
1    0.07
6    0.05
5    0.05
7    0.02
8    0.01
9    0.00
10   0.00
11   0.00
12   0.00
13   0.00
Name: number_treatment, dtype: float64

In [684]:
df_all["number_treatment_0"] = -1
df_all["number_treatment_0"] = np.sum(df_all[[ u'metformin', u'repaglinide', u'nateglinide',
       u'chlorpropamide', u'glimepiride', u'acetohexamide', u'glipizide',
       u'glyburide', u'tolbutamide', u'pioglitazone', u'rosiglitazone',
       u'acarbose', u'miglitol', u'troglitazone', u'tolazamide', u'examide',
       u'citoglipton', u'insulin', u'glyburide-metformin',
       u'glipizide-metformin', u'glimepiride-pioglitazone',
       u'metformin-rosiglitazone', u'metformin-pioglitazone']]==0, axis=1)
print np.unique(df_all["number_treatment_0"])
print pd.value_counts(df_all["number_treatment_0"])
print pd.value_counts(df_all["number_treatment_0"])/df_all.shape[0]

df_all["number_treatment_1"] = -1
df_all["number_treatment_1"] = np.sum(df_all[[ u'metformin', u'repaglinide', u'nateglinide',
       u'chlorpropamide', u'glimepiride', u'acetohexamide', u'glipizide',
       u'glyburide', u'tolbutamide', u'pioglitazone', u'rosiglitazone',
       u'acarbose', u'miglitol', u'troglitazone', u'tolazamide', u'examide',
       u'citoglipton', u'insulin', u'glyburide-metformin',
       u'glipizide-metformin', u'glimepiride-pioglitazone',
       u'metformin-rosiglitazone', u'metformin-pioglitazone']]==1, axis=1)
print np.unique(df_all["number_treatment_1"])
print pd.value_counts(df_all["number_treatment_1"])
print pd.value_counts(df_all["number_treatment_1"])/df_all.shape[0]

df_all["number_treatment_2"] = -1
df_all["number_treatment_2"] = np.sum(df_all[[ u'metformin', u'repaglinide', u'nateglinide',
       u'chlorpropamide', u'glimepiride', u'acetohexamide', u'glipizide',
       u'glyburide', u'tolbutamide', u'pioglitazone', u'rosiglitazone',
       u'acarbose', u'miglitol', u'troglitazone', u'tolazamide', u'examide',
       u'citoglipton', u'insulin', u'glyburide-metformin',
       u'glipizide-metformin', u'glimepiride-pioglitazone',
       u'metformin-rosiglitazone', u'metformin-pioglitazone']]==2, axis=1)

print np.unique(df_all["number_treatment_2"])
print pd.value_counts(df_all["number_treatment_2"])
print pd.value_counts(df_all["number_treatment_2"])/df_all.shape[0]

df_all["number_treatment_3"] = -1
df_all["number_treatment_3"] = np.sum(df_all[[ u'metformin', u'repaglinide', u'nateglinide',
       u'chlorpropamide', u'glimepiride', u'acetohexamide', u'glipizide',
       u'glyburide', u'tolbutamide', u'pioglitazone', u'rosiglitazone',
       u'acarbose', u'miglitol', u'troglitazone', u'tolazamide', u'examide',
       u'citoglipton', u'insulin', u'glyburide-metformin',
       u'glipizide-metformin', u'glimepiride-pioglitazone',
       u'metformin-rosiglitazone', u'metformin-pioglitazone']]==3, axis=1)

print np.unique(df_all["number_treatment_3"])
print pd.value_counts(df_all["number_treatment_3"])
print pd.value_counts(df_all["number_treatment_3"])/df_all.shape[0]


[17 18 19 20 21 22 23]
22    29987
23    15699
21    14922
20     5536
19      987
18       49
17        2
Name: number_treatment_0, dtype: int64
22   0.45
23   0.23
21   0.22
20   0.08
19   0.01
18   0.00
17   0.00
Name: number_treatment_0, dtype: float64
[0 1 2 3]
0    58722
1     8278
2      179
3        3
Name: number_treatment_1, dtype: int64
0   0.87
1   0.12
2   0.00
3   0.00
Name: number_treatment_1, dtype: float64
[0 1 2 3 4 5 6]
1    25647
0    25251
2    12215
3     3557
4      494
5       17
6        1
Name: number_treatment_2, dtype: int64
1   0.38
0   0.38
2   0.18
3   0.05
4   0.01
5   0.00
6   0.00
Name: number_treatment_2, dtype: float64
[0 1 2 3]
0    58425
1     8348
2      391
3       18
Name: number_treatment_3, dtype: int64
0   0.87
1   0.12
2   0.01
3   0.00
Name: number_treatment_3, dtype: float64

In [685]:
print df_all.shape

df_all["diss_home"] = -1
df_all["diss_home"] = np.sum(df_all[['diss_1','diss_13']], axis=1)

print pd.value_counts(df_all["diss_home"])


(67182, 206)
1    44560
0    22622
Name: diss_home, dtype: int64

In [686]:
#Extra columns
print df_all.shape

df_all["add_in_out"] = -1
df_all["add_in_out"] = np.sum(df_all[['number_inpatient','number_outpatient']], axis=1)

df_all["add_procs_meds"] = -1
df_all["add_procs_meds"] = np.sum(df_all[['num_lab_procedures','num_procedures', 'num_medications']], axis=1).astype(float)

df_all["div_visits_time"] = -1
df_all["div_visits_time"] = np.sum(df_all[['number_inpatient','number_outpatient']], axis=1)/df_all["time_in_hospital"].astype(float)

df_all["div_em_time"] = -1
df_all["div_em_time"] = np.sum(df_all[['number_emergency']], axis=1)/df_all["time_in_hospital"].astype(float)

df_all["div_visit_med"] = -1
df_all["div_visit_med"] = np.sum(df_all[['number_inpatient','number_outpatient']], axis=1)/df_all["num_medications"].astype(float)

df_all["div_em_med"] = -1
df_all["div_em_med"] = np.sum(df_all[['number_emergency']], axis=1)/df_all["num_medications"].astype(float)

df_all["sum_ch_med"] = -1
df_all["sum_ch_med"] = np.sum(df_all[['diabetesMed','Change']], axis=1)

df_all["kk"] = 1
print pd.pivot_table(df_all, values="kk", index ="readmitted" ,columns="sum_ch_med", aggfunc=np.sum)

print df_all[df_all["readmitted"]>0][['readmitted','sum_ch_med','time_in_hospital','number_emergency',
              'num_lab_procedures','num_medications', 
              "add_procs_meds","div_visits_time","div_em_time","div_visit_med","div_em_med"]].head(10)


(67182, 207)
sum_ch_med      0      1      2
readmitted                     
0           10137  12136  17512
1            1181   1941   2872
2            4381   6815  10207
             readmitted  sum_ch_med  time_in_hospital  number_emergency  \
patient_nbr                                                               
135                   1           2                 8                 0   
1152                  2           1                 6                 0   
1314                  2           1                 2                 0   
1629                  2           1                14                 1   
5220                  2           1                 2                 0   
5337                  2           2                 7                 0   
6696                  2           1                 1                 0   
10827                 2           0                 2                 0   
11394                 2           2                 7                 0   
11511                 1           1                 3                 0   

             num_lab_procedures  num_medications  add_procs_meds  \
patient_nbr                                                        
135                          77               33          116.00   
1152                         43               13           58.00   
1314                         50               13           68.00   
1629                         21               15           36.00   
5220                         15               14           29.00   
5337                         27               16           46.00   
6696                         36                8           44.00   
10827                        39               17           56.00   
11394                        62               14           78.00   
11511                        56               10           66.00   

             div_visits_time  div_em_time  div_visit_med  div_em_med  
patient_nbr                                                           
135                     0.00         0.00           0.00        0.00  
1152                    0.17         0.00           0.08        0.00  
1314                    0.00         0.00           0.00        0.00  
1629                    0.36         0.07           0.33        0.07  
5220                    0.00         0.00           0.00        0.00  
5337                    0.00         0.00           0.00        0.00  
6696                    0.00         0.00           0.00        0.00  
10827                   0.00         0.00           0.00        0.00  
11394                   0.00         0.00           0.00        0.00  
11511                   0.00         0.00           0.00        0.00  

In [687]:
colsFinal = ['gender', 'age',"race_AfricanAmerican","race_Caucasian","race_Other", 
             'HbA1c',
             "Change",
             'time_in_hospital',
             'diabetesMed',
             "diss_home",             
             "medSpec_cardio","medSpec_Family/GeneralPractice","medSpec_InternalMedicine","medSpec_surgery",             
             'adm_src_1', 'adm_src_2', 'adm_src_3', 'adm_src_4', 'adm_src_5', 'adm_src_6', 'adm_src_7', 'adm_src_8', 
             'adm_src_10', 'adm_src_11', 'adm_src_13', 'adm_src_14', 'adm_src_22', 'adm_src_25',
              u'adm_1', u'adm_2', u'adm_3', u'adm_4', u'adm_7', 
             "number_treatment",
             'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 
             'number_emergency', 'number_inpatient', 'number_diagnoses']

colsFinalExtended = colsFinal[:]
colsFinalExtended.extend([
    "insulin","metformin","pioglitazone","glimepiride","glipizide","repaglinide","nateglinide"])
colsFinalExtendedExtra = colsFinalExtended[:]
colsFinalExtendedExtra.extend([
    "ComplexHbA1c",
    "add_in_out","add_procs_meds", "div_visits_time", "div_em_time", "div_visit_med","div_em_med", 
    "sum_ch_med",
    "number_treatment_0","number_treatment_1","number_treatment_2","number_treatment_3"])

colsFinal.extend(["readmitted"])
colsFinalExtended.extend(["readmitted"])
colsFinalExtendedExtra.extend(["readmitted"])

print len(colsFinal), len(colsFinalExtended), len(colsFinalExtendedExtra)


42 49 61

Save


In [695]:
typeCols = ["_reduced","_extended", "_extended_extra"]

for typeC in typeCols:

    if typeC == "_reduced":
        colsDiag = colsFinal
    if typeC == "_extended":
        colsDiag = colsFinalExtended
    if typeC == "_extended_extra":        
        colsDiag = colsFinalExtendedExtra
        
    print len(colsDiag)
    print colsDiag

    print typeEncounter, df_all[colsDiag].shape
    df_all[colsDiag].to_pickle(os.path.join('resources','prepared_clean_data_' + typeEncounter + typeC + '.pkl'))
    print "file:", os.path.join('resources','prepared_clean_data_' + typeEncounter + typeC +  '.pkl')


42
['gender', 'age', 'race_AfricanAmerican', 'race_Caucasian', 'race_Other', 'HbA1c', 'Change', 'time_in_hospital', 'diabetesMed', 'diss_home', 'medSpec_cardio', 'medSpec_Family/GeneralPractice', 'medSpec_InternalMedicine', 'medSpec_surgery', 'adm_src_1', 'adm_src_2', 'adm_src_3', 'adm_src_4', 'adm_src_5', 'adm_src_6', 'adm_src_7', 'adm_src_8', 'adm_src_10', 'adm_src_11', 'adm_src_13', 'adm_src_14', 'adm_src_22', 'adm_src_25', u'adm_1', u'adm_2', u'adm_3', u'adm_4', u'adm_7', 'number_treatment', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses', 'readmitted']
last (67182, 42)
file: resources/prepared_clean_data_last_reduced.pkl
49
['gender', 'age', 'race_AfricanAmerican', 'race_Caucasian', 'race_Other', 'HbA1c', 'Change', 'time_in_hospital', 'diabetesMed', 'diss_home', 'medSpec_cardio', 'medSpec_Family/GeneralPractice', 'medSpec_InternalMedicine', 'medSpec_surgery', 'adm_src_1', 'adm_src_2', 'adm_src_3', 'adm_src_4', 'adm_src_5', 'adm_src_6', 'adm_src_7', 'adm_src_8', 'adm_src_10', 'adm_src_11', 'adm_src_13', 'adm_src_14', 'adm_src_22', 'adm_src_25', u'adm_1', u'adm_2', u'adm_3', u'adm_4', u'adm_7', 'number_treatment', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses', 'insulin', 'metformin', 'pioglitazone', 'glimepiride', 'glipizide', 'repaglinide', 'nateglinide', 'readmitted']
last (67182, 49)
file: resources/prepared_clean_data_last_extended.pkl
61
['gender', 'age', 'race_AfricanAmerican', 'race_Caucasian', 'race_Other', 'HbA1c', 'Change', 'time_in_hospital', 'diabetesMed', 'diss_home', 'medSpec_cardio', 'medSpec_Family/GeneralPractice', 'medSpec_InternalMedicine', 'medSpec_surgery', 'adm_src_1', 'adm_src_2', 'adm_src_3', 'adm_src_4', 'adm_src_5', 'adm_src_6', 'adm_src_7', 'adm_src_8', 'adm_src_10', 'adm_src_11', 'adm_src_13', 'adm_src_14', 'adm_src_22', 'adm_src_25', u'adm_1', u'adm_2', u'adm_3', u'adm_4', u'adm_7', 'number_treatment', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses', 'insulin', 'metformin', 'pioglitazone', 'glimepiride', 'glipizide', 'repaglinide', 'nateglinide', 'ComplexHbA1c', 'add_in_out', 'add_procs_meds', 'div_visits_time', 'div_em_time', 'div_visit_med', 'div_em_med', 'sum_ch_med', 'number_treatment_0', 'number_treatment_1', 'number_treatment_2', 'number_treatment_3', 'readmitted']
last (67182, 61)
file: resources/prepared_clean_data_last_extended_extra.pkl

Creating other cols


In [699]:
typeDiagnosis = "diag_3" #["diag_1", "diag_3"]

In [700]:
typeCols = ["_extended", "_extended_extra"]

for typeC in typeCols:
    
    if typeC == "_reduced":
        colsDiag = colsFinal[:-1]
    if typeC == "_extended":
        colsDiag = colsFinalExtended[:-1]
    if typeC == "_extended_extra":        
        colsDiag = colsFinalExtendedExtra[:-1]    
        
    if typeDiagnosis == "diag_1":
        auxCols = [
                "Diabetis_1","Circulatory_1",'Digestive_1','Genitourinary_1','Poisoning_1','Muscoskeletal_1',
                'Neoplasms_1','Respiratory_1'      
               ]

    if typeDiagnosis == "diag_3":    
        auxCols = [
                "Diabetis_3","Circulatory_3",'Digestive_3','Genitourinary_3','Poisoning_3','Muscoskeletal_3',
                'Neoplasms_3','Respiratory_3'       
               ]

    colsDiag.extend(auxCols)
    colsDiag.extend(["readmitted"])

    print len(colsDiag)
    print colsDiag

    print typeEncounter, df_all[colsDiag].shape
    df_all[colsDiag].to_pickle(os.path.join('resources','prepared_clean_data_' + typeEncounter + typeC + "_" +  typeDiagnosis  + '.pkl'))
    print "file:", os.path.join('resources','prepared_clean_data_' + typeEncounter  + typeC + "_" +  typeDiagnosis +  '.pkl')


57
['gender', 'age', 'race_AfricanAmerican', 'race_Caucasian', 'race_Other', 'HbA1c', 'Change', 'time_in_hospital', 'diabetesMed', 'diss_home', 'medSpec_cardio', 'medSpec_Family/GeneralPractice', 'medSpec_InternalMedicine', 'medSpec_surgery', 'adm_src_1', 'adm_src_2', 'adm_src_3', 'adm_src_4', 'adm_src_5', 'adm_src_6', 'adm_src_7', 'adm_src_8', 'adm_src_10', 'adm_src_11', 'adm_src_13', 'adm_src_14', 'adm_src_22', 'adm_src_25', u'adm_1', u'adm_2', u'adm_3', u'adm_4', u'adm_7', 'number_treatment', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses', 'insulin', 'metformin', 'pioglitazone', 'glimepiride', 'glipizide', 'repaglinide', 'nateglinide', 'Diabetis_3', 'Circulatory_3', 'Digestive_3', 'Genitourinary_3', 'Poisoning_3', 'Muscoskeletal_3', 'Neoplasms_3', 'Respiratory_3', 'readmitted']
last (67182, 57)
file: resources/prepared_clean_data_last_extended_diag_3.pkl
69
['gender', 'age', 'race_AfricanAmerican', 'race_Caucasian', 'race_Other', 'HbA1c', 'Change', 'time_in_hospital', 'diabetesMed', 'diss_home', 'medSpec_cardio', 'medSpec_Family/GeneralPractice', 'medSpec_InternalMedicine', 'medSpec_surgery', 'adm_src_1', 'adm_src_2', 'adm_src_3', 'adm_src_4', 'adm_src_5', 'adm_src_6', 'adm_src_7', 'adm_src_8', 'adm_src_10', 'adm_src_11', 'adm_src_13', 'adm_src_14', 'adm_src_22', 'adm_src_25', u'adm_1', u'adm_2', u'adm_3', u'adm_4', u'adm_7', 'number_treatment', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses', 'insulin', 'metformin', 'pioglitazone', 'glimepiride', 'glipizide', 'repaglinide', 'nateglinide', 'ComplexHbA1c', 'add_in_out', 'add_procs_meds', 'div_visits_time', 'div_em_time', 'div_visit_med', 'div_em_med', 'sum_ch_med', 'number_treatment_0', 'number_treatment_1', 'number_treatment_2', 'number_treatment_3', 'Diabetis_3', 'Circulatory_3', 'Digestive_3', 'Genitourinary_3', 'Poisoning_3', 'Muscoskeletal_3', 'Neoplasms_3', 'Respiratory_3', 'readmitted']
last (67182, 69)
file: resources/prepared_clean_data_last_extended_extra_diag_3.pkl

Save partitions


In [659]:
def train_test_partition(dd, ts_thr=0.30):
    y = dd.readmitted
    y = y.values
    X = dd.iloc[:,:-1].values
  
    sss = StratifiedShuffleSplit(y, 1, test_size=ts_thr, random_state=32) #random_state=42
    for train_index, test_index in sss:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
    return X_train, X_test, y_train, y_test

In [660]:
import cPickle as pickle

path = './resources/partition_prepared_clean_data_' + typeEncounter + "_" +  typeDiagnosis

for cc in ["colsFinal", "colsFinalExtended", "colsFinalExtendedExtra","diag_1"]:
    if cc == "colsFinal":
        X_train, X_test, y_train, y_test = train_test_partition(df_all[colsFinal])
        print cc, len(colsFinal), X_train.shape, X_test.shape, y_train.shape, y_test.shape
        
    if cc == "colsFinalExtended":
        X_train, X_test, y_train, y_test = train_test_partition(df_all[colsFinalExtended])
        print cc, len(colsFinalExtended), X_train.shape, X_test.shape, y_train.shape, y_test.shape
        
    if cc == "colsFinalExtendedExtra":
        X_train, X_test, y_train, y_test = train_test_partition(df_all[colsFinalExtendedExtra])
        print cc, len(colsFinalExtendedExtra), X_train.shape, X_test.shape, y_train.shape, y_test.shape

    if cc == "diag_1":
        X_train, X_test, y_train, y_test = train_test_partition(df_all[colsDiag])
        print cc, len(colsDiag), X_train.shape, X_test.shape, y_train.shape, y_test.shape
        
    f = open(path + "_" + cc + ".npy", "wb")
    pickle.dump(X_train, f)
    pickle.dump(X_test, f)
    pickle.dump(y_train, f)
    pickle.dump(y_test, f)
    f.close()


colsFinal 42 (47027, 41) (20155, 41) (47027,) (20155,)
colsFinalExtended 49 (47027, 48) (20155, 48) (47027,) (20155,)
colsFinalExtendedExtra 61 (47027, 60) (20155, 60) (47027,) (20155,)
diag_1 69 (47027, 68) (20155, 68) (47027,) (20155,)

In [661]:
for cc in ["colsFinal", "colsFinalExtended", "colsFinalExtendedExtra","diag_1"]:
    f1 = open(path + "_" + cc + ".npy", "rb")
    print path + "_" + cc + ".npy"
    X_train = pickle.load(f1)
    X_test = pickle.load(f1)
    y_train = pickle.load(f1)
    y_test = pickle.load(f1)
    f1.close()
    print cc, X_train.shape, X_test.shape, y_train.shape, y_test.shape


./resources/partition_prepared_clean_data_last_diag_1_colsFinal.npy
colsFinal (47027, 41) (20155, 41) (47027,) (20155,)
./resources/partition_prepared_clean_data_last_diag_1_colsFinalExtended.npy
colsFinalExtended (47027, 48) (20155, 48) (47027,) (20155,)
./resources/partition_prepared_clean_data_last_diag_1_colsFinalExtendedExtra.npy
colsFinalExtendedExtra (47027, 60) (20155, 60) (47027,) (20155,)
./resources/partition_prepared_clean_data_last_diag_1_diag_1.npy
diag_1 (47027, 68) (20155, 68) (47027,) (20155,)

In [ ]: