In [646]:
#TO RE-RUN
%reset -f
In [647]:
from sklearn import preprocessing
from time import time
import numpy as np
import csv
from sklearn import metrics
from sklearn.preprocessing import scale
from sklearn.feature_selection import VarianceThreshold
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.grid_search import GridSearchCV, ParameterGrid
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline
from sklearn.metrics import pairwise_distances
from IPython.display import display, HTML
from operator import truediv
import pandas as pd
import time
import os
from pylab import *
import seaborn as sns
import matplotlib.pyplot as plt
np.set_printoptions(suppress=True)
pd.options.display.float_format = '{:,.3f}'.format
plt.style.use('classic')
%matplotlib inline
import sys
sys.path.insert(1, "../../src/")
from TypeFeatImputer import TypeFeatImputer
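Note that sklearn.cross_validation and sklearn.grid_search were deprecated and later removed (scikit-learn 0.20); if this notebook is re-run on a newer scikit-learn, the same classes live in sklearn.model_selection. A minimal compatibility sketch, assuming only that one of the two module layouts is available (the newer StratifiedShuffleSplit has a slightly different split API):
# Hedged compatibility shim (not part of the original run environment).
try:
    from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, ParameterGrid
except ImportError:
    from sklearn.cross_validation import StratifiedShuffleSplit
    from sklearn.grid_search import GridSearchCV, ParameterGrid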
In [648]:
#df_all=pd.read_csv(os.path.join('resources','diabetic_data_processed_withweight.csv'),';')
#df_all=pd.read_csv(os.path.join('resources','diabetic_data_processed_2017_09_28.csv'),';')
df_all=pd.read_csv(os.path.join('resources','diabetic_data_extended_2017_09_29_14_xrp.csv'),',')
In [649]:
print df_all.shape
print df_all.columns
print len(df_all.patient_nbr.unique())
print df_all.readmitted.value_counts()
print df_all.race.unique()
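Since the next cells collapse multiple encounters per patient into a single one, it can help to first look at how many encounters each patient has. A small sketch using only the already-loaded df_all:
# Distribution of the number of encounters per patient.
print df_all.groupby('patient_nbr').size().describe()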
In [650]:
typeEncounter = "last" #["first","last"]
In [651]:
if typeEncounter == "last":
    df=df_all.groupby(['patient_nbr'], sort=True).last()
else:
    df=df_all.groupby(['patient_nbr'], sort=True).first()
df.readmitted.value_counts()
print len(df.encounter_id.unique())
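A quick sanity check that the groupby really left one encounter per patient (a sketch; it only relies on patient_nbr being the groupby key used above, which becomes the index of df):
# After the groupby, each row of df should correspond to exactly one patient.
assert df.shape[0] == df_all.patient_nbr.nunique()
print df.shape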
In [652]:
pd.value_counts(df.Diabetis_1)
Out[652]:
In [653]:
dfNull = np.sum(df.isnull()) / float(df.shape[0])
dfNullAll = pd.concat([dfNull[dfNull>0],np.sum(df.isnull()),np.sum(df.isnull()==False)], axis=1)
dfNullAll.columns= ["%_Null","#_Null","#_No_Null"]
display(dfNullAll[dfNullAll.iloc[:,0].isnull() == False][["%_Null","#_Null","#_No_Null"]])
print dfNullAll[dfNullAll.iloc[:,0].isnull() == False].shape
plt.figure(figsize=(15,4))
plt.bar(range(dfNull.shape[0]),dfNull.iloc[:], alpha=0.5)
plt.xticks(range(dfNull.shape[0]),dfNull.index,rotation=90)
plt.ylabel("% nulls")
plt.tight_layout()
plt.show()
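The next cells hand-pick the high-null columns to drop; an alternative sketch that derives the candidate list from the null fractions computed above (the 0.4 cutoff is an assumption, not a value used elsewhere in this notebook):
# Columns whose null fraction exceeds an (assumed) threshold become drop candidates.
null_threshold = 0.4
candidate_del = dfNull[dfNull > null_threshold].index.tolist()
print candidate_del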
In [654]:
to_del = []
In [655]:
#to_del = ["A1Cresult","max_glu_serum","weight","medical_specialty"]
In [656]:
#Filter_selected cols
cols = [c for c in df.columns if (c not in to_del) ]#and ('ENF' not in c)
dfFiltered = df[cols]
print dfFiltered.shape
print dfFiltered.columns
In [657]:
for c in dfFiltered.columns:
    if dfFiltered[c].dtype == 'object':
        print c, len(dfFiltered[c].unique()),":"
        print pd.value_counts(dfFiltered[c]).index.tolist()
        print
In [658]:
print dfFiltered.shape
# Keep only encounters whose discharge disposition is not in the excluded codes
# (11, 18, 19, 20, 21, 25, 26 - e.g. expired or unknown dispositions)
ix = dfFiltered[np.logical_not(dfFiltered['discharge_disposition_id'].isin([11,18,19,20,21,25,26]))].index
print len(ix)
dfFiltered = dfFiltered.ix[ix,:]
print dfFiltered.shape
In [659]:
df_spec = pd.get_dummies(dfFiltered['medical_specialty'],dummy_na=False, prefix='medSpec')
for c in df_spec.columns:
    print c
    print pd.value_counts(df_spec[c])
df_clean = pd.concat([dfFiltered.iloc[:,:-1], df_spec], axis=1)
dfFiltered = pd.concat([df_clean, dfFiltered.iloc[:,-1]], axis=1)
print dfFiltered.shape
print dfFiltered.columns
In [660]:
dfFiltered["weight"][dfFiltered["weight"] == "[0-25)"] = 0
dfFiltered["weight"][dfFiltered["weight"] == "[25-50)"] = 1
dfFiltered["weight"][dfFiltered["weight"] == "[50-75)"] = 2
dfFiltered["weight"][dfFiltered["weight"] == "[75-100)"] = 3
dfFiltered["weight"][dfFiltered["weight"] == "[100-125)"] = 4
dfFiltered["weight"][dfFiltered["weight"] == "[125-150)"] = 5
dfFiltered["weight"][dfFiltered["weight"] == "[150-175)"] = 6
dfFiltered["weight"][dfFiltered["weight"] == "[175-200)"] = 7
dfFiltered["weight"][dfFiltered["weight"] == ">200"] = 8
print pd.value_counts(dfFiltered.weight)
print dfFiltered.shape
print dfFiltered.columns
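The mapping above leaves any unmapped entries (for example the '?' placeholder this dataset uses for missing values) as strings. If the downstream imputer expects numeric-or-NaN instead, a hedged conversion sketch (an assumption, not something the original notebook does):
# Coerce leftover non-numeric weight entries to NaN; mapped codes stay numeric.
dfFiltered["weight"] = pd.to_numeric(dfFiltered["weight"], errors="coerce")
print pd.value_counts(dfFiltered.weight, dropna=False)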
In [661]:
print dfFiltered['race'].unique()
df_race = pd.get_dummies(dfFiltered['race'],dummy_na=False, prefix='race')
for c in df_race.columns:
    print c
    print pd.value_counts(df_race[c])
df_clean = pd.concat([dfFiltered.iloc[:,:-1], df_race], axis=1)
dfFiltered = pd.concat([df_clean, dfFiltered.iloc[:,-1]], axis=1)
print dfFiltered.shape
print dfFiltered.columns
In [662]:
df_adm = pd.get_dummies(dfFiltered['admission_type_id'],dummy_na=False, prefix='adm')
for c in df_adm.columns:
    print c
    print pd.value_counts(df_adm[c])
#Remove non_informative columns
selCols = []
for c in df_adm.columns:
    if c not in ["adm_5", "adm_6","adm_8"]:
        selCols.append(c)
print selCols
df_clean = pd.concat([dfFiltered.iloc[:,:-1], df_adm[selCols]], axis=1)
dfFiltered = pd.concat([df_clean, dfFiltered.iloc[:,-1]], axis=1)
print dfFiltered.shape
print dfFiltered.columns
In [663]:
df_adm = pd.get_dummies(dfFiltered['admission_source_id'],dummy_na=False, prefix='adm_src')
#Remove non_informative columns
selCols = []
for c in df_adm.columns:
    if c not in ["adm_src_9", "adm_src_15","adm_src_17","adm_src_20","adm_src_21"]:
        selCols.append(c)
print selCols
df_clean = pd.concat([dfFiltered.iloc[:,:-1], df_adm[selCols]], axis=1)
dfFiltered = pd.concat([df_clean, dfFiltered.iloc[:,-1]], axis=1)
for c in dfFiltered.columns:
    if c in df_adm.columns:
        print c, np.unique(dfFiltered[c])
        print pd.value_counts(dfFiltered[c])
print dfFiltered.shape
print dfFiltered.columns
In [664]:
df_adm = pd.get_dummies(dfFiltered['discharge_disposition_id'],dummy_na=False, prefix='diss')
#Remove non_informative columns
selCols = []
colsToDel = ["diss_26","diss_25","diss_21","diss_20","diss_19","diss_18","diss_11"]
for c in df_adm.columns:
    if c not in colsToDel:
        selCols.append(c)
print selCols
df_clean = pd.concat([dfFiltered.iloc[:,:-1], df_adm[selCols]], axis=1)
dfFiltered = pd.concat([df_clean, dfFiltered.iloc[:,-1]], axis=1)
for c in dfFiltered.columns:
    if c in df_adm.columns:
        print c, np.unique(dfFiltered[c])
        print pd.value_counts(dfFiltered[c])
print dfFiltered.shape
print dfFiltered.columns
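The four cells above repeat the same pattern: one-hot encode a column, drop its non-informative dummies, and splice the result in ahead of the target column. A hedged helper that captures the pattern (the function name and signature are illustrative, not part of the original notebook):
# Hypothetical helper consolidating the repeated get_dummies pattern above.
def append_dummies(frame, col, prefix, drop_dummies=()):
    # One-hot encode `col`, drop the listed non-informative dummies,
    # and re-append the dummies just before the last (target) column.
    dummies = pd.get_dummies(frame[col], dummy_na=False, prefix=prefix)
    keep = [c for c in dummies.columns if c not in drop_dummies]
    head = pd.concat([frame.iloc[:, :-1], dummies[keep]], axis=1)
    return pd.concat([head, frame.iloc[:, -1]], axis=1)

# Example, equivalent to the admission_type_id cell above:
# dfFiltered = append_dummies(dfFiltered, 'admission_type_id', 'adm', ["adm_5", "adm_6", "adm_8"])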
In [665]:
to_del = ["race","admission_type_id","admission_source_id",'discharge_disposition_id','medical_specialty']
#Filter_selected cols
cols = [c for c in dfFiltered.columns if (c not in to_del) ]#and ('ENF' not in c)
dfFiltered = dfFiltered[cols]
print dfFiltered.shape
print dfFiltered.columns
In [666]:
to_del = [u'Unnamed: 0',"id","encounter_id","payer_code"]
#Filter_selected cols
cols = [c for c in dfFiltered.columns if (c not in to_del) ]#and ('ENF' not in c)
dfFiltered = dfFiltered[cols]
print dfFiltered.shape
print dfFiltered.columns
In [667]:
dfFiltered.insert(dfFiltered.shape[1], "External_causes_1",dfFiltered["External causes_1"].values)
dfFiltered = dfFiltered.loc[:,~dfFiltered.columns.isin([u"External causes_1"])]
dfFiltered.insert(dfFiltered.shape[1], "External_causes_3",dfFiltered["External causes_3"].values)
dfFiltered = dfFiltered.loc[:,~dfFiltered.columns.isin([u"External causes_3"])]
print dfFiltered.shape
print dfFiltered.columns
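The insert/drop steps above just replace the space in the two "External causes" column names with an underscore. A one-line alternative sketch using DataFrame.rename (same renaming, though it keeps the columns in their original position rather than moving them to the end):
# Equivalent alternative to the insert/drop above (a no-op once the columns are already renamed).
dfFiltered = dfFiltered.rename(columns={"External causes_1": "External_causes_1",
                                        "External causes_3": "External_causes_3"})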
In [668]:
print dfFiltered.columns
cols = [c for c in dfFiltered.columns if c != "readmitted"]
cols.append("readmitted")
dfFiltered = dfFiltered[cols]
print dfFiltered.columns
print dfFiltered.shape
In [684]:
print typeEncounter
print dfFiltered.shape
print dfFiltered.columns.tolist()
dfFiltered.to_pickle(os.path.join('resources','clean_data_' + typeEncounter + '.pkl'))
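A quick round-trip check of the saved artifact (a sketch; it only uses the path written above):
# Reload the pickle and confirm the shape matches what was written.
df_check = pd.read_pickle(os.path.join('resources', 'clean_data_' + typeEncounter + '.pkl'))
print df_check.shape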
In [ ]: