In [31]:
#To re-run from scratch: clear all variables from the interactive namespace
%reset -f
In [43]:
from sklearn import preprocessing
import numpy as np
import csv
from sklearn import metrics
from sklearn.preprocessing import scale
from sklearn.feature_selection import VarianceThreshold
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.grid_search import GridSearchCV, ParameterGrid
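# Note: sklearn.cross_validation and sklearn.grid_search are the pre-0.18
# module paths; they were removed in scikit-learn 0.20 in favour of
# sklearn.model_selection.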
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline
from sklearn.metrics import pairwise_distances
from IPython.display import display, HTML
from operator import truediv
import pandas as pd
import time
import os
from pylab import *
import seaborn as sns
import matplotlib.pyplot as plt
np.set_printoptions(suppress=True)
pd.options.display.float_format = '{:,.3f}'.format
plt.style.use('classic')
%matplotlib inline
import sys
sys.path.insert(1, "../src/")
from TypeFeatImputer import TypeFeatImputer
In [113]:
import sys
sys.path.insert(1, "../src/")
from TypeFeatImputer import TypeFeatImputer
from UnivCombineFilter import UnivCombineFilter
import MLpipeline as MLpipeline
import readmision_methods as rm
In [44]:
typeEncounter = "last" # ['first','last']
typeHypothesis = "early_readmission_vs_none" # ['all_readmission_vs_none','early_readmission_vs_none']
typeDataFeatures = "extended_extra_diag_3" # ['reduced','extended','extended_extra','extended_extra_diag_1','extended_extra_diag_3']
#Extended_Extra_diag_3 -> Extended extra columns with disease in diag_3
#Extended_Extra_diag_1 -> Extended extra columns with disease in diag_1
#Extended_Extra -> Extended columns plus extra columns
#Extended -> Extended columns
#Reduced -> Minimum set of columns
typeDataExperiment = "all" #["all", "disease"]
#all -> No filtering on the disease column
#disease -> Remove the disease column and keep only rows with diagnosis column == 1
In [114]:
#Load data
df_all = rm.load_data(typeEncounter, typeDataFeatures)
print "\nSHAPE:"
print df_all.shape
#Filter data by class
df_all = rm.filter_data_by_class(df_all, typeHypothesis)
print "\nSHAPE FILTERED:"
print df_all.shape
print "\nRows by class type:"
print df_all.iloc[:,-1].sort_values().unique(), np.sum(df_all["readmitted"] == 0), np.sum(df_all["readmitted"] == 1)
#Train & Test
X_train, X_test, y_train, y_test = MLpipeline.train_test_partition(df_all)
print "Train:", X_train.shape, "Test:", X_test.shape
In [115]:
dfNull = np.sum(df_all.isnull()) / float(df_all.shape[0])
dfNullAll = pd.concat([dfNull[dfNull>0],np.sum(df_all.isnull()),np.sum(df_all.isnull()==False)], axis=1)
dfNullAll.columns= ["%_Null","#_Null","#_No_Null"]
display(dfNullAll[dfNullAll.iloc[:,0].isnull() == False][["%_Null","#_Null","#_No_Null"]])
print dfNullAll[dfNullAll.iloc[:,0].isnull() == False].shape
plt.figure(figsize=(15,4))
plt.bar(range(dfNull.shape[0]),dfNull.iloc[:], alpha=0.5)
plt.xticks(range(dfNull.shape[0]),dfNull.index,rotation=90)
plt.ylabel("% nulls")
plt.tight_layout()
plt.show()
Out[115]: [bar plot: fraction of null values per feature]
In [123]:
#Get features by type
catCols, reducedCols = rm.compute_type_features(df_all)
numCols = reducedCols[catCols == 0].values.tolist()
print len(numCols), numCols
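The null columns found above are handled during modelling by the project's TypeFeatImputer; purely as an illustration, here is a generic mean-imputation sketch for the numeric columns using sklearn's pre-0.20 Imputer (not the project's method).
In [ ]:
# Illustration only: generic mean imputation of the numeric columns with
# sklearn's (pre-0.20) Imputer; the project itself uses TypeFeatImputer.
from sklearn.preprocessing import Imputer

imp = Imputer(strategy="mean", axis=0)
df_imp = df_all.copy()
df_imp[numCols] = imp.fit_transform(df_imp[numCols])
print np.sum(df_imp[numCols].isnull().values), "nulls remaining in numeric columns"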
In [124]:
df_all["add_home_no_dmed"] = -1
df_all["add_home_no_dmed"] = np.logical_and((df_all['diss_home']==1), (df_all["diabetesMed"] == 0))
print df_all["add_home_no_dmed"].value_counts()
df_all["prod_lab_med"] = -1
df_all["prod_lab_med"] = df_all["num_lab_procedures"] + df_all["num_medications"]
numCols.extend(["prod_lab_med"])
print len(numCols)
In [125]:
#Compute type fields (1 = categorical, 0 = numerical)
#Caveat: the two engineered columns were appended after "readmitted", so
#"readmitted" is no longer the last column and is itself flagged as categorical here.
catCols = []
cols = df_all.columns
reducedCols = df_all.columns[:-1]
for i in range(len(cols)):
    if cols[i] not in numCols:
        catCols.append(1)
    else:
        catCols.append(0)
catCols = np.array(catCols)
print "Cat cols:", np.sum(catCols == 1)
print "Num cols:", np.sum(catCols == 0)
print len(reducedCols)
print len(cols)
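The same mask can be built without the explicit loop; a small equivalent sketch:
In [ ]:
# Equivalent vectorized construction of the categorical mask
catCols = (~df_all.columns.isin(numCols)).astype(int)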
In [135]:
total = 0
minFreq = df_all.shape[0] * 0.005
todel = []
print "MAX=", df_all.shape[0], "MIN:", minFreq
#Flag categorical features whose minority values (everything except the
#most frequent value) cover fewer than 0.5% of the rows
for c in df_all.columns[catCols == 1]:
    counts = pd.value_counts(df_all[c])
    numag = counts.idxmax()
    ix = [i for i in counts.index if i != numag]
    num = np.sum(counts.loc[ix].values)
    if num < minFreq:
        total += 1
        todel.append(c)
        print
        print len(df_all[c].unique()), numag, df_all[c].unique().tolist()
        print counts.sort_values()
print "total low variance features:", total
print todel
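sklearn's VarianceThreshold (already imported above) implements a related filter based on variance rather than minority-value frequency; a minimal sketch, not an exact replacement for the loop above:
In [ ]:
# Related sklearn filter: VarianceThreshold drops features whose variance
# falls below a threshold. For a binary feature, variance = p*(1-p), so a
# 0.5% minority frequency roughly corresponds to 0.005*(1-0.005).
selector = VarianceThreshold(threshold=0.005 * (1 - 0.005))
Xcat = df_all[df_all.columns[catCols == 1]].fillna(0).values  #nulls filled only so the selector can run
Xcat_red = selector.fit_transform(Xcat)
print Xcat.shape, "->", Xcat_red.shape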
Categorical
In [136]:
from sklearn.feature_selection import chi2

for cond in ["non_all", "non_early"]:
    if cond == "non_all":
        dfFilteredAux = df_all.copy()
    else:
        dfFilteredAux = df_all[df_all["readmitted"] <= 1].copy()
    classVal = dfFilteredAux["readmitted"] > 0

    catData = []
    for rv in dfFilteredAux.columns[catCols == 1]:
        cleanIc = dfFilteredAux[rv]
        #Null policy (drop)
        ix = cleanIc.dropna().index
        #Chi2 score of the feature against the binary class
        r, p = chi2(cleanIc.dropna().values.astype(int).reshape(-1, 1),
                    classVal[ix].values.astype(int))
        #Value distribution (%) within each class
        perc = (pd.value_counts(dfFilteredAux[dfFilteredAux["readmitted"] == 0][rv]) /
                dfFilteredAux[dfFilteredAux["readmitted"] == 0][rv].count()).round(3).values.tolist()
        perc_1 = (pd.value_counts(dfFilteredAux[dfFilteredAux["readmitted"] > 0][rv]) /
                  dfFilteredAux[dfFilteredAux["readmitted"] > 0][rv].count()).round(3).values.tolist()
        catData.append([str(rv),
                        np.sum(cleanIc.isnull() == False),
                        np.sum(cleanIc.isnull() == True),
                        np.sum(cleanIc.isnull()) / float(len(cleanIc)),
                        pd.value_counts(cleanIc).index.astype(int).tolist(),
                        pd.value_counts(cleanIc).values.tolist(),
                        str(pd.value_counts(dfFilteredAux[dfFilteredAux["readmitted"] == 0][rv]).values.tolist()),
                        str(pd.value_counts(dfFilteredAux[dfFilteredAux["readmitted"] > 0][rv]).values.tolist()),
                        str(perc),
                        str(perc_1),
                        abs(perc_1[0] - perc[0]),
                        round(p[0], 4),
                        "yes" if p[0] < 0.05 else "no"])

    dfCatData = pd.DataFrame(catData, columns=["variable","num","#nulls","%nulls","values","frequencies",
                                               "non-readmitted (n)",
                                               "readmitted (n)" if cond == "non_all" else "early_readmitted (n)",
                                               "non-readmitted (%)",
                                               "readmitted (%)" if cond == "non_all" else "early_readmitted (%)",
                                               "diff (%)",
                                               "p-value","sig."])
    print dfCatData[dfCatData["p-value"] < 0.05].shape
    display(HTML(dfCatData[np.logical_and(dfCatData["p-value"] < 0.05,
                                          dfCatData["diff (%)"] >= 0.01)]
                 .sort_values("diff (%)", ascending=False).to_html(index=False)))
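Note that sklearn's chi2 scores the column as non-negative feature values rather than building a feature-vs-class contingency table; a classical chi-square test of independence via scipy is a natural cross-check. A minimal sketch with the same drop-nulls policy (chi2_independence is a hypothetical helper name):
In [ ]:
# Cross-check with a chi-square test of independence on the contingency
# table of feature values vs. class (hypothetical helper).
from scipy.stats import chi2_contingency

def chi2_independence(series, y):
    clean = series.dropna()
    table = pd.crosstab(clean.astype(int), y[clean.index].astype(int))
    stat, p, dof, expected = chi2_contingency(table)
    return stat, p

stat, p = chi2_independence(df_all["diabetesMed"], df_all["readmitted"] > 0)
print "chi2:", stat, "p-value:", p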
Numerical
In [137]:
from scipy import stats

for cond in ["non_all", "non_early"]:
    if cond == "non_all":
        dfFilteredAux = df_all.copy()
    else:
        dfFilteredAux = df_all[df_all["readmitted"] <= 1].copy()
    classVal = dfFilteredAux["readmitted"] > 0

    numData = dfFilteredAux[dfFilteredAux.columns[catCols == 0]].describe().T[["count","mean","std"]].copy()
    numData.insert(0, "variable", numData.index.values)
    numData.insert(1, "num", numData["count"].astype(int))
    numData.insert(2, "#nulls", dfFilteredAux.shape[0] - numData["count"].astype(int))
    numData.insert(3, "%nulls", numData["#nulls"] / float(dfFilteredAux.shape[0]))

    numData_non_comp = dfFilteredAux[dfFilteredAux["readmitted"] == 0][dfFilteredAux.columns[catCols == 0]].describe().T[["mean","std"]]
    numData_comp = dfFilteredAux[dfFilteredAux["readmitted"] > 0][dfFilteredAux.columns[catCols == 0]].describe().T[["mean","std"]]
    numData["mean"] = numData["mean"].round(2).astype(str).str.cat(numData["std"].round(2).astype(str), sep="+/-")
    numData["non-readmitted"] = numData_non_comp["mean"].round(3).astype(str).str.cat(numData_non_comp["std"].round(2).astype(str), sep="+/-")
    numData["readmitted" if cond == "non_all" else "early_readmitted"] = numData_comp["mean"].round(3).astype(str).str.cat(numData_comp["std"].round(2).astype(str), sep="+/-")

    #Difference between min-max scaled group means
    diffRange = numData_non_comp["mean"].max() - numData_non_comp["mean"].min()
    diffRange1 = numData_comp["mean"].max() - numData_comp["mean"].min()
    numData["diff"] = abs(
        ((numData_non_comp["mean"] - numData_non_comp["mean"].min()) / float(diffRange)) -
        ((numData_comp["mean"] - numData_comp["mean"].min()) / float(diffRange1)))

    #Mann-Whitney U test between the two independent class groups
    pvals = []
    for v in numData["variable"].values:
        pvals.append(stats.mannwhitneyu(
            dfFilteredAux[v].loc[classVal == False].values.astype(float),
            dfFilteredAux[v].loc[classVal == True].values.astype(float)
        )[1])
    numData["p-value"] = pvals
    numData["sig"] = numData["p-value"] < 0.05

    numData = numData[["variable","num","#nulls","%nulls",
                       "mean","non-readmitted",
                       "readmitted" if cond == "non_all" else "early_readmitted",
                       "diff","p-value","sig"]]
    display(HTML(numData.sort_values("diff", ascending=False).to_html(index=False)))
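With one Mann-Whitney test per numeric feature, a multiple-comparisons correction is worth considering; a minimal Bonferroni sketch over the p-values just computed (statsmodels' multipletests would be an alternative):
In [ ]:
# Bonferroni-adjusted significance for the family of tests above
# (applies to the last condition computed in the loop, "non_early").
alpha = 0.05
n_tests = len(pvals)
numData["sig_bonferroni"] = numData["p-value"] < (alpha / n_tests)
print np.sum(numData["sig_bonferroni"].values), "of", n_tests, "features significant after correction"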
In [139]:
arrNorm = []
for i, rv in enumerate(df_all.columns[:-1]):
    #D'Agostino-Pearson normality test per column
    cleanIc = df_all.iloc[:, i]
    f_value, p_value = stats.normaltest(cleanIc)
    arrNorm.append([rv, "yes" if catCols[i] == 1 else "no",
                    f_value.round(4), p_value.round(4),
                    "no" if p_value < 0.05 else "yes"])
dfNorm = pd.DataFrame(np.array(arrNorm), columns=["variable","categoric","f_val","p_value","norm"])
print dfNorm.shape
print "Normal features:", np.sum(dfNorm["norm"] == "yes")
print "Normal numerical features:", np.sum(np.logical_and(dfNorm["norm"] == "yes", dfNorm["categoric"] == "no"))
print "Normal categorical features:", np.sum(np.logical_and(dfNorm["norm"] == "yes", dfNorm["categoric"] == "yes"))
print "Non-normal features:", np.sum(dfNorm["norm"] == "no")
print "Non-normal numerical features:", np.sum(np.logical_and(dfNorm["norm"] == "no", dfNorm["categoric"] == "no"))
print "Non-normal categorical features:", np.sum(np.logical_and(dfNorm["norm"] == "no", dfNorm["categoric"] == "yes"))
display(HTML(dfNorm[dfNorm["categoric"] == "no"][["variable","f_val","p_value","norm"]].to_html(index=False)))
In [141]:
from scipy.stats import boxcox  #imported for an optional transform of skewed features (not applied in this cell)

for i, rv in enumerate(df_all.columns[:-1]):
    cleanIc = df_all.iloc[:, i]
    #Inspect the distribution of each numerical (non-categorical) feature
    if dfNorm[dfNorm.variable == rv][["categoric"]].values == "no":
        f_value, p_value = stats.normaltest(cleanIc)
        print rv
        print "Unique values:", len(np.unique(cleanIc))
        print "p-value:", p_value
        print "Normal:", p_value >= 0.05
        print "Mean:", np.mean(cleanIc), "std:", np.std(cleanIc)
        print "Min:", np.min(cleanIc), "Max:", np.max(cleanIc)
        plt.figure(figsize=(8, 4))
        plt.hist(cleanIc, bins=10, alpha=0.5)
        plt.axvline(np.median(cleanIc), c="r", ls="--")
        plt.show()
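boxcox is imported above but never applied; a minimal sketch of how it could be used on one of the skewed counts. Box-Cox requires strictly positive input, hence the +1 shift, and the column name is just an example from this dataset:
In [ ]:
# Hypothetical Box-Cox transform of a right-skewed count feature.
# Box-Cox requires strictly positive values, so shift by +1 first.
x = df_all["num_lab_procedures"].values.astype(float) + 1
x_bc, lam = boxcox(x)
print "lambda:", lam
print "normaltest p-value before:", stats.normaltest(x)[1]
print "normaltest p-value after:", stats.normaltest(x_bc)[1]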
In [195]:
#Categorical: frequency counts and bar plot per categorical feature
for i, rv in enumerate(df_all.columns[:-1]):
    cleanIc = df_all.iloc[:, i]
    if dfNorm[dfNorm.variable == rv][["categoric"]].values == "yes":
        print rv
        print pd.value_counts(cleanIc).shape
        print pd.value_counts(cleanIc)
        pd.value_counts(cleanIc).plot(kind="bar", alpha=0.5)
        plt.show()