In [16]:
#TO RE-RUN
%reset -f

In [17]:
# coding: utf-8
# In[110]:
from sklearn import preprocessing
from time import time
from sklearn.feature_extraction import DictVectorizer
import numpy as np
import csv
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_diabetes
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import silhouette_samples, silhouette_score
from operator import truediv
from sklearn.metrics import pairwise_distances
import pandas as pd
import os

%matplotlib inline

In [23]:
df_grouped=pd.read_csv(os.path.join('resources','diabetic_data_processed_2017_09_29.csv'),';')
print df_grouped.shape

range1=np.array(range(1, 139)).astype(str)
range2=np.array(range(140, 239)).astype(str)
range3=np.array(range(240, 279)).astype(str)
range4=np.array(range(280, 289)).astype(str)
range5=np.array(range(290, 319)).astype(str)
range6=np.array(range(320, 359)).astype(str)
range7=np.array(range(360, 389)).astype(str)
range8=np.array(range(390, 459)).astype(str)
range9=np.array(range(460, 519)).astype(str)
range10=np.array(range(520, 579)).astype(str)
range11=np.array(range(580, 629)).astype(str)
range12=np.array(range(630, 679)).astype(str)
range13=np.array(range(680, 709)).astype(str)
range14=np.array(range(710, 739)).astype(str)
range15=np.array(range(740, 759)).astype(str)
range16=np.array(range(760, 779)).astype(str)
range17=np.array(range(780, 799)).astype(str)
range18=np.array(range(800, 999)).astype(str)

ix = pd.value_counts(df_grouped['diag_1']).index
rangeD2 =  ix[ix.str.contains("250")]
print rangeD2


(101766, 50)
Index([u'250.8', u'250.6', u'250.7', u'250.13', u'250.02', u'250.11',
       u'250.12', u'250.82', u'250.1', u'250.4', u'250', u'250.03', u'250.81',
       u'250.22', u'250.2', u'250.83', u'250.41', u'250.42', u'250.01',
       u'250.92', u'250.23', u'250.43', u'250.3', u'250.33', u'250.93',
       u'250.32', u'250.31', u'250.21', u'250.5', u'250.9', u'250.91',
       u'250.53', u'250.52', u'250.51'],
      dtype='object')

In [24]:
def remap_diseases_3_diagnosis(column):
    
    d = []
    col = column
    print column
    
    if col == "Diabetis":
        d.extend(
            df_grouped[np.logical_or(
                np.logical_or(df_grouped["diag_1"].isin(rangeD2), df_grouped["diag_2"].isin(rangeD2)),
                df_grouped["diag_3"].isin(rangeD2))].index)
        
    if col == "External causes":
        d.extend(
            df_grouped[np.logical_or(
                np.logical_or(df_grouped["diag_1"].str.upper().str.contains("E"), df_grouped["diag_2"].str.upper().str.contains("E")),
                df_grouped["diag_3"].str.upper().str.contains("E"))].index)
        
        d.extend(
            df_grouped[np.logical_or(
                np.logical_or(df_grouped["diag_1"].str.upper().str.contains("V"), df_grouped["diag_2"].str.upper().str.contains("V")),
                df_grouped["diag_3"].str.upper().str.contains("V"))].index)        
       
    if col == "Infectious and parasitic diseases":      
        d.extend(
            df_grouped[np.logical_or(
                np.logical_or(df_grouped["diag_1"].isin(range1), df_grouped["diag_2"].isin(range1)),
                df_grouped["diag_3"].isin(range1))].index)

    if col == "Neoplasms":        
        d.extend(
            df_grouped[np.logical_or(
                np.logical_or(df_grouped["diag_1"].isin(range2), df_grouped["diag_2"].isin(range2)),
                df_grouped["diag_3"].isin(range2))].index)        

    if col == "Endocrine":        
        d.extend(
            df_grouped[np.logical_or(
                np.logical_or(df_grouped["diag_1"].isin(range3), df_grouped["diag_2"].isin(range3)),
                df_grouped["diag_3"].isin(range3))].index)        

    if col == "Blood":        
        d.extend(
            df_grouped[np.logical_or(
                np.logical_or(df_grouped["diag_1"].isin(range4), df_grouped["diag_2"].isin(range4)),
                df_grouped["diag_3"].isin(range4))].index)        

    if col == "Mental":        
        d.extend(
            df_grouped[np.logical_or(
                np.logical_or(df_grouped["diag_1"].isin(range5), df_grouped["diag_2"].isin(range5)),
                df_grouped["diag_3"].isin(range5))].index)        

    if col == "Nervous":        
        d.extend(
            df_grouped[np.logical_or(
                np.logical_or(df_grouped["diag_1"].isin(range6), df_grouped["diag_2"].isin(range6)),
                df_grouped["diag_3"].isin(range6))].index)        

    if col == "Organs":        
        d.extend(
            df_grouped[np.logical_or(
                np.logical_or(df_grouped["diag_1"].isin(range7), df_grouped["diag_2"].isin(range7)),
                df_grouped["diag_3"].isin(range7))].index)         

    if col == "Circulatory":        
        d.extend(
            df_grouped[np.logical_or(
                np.logical_or(df_grouped["diag_1"].isin(range8), df_grouped["diag_2"].isin(range8)),
                df_grouped["diag_3"].isin(range8))].index)        

    if col == "Respiratory":        
        d.extend(
            df_grouped[np.logical_or(
                np.logical_or(df_grouped["diag_1"].isin(range9), df_grouped["diag_2"].isin(range9)),
                df_grouped["diag_3"].isin(range9))].index)        

    if col == "Digestive":        
        d.extend(
            df_grouped[np.logical_or(
                np.logical_or(df_grouped["diag_1"].isin(range10), df_grouped["diag_2"].isin(range10)),
                df_grouped["diag_3"].isin(range10))].index)        

    if col == "Genitourinary":        
        d.extend(
            df_grouped[np.logical_or(
                np.logical_or(df_grouped["diag_1"].isin(range11), df_grouped["diag_2"].isin(range11)),
                df_grouped["diag_3"].isin(range11))].index) 

    if col == "Pregnancy":        
        d.extend(
            df_grouped[np.logical_or(
                np.logical_or(df_grouped["diag_1"].isin(range12), df_grouped["diag_2"].isin(range12)),
                df_grouped["diag_3"].isin(range12))].index)        

    if col == "Skin":        
        d.extend(
            df_grouped[np.logical_or(
                np.logical_or(df_grouped["diag_1"].isin(range13), df_grouped["diag_2"].isin(range13)),
                df_grouped["diag_3"].isin(range13))].index)        

    if col == "Muscoskeletal":        
        d.extend(
            df_grouped[np.logical_or(
                np.logical_or(df_grouped["diag_1"].isin(range14), df_grouped["diag_2"].isin(range14)),
                df_grouped["diag_3"].isin(range14))].index)        

    if col == "Congenital":        
        d.extend(
            df_grouped[np.logical_or(
                np.logical_or(df_grouped["diag_1"].isin(range15), df_grouped["diag_2"].isin(range15)),
                df_grouped["diag_3"].isin(range15))].index) 

    if col == "Perinatal":        
        d.extend(
            df_grouped[np.logical_or(
                np.logical_or(df_grouped["diag_1"].isin(range16), df_grouped["diag_2"].isin(range16)),
                df_grouped["diag_3"].isin(range16))].index)        

    if col == "Ill-defined":        
        d.extend(
            df_grouped[np.logical_or(
                np.logical_or(df_grouped["diag_1"].isin(range17), df_grouped["diag_2"].isin(range17)),
                df_grouped["diag_3"].isin(range17))].index)        

    if col == "Poisoning":        
        d.extend(
            df_grouped[np.logical_or(
                np.logical_or(df_grouped["diag_1"].isin(range18), df_grouped["diag_2"].isin(range18)),
                df_grouped["diag_3"].isin(range18))].index) 
        
    return [col + "_3",d]

In [25]:
def remap_diseases_1_diagnosis(column):
    
    d = []
    col = column
    print column
    
    if col == "Diabetis":
        d.extend(
            df_grouped[df_grouped["diag_1"].isin(rangeD2)].index)
        
    if col == "External causes":
        d.extend(
            df_grouped[df_grouped["diag_1"].str.upper().str.contains("E")].index)
        
        d.extend(
            df_grouped[df_grouped["diag_1"].str.upper().str.contains("V")].index)        
       
    if col == "Infectious and parasitic diseases":      
        d.extend(
            df_grouped[df_grouped["diag_1"].isin(range1)].index)

    if col == "Neoplasms":        
        d.extend(
            df_grouped[df_grouped["diag_1"].isin(range2)].index)        

    if col == "Endocrine":        
        d.extend(
            df_grouped[df_grouped["diag_1"].isin(range3)].index)        

    if col == "Blood":        
        d.extend(
            df_grouped[df_grouped["diag_1"].isin(range4)].index)        

    if col == "Mental":        
        d.extend(
            df_grouped[df_grouped["diag_1"].isin(range5)].index)        

    if col == "Nervous":        
        d.extend(
            df_grouped[df_grouped["diag_1"].isin(range6)].index)        

    if col == "Organs":        
        d.extend(
            df_grouped[df_grouped["diag_1"].isin(range7)].index)         

    if col == "Circulatory":        
        d.extend(
            df_grouped[df_grouped["diag_1"].isin(range8)].index)        

    if col == "Respiratory":        
        d.extend(
            df_grouped[df_grouped["diag_1"].isin(range9)].index)        

    if col == "Digestive":        
        d.extend(
            df_grouped[df_grouped["diag_1"].isin(range10)].index)        

    if col == "Genitourinary":        
        d.extend(
            df_grouped[df_grouped["diag_1"].isin(range11)].index) 

    if col == "Pregnancy":        
        d.extend(
            df_grouped[df_grouped["diag_1"].isin(range12)].index)        

    if col == "Skin":        
        d.extend(
             df_grouped[df_grouped["diag_1"].isin(range13)].index)        

    if col == "Muscoskeletal":        
        d.extend(
            df_grouped[df_grouped["diag_1"].isin(range14)].index)        

    if col == "Congenital":        
        d.extend(
            df_grouped[df_grouped["diag_1"].isin(range15)].index) 

    if col == "Perinatal":        
        d.extend(
            df_grouped[df_grouped["diag_1"].isin(range16)].index)        

    if col == "Ill-defined":        
        d.extend(
            df_grouped[df_grouped["diag_1"].isin(range17)].index)        

    if col == "Poisoning":        
        d.extend(
            df_grouped[df_grouped["diag_1"].isin(range18)].index) 
        
    return [col + "_1",d]

In [26]:
from multiprocessing import Pool
pool = Pool(4)

#Compute new columns
ix = []
ix = pool.map(remap_diseases_3_diagnosis,["Diabetis","External causes","Infectious and parasitic diseases",
"Neoplasms","Endocrine","Blood","Mental","Nervous","Organs","Circulatory",
"Respiratory","Digestive","Genitourinary","Pregnancy","Skin",
"Muscoskeletal","Congenital","Perinatal","Ill-defined","Poisoning"])

#Add new columns
for i in range(len(ix)):
    df_grouped[ix[i][0]]=0
    df_grouped[ix[i][0]].ix[ix[i][1]]=1
    
    print ix[i][0]
    print pd.value_counts(df_grouped[ix[i][0]])
    
#Filter_selected cols
to_del = ['diag_1','diag_2','diag_3']        
filtered_cols = [c for c in df_grouped.columns if (c not in to_del) ]#and ('ENF' not in c)
df_2 = df_grouped[filtered_cols]
print ("df_2",df_2.shape)


Infectious and parasitic diseases
Diabetis
Mental
Endocrine
External causes
Neoplasms
Nervous
Blood
Organs
Circulatory
Respiratory
Digestive
Genitourinary
Pregnancy
Skin
Muscoskeletal
Congenital
Perinatal
Ill-defined
Poisoning
/home/ilmira/.conda/envs/readmision/lib/python2.7/site-packages/ipykernel_launcher.py:14: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  
Diabetis_3
0    63742
1    38024
Name: Diabetis_3, dtype: int64
External causes_3
0    93297
1     8469
Name: External causes_3, dtype: int64
Infectious and parasitic diseases_3
0    95437
1     6329
Name: Infectious and parasitic diseases_3, dtype: int64
Neoplasms_3
0    95715
1     6051
Name: Neoplasms_3, dtype: int64
Endocrine_3
0    67774
1    33992
Name: Endocrine_3, dtype: int64
Blood_3
0    95531
1     6235
Name: Blood_3, dtype: int64
Mental_3
0    95209
1     6557
Name: Mental_3, dtype: int64
Nervous_3
0    98335
1     3431
Name: Nervous_3, dtype: int64
Organs_3
0    101148
1       618
Name: Organs_3, dtype: int64
Circulatory_3
1    58520
0    43246
Name: Circulatory_3, dtype: int64
Respiratory_3
0    79301
1    22465
Name: Respiratory_3, dtype: int64
Digestive_3
0    88140
1    13626
Name: Digestive_3, dtype: int64
Genitourinary_3
0    84456
1    17310
Name: Genitourinary_3, dtype: int64
Pregnancy_3
0    101074
1       692
Name: Pregnancy_3, dtype: int64
Skin_3
0    94443
1     7323
Name: Skin_3, dtype: int64
Muscoskeletal_3
0    94234
1     7532
Name: Muscoskeletal_3, dtype: int64
Congenital_3
0    101527
1       239
Name: Congenital_3, dtype: int64
Perinatal_3
0    101766
Name: Perinatal_3, dtype: int64
Ill-defined_3
0    86915
1    14851
Name: Ill-defined_3, dtype: int64
Poisoning_3
0    91872
1     9894
Name: Poisoning_3, dtype: int64
('df_2', (101766, 67))

In [27]:
from multiprocessing import Pool
pool = Pool(4)

#Compute new columns
ix = []
ix = pool.map(remap_diseases_1_diagnosis,["Diabetis","External causes","Infectious and parasitic diseases",
"Neoplasms","Endocrine","Blood","Mental","Nervous","Organs","Circulatory",
"Respiratory","Digestive","Genitourinary","Pregnancy","Skin",
"Muscoskeletal","Congenital","Perinatal","Ill-defined","Poisoning"])

#Add new columns
for i in range(len(ix)):
    df_2[ix[i][0]]=0
    df_2[ix[i][0]].ix[ix[i][1]]=1
    
    print ix[i][0]
    print pd.value_counts(df_2[ix[i][0]])
    
#Filter_selected cols
to_del = ['diag_1','diag_2','diag_3']        
filtered_cols = [c for c in df_2.columns if (c not in to_del) ]#and ('ENF' not in c)
df_2 = df_2[filtered_cols]
print ("df_2",df_2.shape)


Endocrine
Diabetis
Infectious and parasitic diseases
Mental
Neoplasms
Blood
Nervous
External causes
Organs
Circulatory
Respiratory
Digestive
Genitourinary
Pregnancy
Skin
Muscoskeletal
Congenital
Perinatal
Ill-defined
Poisoning
/home/ilmira/.conda/envs/readmision/lib/python2.7/site-packages/ipykernel_launcher.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
/home/ilmira/.conda/envs/readmision/lib/python2.7/site-packages/ipykernel_launcher.py:14: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  
/home/ilmira/.conda/envs/readmision/lib/python2.7/site-packages/ipykernel_launcher.py:14: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
Diabetis_1
0    93009
1     8757
Name: Diabetis_1, dtype: int64
External causes_1
0    100121
1      1645
Name: External causes_1, dtype: int64
Infectious and parasitic diseases_1
0    98998
1     2768
Name: Infectious and parasitic diseases_1, dtype: int64
Neoplasms_1
0    98366
1     3400
Name: Neoplasms_1, dtype: int64
Endocrine_1
0    98831
1     2935
Name: Endocrine_1, dtype: int64
Blood_1
0    100682
1      1084
Name: Blood_1, dtype: int64
Mental_1
0    99504
1     2262
Name: Mental_1, dtype: int64
Nervous_1
0    100827
1       939
Name: Nervous_1, dtype: int64
Organs_1
0    101503
1       263
Name: Organs_1, dtype: int64
Circulatory_1
0    71527
1    30239
Name: Circulatory_1, dtype: int64
Respiratory_1
0    91405
1    10361
Name: Respiratory_1, dtype: int64
Digestive_1
0    92565
1     9201
Name: Digestive_1, dtype: int64
Genitourinary_1
0    96688
1     5078
Name: Genitourinary_1, dtype: int64
Pregnancy_1
0    101079
1       687
Name: Pregnancy_1, dtype: int64
Skin_1
0    99244
1     2522
Name: Skin_1, dtype: int64
Muscoskeletal_1
0    96809
1     4957
Name: Muscoskeletal_1, dtype: int64
Congenital_1
0    101720
1        46
Name: Congenital_1, dtype: int64
Perinatal_1
0    101766
Name: Perinatal_1, dtype: int64
Ill-defined_1
0    94198
1     7568
Name: Ill-defined_1, dtype: int64
Poisoning_1
0    94821
1     6945
Name: Poisoning_1, dtype: int64
('df_2', (101766, 87))

In [28]:
#Save
df_2.to_csv(os.path.join('resources',"diabetic_data_extended_2017_09_29_14_xrp.csv"))

In [ ]: