In [16]:
# To re-run the notebook from scratch: clear the interactive namespace first.
# The -f flag makes %reset skip its confirmation prompt.
%reset -f
In [17]:
# coding: utf-8
# In[110]:
from sklearn import preprocessing
from time import time
from sklearn.feature_extraction import DictVectorizer
import numpy as np
import csv
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_diabetes
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import silhouette_samples, silhouette_score
from operator import truediv
from sklearn.metrics import pairwise_distances
import pandas as pd
import os
%matplotlib inline
In [23]:
# Load the pre-processed table (semicolon separated). The separator is passed
# by keyword because recent pandas releases no longer accept it positionally.
df_grouped = pd.read_csv(
    os.path.join('resources', 'diabetic_data_processed_2017_09_29.csv'), sep=';')
print(df_grouped.shape)


def _icd9_codes(first, last):
    """Return the integer codes ``first``..``last`` (both inclusive) as strings.

    The chapter bounds below are inclusive, whereas ``range`` excludes its stop
    value, so the stop is shifted by one here. Without the shift the last code
    of every chapter (139, 239, ..., 999) would belong to no group at all.
    """
    return np.array(range(first, last + 1)).astype(str)


# One array of string codes per disease group. The group each array stands for
# is the one the remap_diseases_* functions pair it with.
range1 = _icd9_codes(1, 139)     # Infectious and parasitic diseases
range2 = _icd9_codes(140, 239)   # Neoplasms
range3 = _icd9_codes(240, 279)   # Endocrine
range4 = _icd9_codes(280, 289)   # Blood
range5 = _icd9_codes(290, 319)   # Mental
range6 = _icd9_codes(320, 359)   # Nervous
range7 = _icd9_codes(360, 389)   # Organs
range8 = _icd9_codes(390, 459)   # Circulatory
range9 = _icd9_codes(460, 519)   # Respiratory
range10 = _icd9_codes(520, 579)  # Digestive
range11 = _icd9_codes(580, 629)  # Genitourinary
range12 = _icd9_codes(630, 679)  # Pregnancy
range13 = _icd9_codes(680, 709)  # Skin
range14 = _icd9_codes(710, 739)  # Muscoskeletal
range15 = _icd9_codes(740, 759)  # Congenital
range16 = _icd9_codes(760, 779)  # Perinatal
range17 = _icd9_codes(780, 799)  # Ill-defined
range18 = _icd9_codes(800, 999)  # Poisoning

# Diabetes codes are collected from the data itself: every distinct value that
# contains "250". All three diagnosis columns are scanned, because rangeD2 is
# also matched against diag_2 and diag_3 and a code that only ever occurs
# there would otherwise never be recognised.
ix = pd.Index(
    pd.concat([df_grouped[c] for c in ('diag_1', 'diag_2', 'diag_3')])
    .dropna()
    .unique())
# na=False keeps the mask strictly boolean even if a value is not a string.
rangeD2 = ix[ix.str.contains("250", na=False)]
print(rangeD2)
In [24]:
def remap_diseases_3_diagnosis(column):
    """Find the rows of ``df_grouped`` having a diagnosis in one disease group.

    A row is selected when at least one of its ``diag_1``, ``diag_2`` or
    ``diag_3`` values matches the group. "External causes" is matched by
    looking for the letter E or V (case-insensitively) anywhere in the code;
    every other group is matched by membership in its module-level code
    collection (``rangeD2`` for "Diabetis", ``range1``..``range18`` otherwise).

    Parameters
    ----------
    column : str
        Name of the disease group, e.g. ``"Neoplasms"``.

    Returns
    -------
    list
        ``[column + "_3", labels]`` where ``labels`` is a list of index labels
        of ``df_grouped``. For "External causes" the E matches come first and
        the V matches second, so a row matching both is listed twice. An
        unrecognised group name yields an empty ``labels`` list.
    """
    col = column
    print(column)

    diag_cols = ("diag_1", "diag_2", "diag_3")
    code_sets = {
        "Diabetis": rangeD2,
        "Infectious and parasitic diseases": range1,
        "Neoplasms": range2,
        "Endocrine": range3,
        "Blood": range4,
        "Mental": range5,
        "Nervous": range6,
        "Organs": range7,
        "Circulatory": range8,
        "Respiratory": range9,
        "Digestive": range10,
        "Genitourinary": range11,
        "Pregnancy": range12,
        "Skin": range13,
        "Muscoskeletal": range14,
        "Congenital": range15,
        "Perinatal": range16,
        "Ill-defined": range17,
        "Poisoning": range18,
    }

    def rows_matching(match):
        """Index labels of rows where ``match`` holds for any diagnosis column."""
        mask = match(df_grouped[diag_cols[0]])
        for name in diag_cols[1:]:
            mask = mask | match(df_grouped[name])
        return df_grouped[mask].index

    d = []
    if col == "External causes":
        for letter in ("E", "V"):
            # na=False: a missing diagnosis counts as "no match" instead of
            # putting NaN into the boolean mask, which would break indexing.
            d.extend(rows_matching(
                lambda s, letter=letter: s.str.upper().str.contains(letter, na=False)))
    elif col in code_sets:
        codes = code_sets[col]
        d.extend(rows_matching(lambda s: s.isin(codes)))
    return [col + "_3", d]
In [25]:
def remap_diseases_1_diagnosis(column):
    """Find the rows of ``df_grouped`` whose ``diag_1`` is in one disease group.

    Only the ``diag_1`` column is inspected. "External causes" is matched by
    looking for the letter E or V (case-insensitively) anywhere in the code;
    every other group is matched by membership in its module-level code
    collection (``rangeD2`` for "Diabetis", ``range1``..``range18`` otherwise).

    Parameters
    ----------
    column : str
        Name of the disease group, e.g. ``"Neoplasms"``.

    Returns
    -------
    list
        ``[column + "_1", labels]`` where ``labels`` is a list of index labels
        of ``df_grouped``. For "External causes" the E matches come first and
        the V matches second. An unrecognised group name yields an empty
        ``labels`` list.
    """
    col = column
    print(column)

    code_sets = {
        "Diabetis": rangeD2,
        "Infectious and parasitic diseases": range1,
        "Neoplasms": range2,
        "Endocrine": range3,
        "Blood": range4,
        "Mental": range5,
        "Nervous": range6,
        "Organs": range7,
        "Circulatory": range8,
        "Respiratory": range9,
        "Digestive": range10,
        "Genitourinary": range11,
        "Pregnancy": range12,
        "Skin": range13,
        "Muscoskeletal": range14,
        "Congenital": range15,
        "Perinatal": range16,
        "Ill-defined": range17,
        "Poisoning": range18,
    }

    primary = df_grouped["diag_1"]
    d = []
    if col == "External causes":
        upper = primary.str.upper()
        for letter in ("E", "V"):
            # na=False: a missing diagnosis counts as "no match" instead of
            # putting NaN into the boolean mask, which would break indexing.
            d.extend(df_grouped[upper.str.contains(letter, na=False)].index)
    elif col in code_sets:
        d.extend(df_grouped[primary.isin(code_sets[col])].index)
    return [col + "_1", d]
In [26]:
from multiprocessing import Pool

# Compute new columns: one worker call per disease group; each call returns
# [new_column_name, matching_row_labels].
pool = Pool(4)
try:
    ix = pool.map(remap_diseases_3_diagnosis,["Diabetis","External causes","Infectious and parasitic diseases",
                                             "Neoplasms","Endocrine","Blood","Mental","Nervous","Organs","Circulatory",
                                             "Respiratory","Digestive","Genitourinary","Pregnancy","Skin",
                                             "Muscoskeletal","Congenital","Perinatal","Ill-defined","Poisoning"])
finally:
    # Always shut the worker processes down, even if a task raised; otherwise
    # every run of this cell leaves four more processes behind.
    pool.close()
    pool.join()

# Add new columns: 0 everywhere, 1 on the rows reported for the group.
for col_name, row_labels in ix:
    df_grouped[col_name] = 0
    # A single label-based .loc assignment writes into df_grouped itself;
    # the chained form df[col].ix[rows] = 1 may write to a temporary copy.
    df_grouped.loc[row_labels, col_name] = 1
    print(col_name)
    print(df_grouped[col_name].value_counts())

# Filter selected cols: drop the raw diagnosis codes now that they are encoded.
to_del = ['diag_1','diag_2','diag_3']
filtered_cols = [c for c in df_grouped.columns if (c not in to_del) ]#and ('ENF' not in c)
# Explicit copy so that later column assignments on df_2 are unambiguous.
df_2 = df_grouped[filtered_cols].copy()
print ("df_2",df_2.shape)
In [27]:
from multiprocessing import Pool

# Compute new columns: one worker call per disease group; each call returns
# [new_column_name, matching_row_labels].
pool = Pool(4)
try:
    ix = pool.map(remap_diseases_1_diagnosis,["Diabetis","External causes","Infectious and parasitic diseases",
                                             "Neoplasms","Endocrine","Blood","Mental","Nervous","Organs","Circulatory",
                                             "Respiratory","Digestive","Genitourinary","Pregnancy","Skin",
                                             "Muscoskeletal","Congenital","Perinatal","Ill-defined","Poisoning"])
finally:
    # Always shut the worker processes down, even if a task raised; otherwise
    # every run of this cell leaves four more processes behind.
    pool.close()
    pool.join()

# df_2 comes from a column selection of df_grouped; work on an explicit copy
# so the assignments below unambiguously modify df_2.
df_2 = df_2.copy()

# Add new columns: 0 everywhere, 1 on the rows reported for the group.
# The row labels were computed on df_grouped; this relies on df_2 carrying
# the same index labels.
for col_name, row_labels in ix:
    df_2[col_name] = 0
    # A single label-based .loc assignment writes into df_2 itself;
    # the chained form df[col].ix[rows] = 1 may write to a temporary copy.
    df_2.loc[row_labels, col_name] = 1
    print(col_name)
    print(df_2[col_name].value_counts())

# Filter selected cols: make sure no raw diagnosis column is kept.
to_del = ['diag_1','diag_2','diag_3']
filtered_cols = [c for c in df_2.columns if (c not in to_del) ]#and ('ENF' not in c)
df_2 = df_2[filtered_cols]
print ("df_2",df_2.shape)
In [28]:
# Write the extended table to the resources folder as CSV, using the
# default to_csv settings.
extended_csv_path = os.path.join('resources', "diabetic_data_extended_2017_09_29_14_xrp.csv")
df_2.to_csv(extended_csv_path)
In [ ]: