In [1]:
import reader
import numpy as np
import pandas as pd
import os

First read in the original data


In [2]:
import re
data = pd.read_pickle(os.getcwd() + '/data/all_encounter_data.pickle')

Repeat the processing of all_encounter_data from ICO.py


In [3]:
d_enc = data.drop(["Enc_ID", "Person_ID"], axis=1)

# Some Glucose and BP values are swapped: a "systolic/diastolic" pattern in the
# Glucose field means the two columns need to be exchanged for that row.
pattern0 = re.compile(r"\d+\s*\/\s*\d+")
index1 = d_enc['Glucose'].str.contains(pattern0, na=False)
temp = d_enc.loc[index1, 'Glucose']
d_enc.loc[index1, 'Glucose'] = d_enc.loc[index1, 'BP']
d_enc.loc[index1, 'BP'] = temp

# Likewise, non-null BP values without a "/" are really glucose readings.
bp_notnull = d_enc.BP.notnull()
index2 = d_enc.BP[bp_notnull][~d_enc.BP[bp_notnull].str.contains('/')].index
temp = d_enc.loc[index2, 'Glucose']
d_enc.loc[index2, 'Glucose'] = d_enc.loc[index2, 'BP']
d_enc.loc[index2, 'BP'] = temp

# Split up the BP field into Systolic and Diastolic readings
pattern1 = re.compile(r"(?P<BP_Systolic>\d+)\s*\/\s*(?P<BP_Diastolic>\d+)")
d_enc = pd.merge(d_enc, d_enc["BP"].str.extract(pattern1, expand=True),
                 left_index=True, right_index=True).drop("BP", axis=1)

# Define ranges for reasonable values: treat data outside 1.5 times the IQR as outliers.
NaN = float("NaN")
quantitive_columns = ['A1C', 'BMI', 'Glucose', 'BP_Diastolic', 'BP_Systolic']

In [4]:
for column in quantitive_columns:
    d_enc[column] = pd.to_numeric(d_enc[column], errors='coerce')
    temp = d_enc[column][d_enc[column].notnull()]

    Q3 = temp.quantile(0.75)
    Q1 = temp.quantile(0.25)
    IQR = Q3 - Q1

    # Count how many non-null values fall inside the (Q1 - 2*IQR, Q3 + 2*IQR) fence.
    kept = temp[(Q1 - 2 * IQR < temp) & (temp < Q3 + 2 * IQR)]
    print(kept.shape[0], temp.shape[0], d_enc.shape[0])
    print(column, Q1 - 2 * IQR, Q3 + 2 * IQR)


(11850, 12793, 82432)
('A1C', 2.6000000000000005, 11.6)
(21888, 22198, 82432)
('BMI', 8.319999999999993, 55.29500000000001)
(29150, 30461, 82432)
('Glucose', 16.0, 246.0)
(37922, 38226, 82432)
('BP_Diastolic', 40.0, 115.0)
(37758, 38226, 82432)
('BP_Systolic', 74.0, 194.0)

In [5]:
for column in quantitive_columns:
    d_enc[column] = pd.to_numeric(d_enc[column], errors='coerce')
    temp = d_enc[column][d_enc[column].notnull()]

    Q3 = temp.quantile(0.75)
    Q1 = temp.quantile(0.25)
    IQR = Q3 - Q1

    # Count how many non-null values fall inside the (Q1 - 1.5*IQR, Q3 + 1.5*IQR) fence.
    kept = temp[(Q1 - 1.5 * IQR < temp) & (temp < Q3 + 1.5 * IQR)]
    print(kept.shape[0], temp.shape[0], d_enc.shape[0])
    print(column, Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)


(11467, 12793, 82432)
('A1C', 3.5000000000000004, 10.7)
(21566, 22198, 82432)
('BMI', 13.017499999999995, 50.59750000000001)
(28517, 30461, 82432)
('Glucose', 39.0, 223.0)
(37383, 38226, 82432)
('BP_Diastolic', 47.5, 107.5)
(37164, 38226, 82432)
('BP_Systolic', 86.0, 182.0)

After applying the outlier fences to all_encounter_data, tightening the fence from 2×IQR to 1.5×IQR removes at most roughly 600 additional points per variable. The looser 2×IQR fence does not retain as many extra points as I expected, so I stay with the standard 1.5×IQR rule.

After removing the outliers, the distributions of the variables look much closer to normal.
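To make the 1.5×IQR versus 2×IQR comparison more direct, a small sketch like the following (using the numeric d_enc and quantitive_columns from above; the helper name retained_fraction and the set of multipliers are illustrative) tabulates the fraction of non-null values each fence keeps:

def retained_fraction(series, k):
    # Fraction of non-null values inside the (Q1 - k*IQR, Q3 + k*IQR) fence.
    temp = series.dropna()
    q1, q3 = temp.quantile(0.25), temp.quantile(0.75)
    iqr = q3 - q1
    kept = temp[(temp > q1 - k * iqr) & (temp < q3 + k * iqr)]
    return len(kept) / float(len(temp))

for column in quantitive_columns:
    print(column, {k: round(retained_fraction(d_enc[column], k), 3) for k in (1.5, 2.0, 3.0)})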


In [6]:
import matplotlib.pyplot as plt
for column in quantitive_columns:
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)

    temp0 = pd.to_numeric(d_enc[column], errors='coerce')
    ax1.hist(temp0[temp0.notnull()])
    ax1.set_title('before')

    temp = temp0[temp0.notnull()]

    Q3 = temp.quantile(0.75)
    Q1 = temp.quantile(0.25)
    IQR = Q3 - Q1

    # Replace values outside the 1.5*IQR fence with NaN, then plot the cleaned histogram.
    d_enc[column] = temp0.map(lambda x: x if Q1 - 1.5 * IQR < x < Q3 + 1.5 * IQR else NaN)
    ax2.hist(d_enc[column][d_enc[column].notnull()])
    ax2.set_title('after')

    f.suptitle(column)

plt.show()


What if we collapse all_encounter_data down to all_person_data by grouping on Person_Nbr?
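The person-level pickles loaded below were built separately (in ICO.py), so the following is only a minimal sketch of what such a collapse could look like. It assumes the raw encounter frame carries a Person_Nbr identifier, and taking each patient's mean reading is purely an illustrative aggregation choice:

# Sketch only: coerce a few fields to numeric, then average them per patient.
person_level_sketch = (
    data.assign(**{c: pd.to_numeric(data[c], errors='coerce')
                   for c in ['A1C', 'BMI', 'Glucose']})
        .groupby('Person_Nbr')[['A1C', 'BMI', 'Glucose']]
        .mean())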


In [7]:
person_data_old = pd.read_pickle(os.getcwd() + '/data/all_person_data_Richard_20170307.pickle')
person_data_new = pd.read_pickle(os.getcwd() + '/data/all_person_data_Dan_20170406.pickle')

In [8]:
person_data_old[quantitive_columns].isnull().sum(axis=0)/person_data_old.shape[0]


Out[8]:
A1C             0.683896
BMI             0.584450
Glucose         0.329509
BP_Diastolic    0.091714
BP_Systolic     0.091839
dtype: float64

In [9]:
person_data_new[quantitive_columns].isnull().sum(axis=0)/person_data_new.shape[0]


Out[9]:
A1C             0.706465
BMI             0.596421
Glucose         0.365422
BP_Diastolic    0.103809
BP_Systolic     0.104807
dtype: float64

From the comparison above, removing the identified outliers increases the missing-value fraction of each variable by no more than about 4 percentage points, which is acceptable to me.
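This can be checked directly by differencing the two missing-value profiles; a minimal sketch, assuming both person-level frames are still in memory:

# Per-variable increase in the missing-value fraction (new minus old);
# every entry stays below roughly 0.04.
print(person_data_new[quantitive_columns].isnull().mean()
      - person_data_old[quantitive_columns].isnull().mean())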


In [10]:
plt.bar(range(0,5),
        person_data_new[quantitive_columns].isnull().sum(axis=0)/person_data_new.shape[0])
plt.gca().set_ylim([0,1])
plt.xticks(range(0,5), quantitive_columns)
plt.ylabel('Missing value percentage')
plt.xlabel('Quantitative variables')
plt.show()


Now use the person data processed on 04/15 (with new features added)


In [3]:
person_data_new = pd.read_pickle(os.getcwd() + '/data/all_person_data_Dan_20170415.pickle')

In [5]:
person_data_new.columns.values


Out[5]:
array(['DOB', 'Gender', 'Race', 'A1C', 'BMI', 'Glucose', 'BP_Systolic',
       'BP_Diastolic', 'MR_OD_SPH_Numeric', 'MR_OD_CYL_Numeric',
       'MR_OS_SPH_Numeric', 'MR_OS_CYL_Numeric', 'MR_OS_DVA_ability',
       'MR_OD_DVA_ability', 'MR_OS_NVA_ability', 'MR_OD_NVA_ability',
       'Last_Encounter', 'recent_smoking_status', 'family_DM', 'family_G',
       'DM', 'ME', 'MNPDR', 'PDR', 'SNPDR', 'mNPDR', 'Glaucoma_Suspect',
       'Open_angle_Glaucoma', 'Cataract', 'worst_DR', 'recent_DR', 'Age'], dtype=object)

In [17]:
quantitive_columns = ["A1C", "BMI", "Glucose", "BP_Systolic", "BP_Diastolic",
                        'MR_OD_SPH_Numeric', 'MR_OD_CYL_Numeric',
                        'MR_OS_SPH_Numeric', 'MR_OS_CYL_Numeric',
                        'MR_OS_DVA_ability', 'MR_OD_DVA_ability',
                        'MR_OS_NVA_ability', 'MR_OD_NVA_ability']

Get dummy values for the categorical features


In [9]:
dummy_columns = ['DM', 'ME', 'Glaucoma_Suspect', 'Open_angle_Glaucoma', 'Cataract']
categorical_columns = ['Gender', 'Race', 'recent_smoking_status', 'family_DM', 'family_G']
for column in categorical_columns:
    temp = pd.get_dummies(person_data_new[column], prefix=column)
    person_data_new[temp.columns.values]=temp
    dummy_columns.extend(temp.columns.values.tolist())

Group the quantitative features by Age and Gender

But some group means are themselves null, so null values remain after filling.


In [12]:
temp = person_data_new.copy()
# Mean A1C of the non-null values within each (Gender, Age-bin) group
mean_value = temp.groupby(['Gender', pd.cut(temp['Age'], 6)]).apply(
    lambda x: x['A1C'][x['A1C'].notnull()].mean())
# Row indices of the null A1C values within each group
missing_index = temp.groupby(['Gender', pd.cut(temp['Age'], 6)]).apply(
    lambda x: x['A1C'][x['A1C'].isnull()])
for i in mean_value.index.to_series().tolist():
    if i in missing_index.index:
        temp.loc[missing_index[i].index, 'A1C'] = mean_value[i]

In [13]:
mean_value


Out[13]:
Gender  Age         
F       (17.904, 34]    7.539483
        (34, 50]        7.473878
        (50, 66]        7.292831
        (66, 82]        6.949700
        (82, 98]        6.800847
        (98, 114]            NaN
M       (17.904, 34]    7.542628
        (34, 50]        7.567414
        (50, 66]        7.248448
        (66, 82]        7.042823
        (82, 98]        6.968860
        (98, 114]            NaN
U       (17.904, 34]         NaN
        (34, 50]             NaN
        (50, 66]        8.500000
        (66, 82]             NaN
dtype: float64

In [14]:
temp[temp['A1C'].isnull()].shape[0]


Out[14]:
14
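These 14 stragglers are patients whose (Gender, Age-bin) group contains no observed A1C at all, so the group mean itself is NaN (the NaN entries in Out[13]). A minimal sketch of a two-stage fill that falls back to the overall mean; temp2 and the fallback rule are illustrative, not part of the original pipeline:

# Group mean first, then the overall A1C mean for groups with no observed values.
temp2 = person_data_new.copy()
group_mean = temp2.groupby(['Gender', pd.cut(temp2['Age'], 6)])['A1C'].transform('mean')
temp2['A1C'] = temp2['A1C'].fillna(group_mean).fillna(temp2['A1C'].mean())
print(temp2['A1C'].isnull().sum())  # expected to reach 0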

Group the quantitative features by Age_group (and possibly Gender)

Divide patients into roughly equal-sized groups by age and create a new column called Age_group

Divide patients into groups by age quantile and get dummy values (an alternative pd.qcut sketch follows the binning cells below)


In [10]:
age_group = np.array([person_data_new.Age.quantile(1.0/6*i) for i in range(1,7)])

In [11]:
age_group


Out[11]:
array([  49.,   56.,   61.,   67.,   75.,  114.])

In [12]:
# Assign each patient to a bin (1-6) based on how many quantile edges their age exceeds
person_data_new['Age_group_numeric'] = person_data_new.Age.apply(lambda x: sum(age_group < x) + 1)

In [13]:
age_group_dict = {1: '(18, 48]', 2: '(49, 55]', 3: '(56, 60]', 4: '(61, 66]', 5: '(67, 74]', 6: '(75, 114]'}
person_data_new['Age_group'] = person_data_new.Age_group_numeric.apply(lambda x: age_group_dict.get(x))
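As an aside, pd.qcut builds the same kind of equal-frequency bins in one call; this sketch is an alternative to the manual quantile/dict mapping above (the column name Age_group_qcut is illustrative, and the exact bin edges may differ slightly at the boundaries):

person_data_new['Age_group_qcut'] = pd.qcut(
    person_data_new.Age, q=6,
    labels=['(18, 48]', '(49, 55]', '(56, 60]', '(61, 66]', '(67, 74]', '(75, 114]'])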

In [14]:
person_data_new.groupby('Age_group').apply(lambda x: x.shape[0])


Out[14]:
Age_group
(18, 48]     2882
(49, 55]     2779
(56, 60]     2465
(61, 66]     2723
(67, 74]     2544
(75, 114]    2646
dtype: int64

In [15]:
temp = pd.get_dummies(person_data_new['Age_group'], prefix = 'Age_group')
person_data_new[temp.columns.values] = temp
dummy_columns.extend(temp.columns.values.tolist())

The missing-value percentages in the different age groups are as follows:


In [18]:
person_data_new.groupby('Age_group').apply(lambda x: x[quantitive_columns].isnull().sum(axis=0)/x.shape[0])


Out[18]:
A1C BMI Glucose BP_Systolic BP_Diastolic MR_OD_SPH_Numeric MR_OD_CYL_Numeric MR_OS_SPH_Numeric MR_OS_CYL_Numeric MR_OS_DVA_ability MR_OD_DVA_ability MR_OS_NVA_ability MR_OD_NVA_ability
Age_group
(18, 48] 0.680083 0.553435 0.416724 0.121096 0.134282 0.348369 0.425052 0.354268 0.419153 0.260930 0.252255 0.540597 0.534004
(49, 55] 0.682979 0.565671 0.374955 0.093559 0.095358 0.309104 0.377474 0.309824 0.403023 0.258006 0.254408 0.423893 0.421375
(56, 60] 0.690061 0.595538 0.367546 0.090872 0.088438 0.305071 0.391481 0.312373 0.414199 0.281947 0.270588 0.404462 0.400000
(61, 66] 0.675725 0.602277 0.354021 0.091811 0.088138 0.297833 0.401763 0.303709 0.408373 0.324642 0.316930 0.412413 0.408006
(67, 74] 0.704009 0.648978 0.309748 0.104953 0.097091 0.347484 0.438286 0.347484 0.452437 0.408805 0.404481 0.448506 0.451651
(75, 114] 0.809146 0.620181 0.362812 0.125094 0.116402 0.410053 0.504535 0.417234 0.503023 0.563870 0.545729 0.602419 0.598639

Run a one-way ANOVA test

to test whether the mean value of each feature is the same across the groups
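For reference, the statistic reported by f_oneway is the between-group mean square divided by the within-group mean square; a small sketch recomputing it by hand for A1C grouped by Age_group_numeric (the names groups, grand_mean, etc. are illustrative) should agree with the A1C result below:

groups = [g.dropna().values for _, g in person_data_new.groupby('Age_group_numeric')['A1C']]
grand_mean = np.concatenate(groups).mean()
k, n = len(groups), sum(len(g) for g in groups)
ms_between = sum(len(g) * (g.mean() - grand_mean) ** 2 for g in groups) / (k - 1)
ms_within = sum(((g - g.mean()) ** 2).sum() for g in groups) / (n - k)
print(ms_between / ms_within)  # F statistic; compare with f_oneway's output for A1C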


In [19]:
from scipy.stats import f_oneway

In [20]:
for column in quantitive_columns:
    temp = {k: list(v[column]) for k, v in person_data_new[person_data_new[column].notnull()].groupby('Age_group_numeric')}
    print(column)
    print(f_oneway(temp[1], temp[2], temp[3], temp[4], temp[5], temp[6]))


A1C
F_onewayResult(statistic=26.371771578908884, pvalue=2.2829101727302631e-26)
BMI
F_onewayResult(statistic=78.411159445800152, pvalue=4.2403569817472176e-80)
Glucose
F_onewayResult(statistic=35.123917937562318, pvalue=9.5112179417179683e-36)
BP_Systolic
F_onewayResult(statistic=49.171108345538109, pvalue=1.1760769627946283e-50)
BP_Diastolic
F_onewayResult(statistic=176.9193371566318, pvalue=2.5176503822653756e-183)
MR_OD_SPH_Numeric
F_onewayResult(statistic=217.87164565083827, pvalue=5.1387782598269752e-222)
MR_OD_CYL_Numeric
F_onewayResult(statistic=26.458983199181276, pvalue=1.2130687141502503e-26)
MR_OS_SPH_Numeric
F_onewayResult(statistic=209.12983449672444, pvalue=2.3935964880082328e-213)
MR_OS_CYL_Numeric
F_onewayResult(statistic=16.589865164327836, pvalue=2.4119943621839587e-16)
MR_OS_DVA_ability
F_onewayResult(statistic=200.40077064845303, pvalue=1.2363858466299997e-204)
MR_OD_DVA_ability
F_onewayResult(statistic=209.63854412824898, pvalue=7.0787069134924694e-214)
MR_OS_NVA_ability
F_onewayResult(statistic=122.47698550689915, pvalue=1.5354057497370595e-125)
MR_OD_NVA_ability
F_onewayResult(statistic=111.11095477280593, pvalue=4.4419469543124147e-114)

In [21]:
for column in quantitive_columns:
    temp = {k: list(v[column]) for k, v in person_data_new[person_data_new[column].notnull()].groupby('Gender')}
    print(column)
    print(f_oneway(temp['F'], temp['M']))


A1C
F_onewayResult(statistic=1.4508634183063212, pvalue=0.22845005980287483)
BMI
F_onewayResult(statistic=179.51062957351098, pvalue=2.1345994906385521e-40)
Glucose
F_onewayResult(statistic=27.152720231647105, pvalue=1.9168366445098947e-07)
BP_Systolic
F_onewayResult(statistic=0.13518260552612232, pvalue=0.71312333853157739)
BP_Diastolic
F_onewayResult(statistic=66.008229158879161, pvalue=4.8540258014637313e-16)
MR_OD_SPH_Numeric
F_onewayResult(statistic=19.330433080545003, pvalue=1.1100753790602396e-05)
MR_OD_CYL_Numeric
F_onewayResult(statistic=2.4063093644301552, pvalue=0.12088118750659965)
MR_OS_SPH_Numeric
F_onewayResult(statistic=28.118727566825086, pvalue=1.1640004134418013e-07)
MR_OS_CYL_Numeric
F_onewayResult(statistic=7.2251013605662795, pvalue=0.0072022074879901322)
MR_OS_DVA_ability
F_onewayResult(statistic=10.500953735185268, pvalue=0.0011968627228390253)
MR_OD_DVA_ability
F_onewayResult(statistic=4.8160175526195905, pvalue=0.02821804091505874)
MR_OS_NVA_ability
F_onewayResult(statistic=4.8402445597134838, pvalue=0.027829941996177231)
MR_OD_NVA_ability
F_onewayResult(statistic=0.2505767682376332, pvalue=0.61668221786648758)

Fill in the missing values

with the mean of the non-null values within each age group


In [22]:
person_data_fillup = {}

In [23]:
temp = person_data_new.copy()
for column in quantitive_columns:
    # Group mean of the non-null values, and indices of the null values, per Age_group
    mean_value = temp.groupby('Age_group').apply(
        lambda x: x[column][x[column].notnull()].mean())
    missing_index = temp.groupby('Age_group').apply(
        lambda x: x[column][x[column].isnull()])
    for i in mean_value.index.to_series().tolist():
        if i in missing_index.index:
            temp.loc[missing_index[i].index, column] = mean_value[i]
person_data_fillup['groupbyAgegroup_mean'] = temp
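For comparison, groupby().transform('mean') gives the same fill more concisely; a sketch (temp2 is an illustrative name):

temp2 = person_data_new.copy()
for column in quantitive_columns:
    # Fill each column with the mean of the non-null values in its Age_group.
    temp2[column] = temp2[column].fillna(
        temp2.groupby('Age_group')[column].transform('mean'))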

Group the quantitative features by DR diagnosis

The missing-value percentages for the different diagnoses are as follows:


In [24]:
person_data_new.groupby('recent_DR').apply(lambda x: x.shape[0])


Out[24]:
recent_DR
MNPDR      654
PDR        964
SNPDR      198
mNPDR     2214
no_DR    12009
dtype: int64

In [25]:
person_data_new.groupby('recent_DR').apply(lambda x: x[quantitive_columns].isnull().sum(axis=0)/x.shape[0])


Out[25]:
A1C BMI Glucose BP_Systolic BP_Diastolic MR_OD_SPH_Numeric MR_OD_CYL_Numeric MR_OS_SPH_Numeric MR_OS_CYL_Numeric MR_OS_DVA_ability MR_OD_DVA_ability MR_OS_NVA_ability MR_OD_NVA_ability
recent_DR
MNPDR 0.631498 0.524465 0.249235 0.064220 0.059633 0.327217 0.441896 0.324159 0.415902 0.362385 0.382263 0.475535 0.504587
PDR 0.644191 0.581950 0.221992 0.134855 0.125519 0.502075 0.572614 0.509336 0.604772 0.703320 0.692946 0.727178 0.731328
SNPDR 0.712121 0.505051 0.303030 0.075758 0.085859 0.459596 0.525253 0.469697 0.500000 0.560606 0.540404 0.646465 0.626263
mNPDR 0.645438 0.615628 0.277326 0.070912 0.065944 0.313008 0.413279 0.324300 0.415086 0.358175 0.341012 0.456188 0.451220
no_DR 0.726705 0.599550 0.400533 0.111333 0.111750 0.325839 0.409943 0.329170 0.422183 0.313182 0.304522 0.453077 0.448164

In [26]:
person_data_new.groupby('worst_DR').apply(lambda x: x.shape[0])


Out[26]:
worst_DR
MNPDR      711
PDR       1040
SNPDR      224
mNPDR     2055
no_DR    12009
dtype: int64

In [27]:
person_data_new.groupby('worst_DR').apply(lambda x: x[quantitive_columns].isnull().sum(axis=0)/x.shape[0])


Out[27]:
A1C BMI Glucose BP_Systolic BP_Diastolic MR_OD_SPH_Numeric MR_OD_CYL_Numeric MR_OS_SPH_Numeric MR_OS_CYL_Numeric MR_OS_DVA_ability MR_OD_DVA_ability MR_OS_NVA_ability MR_OD_NVA_ability
worst_DR
MNPDR 0.613221 0.545710 0.244726 0.059072 0.053446 0.326301 0.431786 0.315049 0.414909 0.358650 0.374121 0.459916 0.483826
PDR 0.642308 0.584615 0.221154 0.130769 0.120192 0.501923 0.570192 0.508654 0.599038 0.700000 0.688462 0.725962 0.727885
SNPDR 0.691964 0.526786 0.276786 0.071429 0.075893 0.441964 0.526786 0.428571 0.482143 0.540179 0.513393 0.620536 0.602679
mNPDR 0.654501 0.609732 0.284672 0.072993 0.069586 0.306083 0.409732 0.323601 0.412165 0.347932 0.332360 0.452068 0.448662
no_DR 0.726705 0.599550 0.400533 0.111333 0.111750 0.325839 0.409943 0.329170 0.422183 0.313182 0.304522 0.453077 0.448164

Run a one-way ANOVA test

to test whether the mean value of each feature is the same across the diagnosis groups


In [28]:
for column in quantitive_columns:
    temp = {k: list(v[column]) for k, v in person_data_new[person_data_new[column].notnull()].groupby('recent_DR')}
    print(column)
    print(f_oneway(temp['PDR'], temp['SNPDR'], temp['MNPDR'], temp['mNPDR'], temp['no_DR']))


A1C
F_onewayResult(statistic=42.456564864690357, pvalue=4.9175526790653143e-35)
BMI
F_onewayResult(statistic=7.9206972185708171, pvalue=2.2979972724998389e-06)
Glucose
F_onewayResult(statistic=18.394389549816427, pvalue=4.5149589422910206e-15)
BP_Systolic
F_onewayResult(statistic=56.945191077184248, pvalue=9.5480751841461601e-48)
BP_Diastolic
F_onewayResult(statistic=8.6619398405301098, pvalue=5.5900999918337789e-07)
MR_OD_SPH_Numeric
F_onewayResult(statistic=2.1966630697677818, pvalue=0.066735176950483879)
MR_OD_CYL_Numeric
F_onewayResult(statistic=5.6506627213464311, pvalue=0.00015376632309987341)
MR_OS_SPH_Numeric
F_onewayResult(statistic=2.1017323745085639, pvalue=0.077839114719626457)
MR_OS_CYL_Numeric
F_onewayResult(statistic=2.7839127877884482, pvalue=0.025140550532848735)
MR_OS_DVA_ability
F_onewayResult(statistic=60.845608802329771, pvalue=6.8374998300248019e-51)
MR_OD_DVA_ability
F_onewayResult(statistic=63.284786560775842, pvalue=5.9343802959108297e-53)
MR_OS_NVA_ability
F_onewayResult(statistic=49.846849471007879, pvalue=1.5844288400301107e-41)
MR_OD_NVA_ability
F_onewayResult(statistic=44.784995411380933, pvalue=2.8365614506713311e-37)

Fill in the missing values

with the mean of the non-null values within each DR diagnosis group


In [29]:
DR_diagnoses = ['PDR', 'SNPDR', 'MNPDR', 'mNPDR', 'no_DR']

In [30]:
temp = person_data_new.copy()
for column in quantitive_columns:
    mean_value = temp.groupby('recent_DR').apply(lambda x: x[column][x[column].notnull()].mean())
    missing_index = temp.groupby('recent_DR').apply(lambda x: x[column][x[column].isnull()])
    for diagnosis in DR_diagnoses:
        temp.loc[missing_index[diagnosis].index, column] = mean_value[diagnosis]
person_data_fillup['recent_groupbyDR_mean'] = temp

In [31]:
temp = person_data_new.copy()
for column in quantitive_columns:
    mean_value = temp.groupby('worst_DR').apply(lambda x: x[column][x[column].notnull()].mean())
    missing_index = temp.groupby('worst_DR').apply(lambda x: x[column][x[column].isnull()])
    for diagnosis in DR_diagnoses:
        temp.loc[missing_index[diagnosis].index, column] = mean_value[diagnosis]
person_data_fillup['worst_groupbyDR_mean'] = temp

Modeling trial

Variables


In [32]:
dummy_columns


Out[32]:
['DM',
 'ME',
 'Glaucoma_Suspect',
 'Open_angle_Glaucoma',
 'Cataract',
 'Gender_F',
 'Gender_M',
 'Gender_U',
 'Race_Asian',
 'Race_Black or African American',
 'Race_Hispanic or Latino',
 'Race_Other',
 'Race_White',
 'recent_smoking_status_current every day smoker',
 'recent_smoking_status_current some day smoker',
 'recent_smoking_status_former smoker',
 'recent_smoking_status_heavy tobacco smoker',
 'recent_smoking_status_light tobacco smoker',
 'recent_smoking_status_never smoker',
 'recent_smoking_status_smoker',
 'recent_smoking_status_unknown if ever smoked',
 'family_DM_G_DM_P_NDM',
 'family_DM_Gp_DM',
 'family_DM_Gp_SM_P_DM',
 'family_DM_P_DM',
 'family_DM_P_NDM',
 'family_DM_Unknown',
 'family_G_GP_G_P_NG',
 'family_G_Gp_G',
 'family_G_Gp_G_P_G',
 'family_G_P_G',
 'family_G_P_NG',
 'family_G_Unknown',
 'Age_group_(18, 48]',
 'Age_group_(49, 55]',
 'Age_group_(56, 60]',
 'Age_group_(61, 66]',
 'Age_group_(67, 74]',
 'Age_group_(75, 114]']

In [33]:
quantitive_columns


Out[33]:
['A1C',
 'BMI',
 'Glucose',
 'BP_Systolic',
 'BP_Diastolic',
 'MR_OD_SPH_Numeric',
 'MR_OD_CYL_Numeric',
 'MR_OS_SPH_Numeric',
 'MR_OS_CYL_Numeric',
 'MR_OS_DVA_ability',
 'MR_OD_DVA_ability',
 'MR_OS_NVA_ability',
 'MR_OD_NVA_ability']

In [34]:
target_columns = {'recent_groupbyDR_mean': 'recent_DR', 
                  'worst_groupbyDR_mean': 'worst_DR',
                  'groupbyAgegroup_mean': 'recent_DR'}

Decision Tree modeling exploration


In [35]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [36]:
for method, temp in person_data_fillup.items():
    print(method)
    
    X = temp[quantitive_columns + dummy_columns]
    y = temp[target_columns[method]]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(X_train, y_train)

    preds = clf.predict(X = X_test)

    #preds = label_encoder.inverse_transform(preds.tolist())
    #y_test = label_encoder.inverse_transform(y_test)

    print(pd.crosstab(y_test, preds))
    print(metrics.classification_report(y_true = y_test, y_pred=preds))


groupbyAgegroup_mean
col_0      MNPDR  PDR  SNPDR  mNPDR  no_DR
recent_DR                                 
MNPDR         25   29      6     43    105
PDR           26   67     16     55    148
SNPDR          3   16      4     15     20
mNPDR         59   67     16    141    442
no_DR        122  153     30    486   3199
             precision    recall  f1-score   support

      MNPDR       0.11      0.12      0.11       208
        PDR       0.20      0.21      0.21       312
      SNPDR       0.06      0.07      0.06        58
      mNPDR       0.19      0.19      0.19       725
      no_DR       0.82      0.80      0.81      3990

avg / total       0.66      0.65      0.65      5293

recent_groupbyDR_mean
col_0      MNPDR  PDR  SNPDR  mNPDR  no_DR
recent_DR                                 
MNPDR        190    1      2      5     10
PDR            3  281      1      9     18
SNPDR          0    2     41      6      9
mNPDR          5    2      1    684     33
no_DR         18   17      4     37   3914
             precision    recall  f1-score   support

      MNPDR       0.88      0.91      0.90       208
        PDR       0.93      0.90      0.91       312
      SNPDR       0.84      0.71      0.77        58
      mNPDR       0.92      0.94      0.93       725
      no_DR       0.98      0.98      0.98      3990

avg / total       0.97      0.97      0.97      5293

worst_groupbyDR_mean
col_0     MNPDR  PDR  SNPDR  mNPDR  no_DR
worst_DR                                 
MNPDR       206    4      0      3     16
PDR           2  299      4      8     19
SNPDR         1    3     61      2      5
mNPDR         1    5      1    631     32
no_DR        19   18      3     32   3918
             precision    recall  f1-score   support

      MNPDR       0.90      0.90      0.90       229
        PDR       0.91      0.90      0.90       332
      SNPDR       0.88      0.85      0.87        72
      mNPDR       0.93      0.94      0.94       670
      no_DR       0.98      0.98      0.98      3990

avg / total       0.97      0.97      0.97      5293


In [41]:
tree.export_graphviz(clf, feature_names = quantitive_columns + dummy_columns,
                     class_names = ['MNPDR','PDR','SNPDR','mNPDR','no_DR'], out_file='DT.dot')
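If a rendered image of the tree is wanted, the exported DT.dot file can be converted outside the notebook with the Graphviz command-line tool, e.g. dot -Tpng DT.dot -o DT.png (assuming Graphviz is installed).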

Logistic Regression modeling exploration


In [37]:
from sklearn.linear_model import LogisticRegression

In [38]:
for method, temp in person_data_fillup.items():
    print(method)
    
    X = temp[quantitive_columns + dummy_columns]
    y = temp[target_columns[method]]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    clf = LogisticRegression()
    clf = clf.fit(X_train, y_train)

    preds = clf.predict(X = X_test)

    #preds = label_encoder.inverse_transform(preds.tolist())
    #y_test = label_encoder.inverse_transform(y_test)

    print(pd.crosstab(y_test, preds))
    print(metrics.classification_report(y_true = y_test, y_pred=preds))


groupbyAgegroup_mean
col_0      MNPDR  PDR  SNPDR  mNPDR  no_DR
recent_DR                                 
MNPDR          5   35      2     34    132
PDR            2   55      1     39    215
SNPDR          1   20      0      7     30
mNPDR          8   42      1     46    628
no_DR          1   22      0     16   3951
             precision    recall  f1-score   support

      MNPDR       0.29      0.02      0.04       208
        PDR       0.32      0.18      0.23       312
      SNPDR       0.00      0.00      0.00        58
      mNPDR       0.32      0.06      0.11       725
      no_DR       0.80      0.99      0.88      3990

avg / total       0.68      0.77      0.70      5293

recent_groupbyDR_mean
col_0      MNPDR  PDR  SNPDR  mNPDR  no_DR
recent_DR                                 
MNPDR         12   26      2     49    119
PDR            0   82      1     32    197
SNPDR          2   16      0     12     28
mNPDR         10   26      0     84    605
no_DR          0   41      0     61   3888
             precision    recall  f1-score   support

      MNPDR       0.50      0.06      0.10       208
        PDR       0.43      0.26      0.33       312
      SNPDR       0.00      0.00      0.00        58
      mNPDR       0.35      0.12      0.17       725
      no_DR       0.80      0.97      0.88      3990

avg / total       0.70      0.77      0.71      5293

worst_groupbyDR_mean
col_0     MNPDR  PDR  SNPDR  mNPDR  no_DR
worst_DR                                 
MNPDR        13   36      3     34    143
PDR           1  103      2     21    205
SNPDR         7   24      0     10     31
mNPDR         7   32      2     49    580
no_DR         4   45      1     48   3892
             precision    recall  f1-score   support

      MNPDR       0.41      0.06      0.10       229
        PDR       0.43      0.31      0.36       332
      SNPDR       0.00      0.00      0.00        72
      mNPDR       0.30      0.07      0.12       670
      no_DR       0.80      0.98      0.88      3990

avg / total       0.69      0.77      0.71      5293

Output the filled-in data


In [39]:
temp = person_data_fillup['groupbyAgegroup_mean'][quantitive_columns + dummy_columns + ['worst_DR', 'recent_DR']]
temp.describe(include='all')


Out[39]:
A1C BMI Glucose BP_Systolic BP_Diastolic MR_OD_SPH_Numeric MR_OD_CYL_Numeric MR_OS_SPH_Numeric MR_OS_CYL_Numeric MR_OS_DVA_ability ... family_G_P_NG family_G_Unknown Age_group_(18, 48] Age_group_(49, 55] Age_group_(56, 60] Age_group_(61, 66] Age_group_(67, 74] Age_group_(75, 114] worst_DR recent_DR
count 16039.000000 16039.000000 16039.000000 16039.000000 16039.000000 16039.000000 16039.000000 16039.000000 16039.000000 16039.000000 ... 16039.000000 16039.000000 16039.000000 16039.000000 16039.000000 16039.000000 16039.000000 16039.000000 16039 16039
unique NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN 5 5
top NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN no_DR no_DR
freq NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN 12009 12009
mean 7.208732 31.768571 131.811752 133.602606 78.109306 0.281769 -0.913903 0.277343 -0.899835 21.690886 ... 0.459567 0.395785 0.179687 0.173265 0.153688 0.169774 0.158613 0.164973 NaN NaN
std 0.698271 4.489466 25.716994 16.124745 9.867409 1.479827 0.385502 1.465896 0.379826 2.592544 ... 0.498378 0.489034 0.383939 0.378488 0.360661 0.375446 0.365327 0.371168 NaN NaN
min 3.900000 14.395000 40.000000 87.000000 48.000000 -4.500000 -2.250000 -4.500000 -2.250000 15.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 NaN NaN
25% 6.900000 28.900000 120.000000 123.000000 72.000000 -0.500000 -1.000000 -0.500000 -0.992681 20.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 NaN NaN
50% 7.203035 31.579036 131.670279 133.000000 78.082412 0.459453 -0.880944 0.500000 -0.873557 20.770814 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 NaN NaN
75% 7.367049 33.360000 136.876498 142.000000 84.000000 0.964796 -0.750000 0.935092 -0.750000 22.500000 ... 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 NaN NaN
max 10.600000 50.500000 222.000000 181.000000 107.000000 5.000000 0.500000 5.000000 0.500000 32.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 NaN NaN

11 rows × 54 columns


In [40]:
#temp.to_pickle('baseline_missingHandled_Dan_20170406.pickle')
temp.to_pickle('Morefeatures_missingHandled_Dan_20170415.pickle')

In [41]:
temp = person_data_new[quantitive_columns + dummy_columns + ['worst_DR', 'recent_DR']]
temp.describe(include='all')


Out[41]:
A1C BMI Glucose BP_Systolic BP_Diastolic MR_OD_SPH_Numeric MR_OD_CYL_Numeric MR_OS_SPH_Numeric MR_OS_CYL_Numeric MR_OS_DVA_ability ... family_G_P_NG family_G_Unknown Age_group_(18, 48] Age_group_(49, 55] Age_group_(56, 60] Age_group_(61, 66] Age_group_(67, 74] Age_group_(75, 114] worst_DR recent_DR
count 4708.000000 6472.000000 10178.000000 14358.000000 14374.000000 10644.000000 9256.000000 10572.000000 9096.000000 10459.000000 ... 16039.000000 16039.000000 16039.000000 16039.000000 16039.000000 16039.000000 16039.000000 16039.000000 16039 16039
unique NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN 5 5
top NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN no_DR no_DR
freq NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN 12009 12009
mean 7.235485 31.864823 131.677803 133.611368 78.109281 0.270407 -0.909780 0.266723 -0.897068 21.520357 ... 0.459567 0.395785 0.179687 0.173265 0.153688 0.169774 0.158613 0.164973 NaN NaN
std 1.241573 6.782216 32.121431 17.023775 10.384908 1.770672 0.504352 1.760668 0.502372 3.087112 ... 0.498378 0.489034 0.383939 0.378488 0.360661 0.375446 0.365327 0.371168 NaN NaN
min 3.900000 14.395000 40.000000 87.000000 48.000000 -4.500000 -2.250000 -4.500000 -2.250000 15.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 NaN NaN
25% 6.300000 26.880000 109.500000 122.000000 70.675000 -0.750000 -1.250000 -0.750000 -1.250000 20.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 NaN NaN
50% 7.000000 31.170000 127.000000 132.000000 78.000000 0.500000 -0.750000 0.500000 -0.750000 20.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 NaN NaN
75% 8.000000 36.112500 150.000000 144.000000 85.000000 1.500000 -0.500000 1.500000 -0.500000 20.000000 ... 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 NaN NaN
max 10.600000 50.500000 222.000000 181.000000 107.000000 5.000000 0.500000 5.000000 0.500000 32.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 NaN NaN

11 rows × 54 columns


In [42]:
#temp.to_pickle('baseline_raw_Dan_20170406.pickle')
temp.to_pickle('Morefeatures_raw_Dan_20170415.pickle')

In [ ]: