In [1]:
import reader
import numpy as np
import pandas as pd
import os
First, read in the original data.
In [2]:
import re
data = pd.read_pickle(os.getcwd() + '/data/all_encounter_data.pickle')
Repeat the processing applied to all_encounter_data in ICO.py.
In [3]:
d_enc = data.drop(["Enc_ID", "Person_ID"], axis=1)
# Some Glucose entries actually hold blood-pressure readings (e.g. "120/80"); swap them into BP
pattern0 = re.compile(r"\d+\s*/\s*\d+")
index1 = d_enc['Glucose'].str.contains(pattern0, na=False)
temp = d_enc.loc[index1, 'Glucose']
d_enc.loc[index1, 'Glucose'] = d_enc.loc[index1, 'BP']
d_enc.loc[index1, 'BP'] = temp
# Conversely, BP entries without a "/" are glucose readings; swap them back
not_null = d_enc.BP.notnull()
index2 = d_enc.BP[not_null][~d_enc.BP[not_null].str.contains('/')].index
temp = d_enc.loc[index2, 'Glucose']
d_enc.loc[index2, 'Glucose'] = d_enc.loc[index2, 'BP']
d_enc.loc[index2, 'BP'] = temp
# Split the BP field into Systolic and Diastolic readings
pattern1 = re.compile(r"(?P<BP_Systolic>\d+)\s*/\s*(?P<BP_Diastolic>\d+)")
d_enc = pd.merge(d_enc, d_enc["BP"].str.extract(pattern1, expand=True),
                 left_index=True, right_index=True).drop("BP", axis=1)
# Define ranges for reasonable values: treat data outside 1.5 times the IQR as outliers
NaN = float("NaN")
quantitive_columns = ['A1C', 'BMI', 'Glucose', 'BP_Diastolic', 'BP_Systolic']
In [4]:
for column in quantitive_columns:
    d_enc[column] = pd.to_numeric(d_enc[column], errors='coerce')
    temp = d_enc[column][d_enc[column].notnull()]
    Q3 = temp.quantile(0.75)
    Q1 = temp.quantile(0.25)
    IQR = Q3 - Q1
    kept = temp[(Q1 - 2 * IQR < temp) & (temp < Q3 + 2 * IQR)]
    print(kept.shape[0], temp.shape[0], d_enc.shape[0])
    print(column, Q1 - 2 * IQR, Q3 + 2 * IQR)
In [5]:
for column in quantitive_columns:
    d_enc[column] = pd.to_numeric(d_enc[column], errors='coerce')
    temp = d_enc[column][d_enc[column].notnull()]
    Q3 = temp.quantile(0.75)
    Q1 = temp.quantile(0.25)
    IQR = Q3 - Q1
    kept = temp[(Q1 - 1.5 * IQR < temp) & (temp < Q3 + 1.5 * IQR)]
    print(kept.shape[0], temp.shape[0], d_enc.shape[0])
    print(column, Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
With the standard outlier range, we lose at most 600 points per variable in all_encounter_data. (Widening the multiplier from 1.5 × IQR to 2 × IQR does not retain as many extra points as I expected, so I keep the standard multiplier of 1.5.)
After removing the outliers, the variables look much more normally distributed.
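For reference, a minimal sketch of the fencing rule used above, with the IQR multiplier as a parameter (the helper name iqr_bounds is mine, not from the notebook):

def iqr_bounds(series, k=1.5):
    # Tukey-style fences: values outside (Q1 - k*IQR, Q3 + k*IQR) count as outliers
    Q1, Q3 = series.quantile(0.25), series.quantile(0.75)
    IQR = Q3 - Q1
    return Q1 - k * IQR, Q3 + k * IQR

low, high = iqr_bounds(d_enc['BMI'].dropna())  # e.g. bounds for BMI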
In [6]:
import matplotlib.pyplot as plt
for column in quantitive_columns:
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
    temp0 = pd.to_numeric(d_enc[column], errors='coerce')
    ax1.hist(temp0[temp0.notnull()])
    ax1.set_title('before')
    temp = temp0[temp0.notnull()]
    Q3 = temp.quantile(0.75)
    Q1 = temp.quantile(0.25)
    IQR = Q3 - Q1
    # Replace values outside the 1.5 * IQR fences with NaN
    d_enc[column] = temp0.map(lambda x: x if Q1 - 1.5 * IQR < x < Q3 + 1.5 * IQR else NaN)
    ax2.hist(d_enc[column][d_enc[column].notnull()])
    ax2.set_title('after')
    f.suptitle(column)
    plt.show()
What if we collapse all_encounter_data into all_person_data by grouping on Person_Nbr?
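A minimal sketch of that collapse, assuming d_enc still carries the Person_Nbr key and that the quantitative fields are simply averaged per person (the actual aggregation lives in ICO.py, so taking the mean here is an assumption):

# Hypothetical collapse: one row per person, quantitative fields averaged over encounters
person_data = d_enc.groupby('Person_Nbr')[quantitive_columns].mean().reset_index()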
In [7]:
person_data_old = pd.read_pickle(os.getcwd() + '/data/all_person_data_Richard_20170307.pickle')
person_data_new = pd.read_pickle(os.getcwd() + '/data/all_person_data_Dan_20170406.pickle')
In [8]:
person_data_old[quantitive_columns].isnull().sum(axis=0)/person_data_old.shape[0]
Out[8]:
In [9]:
person_data_new[quantitive_columns].isnull().sum(axis=0)/person_data_new.shape[0]
Out[9]:
From the above we can see that identifying outliers removes no more than 4% of the points for each variable, which is acceptable to me.
In [10]:
plt.bar(range(0, 5),
        person_data_new[quantitive_columns].isnull().sum(axis=0)/person_data_new.shape[0])
plt.gca().set_ylim([0, 1])
plt.xticks(range(0, 5), quantitive_columns)
plt.ylabel('Missing value percentage')
plt.xlabel('Quantitative variables')
plt.show()
In [3]:
person_data_new = pd.read_pickle(os.getcwd() + '/data/all_person_data_Dan_20170415.pickle')
In [5]:
person_data_new.columns.values
Out[5]:
In [17]:
quantitive_columns = ["A1C", "BMI", "Glucose", "BP_Systolic", "BP_Diastolic",
                      'MR_OD_SPH_Numeric', 'MR_OD_CYL_Numeric',
                      'MR_OS_SPH_Numeric', 'MR_OS_CYL_Numeric',
                      'MR_OS_DVA_ability', 'MR_OD_DVA_ability',
                      'MR_OS_NVA_ability', 'MR_OD_NVA_ability']
In [9]:
dummy_columns = ['DM', 'ME', 'Glaucoma_Suspect', 'Open_angle_Glaucoma', 'Cataract']
categorical_columns = ['Gender', 'Race', 'recent_smoking_status', 'family_DM', 'family_G']
# One-hot encode each categorical column and track the new dummy column names
for column in categorical_columns:
    temp = pd.get_dummies(person_data_new[column], prefix=column)
    person_data_new[temp.columns.values] = temp
    dummy_columns.extend(temp.columns.values.tolist())
In [12]:
temp = person_data_new.copy()
# Mean A1C within each Gender x Age-bin group
mean_value = temp.groupby(['Gender', pd.cut(temp['Age'], 6)]).apply(
    lambda x: x['A1C'][x['A1C'].notnull()].mean())
# Rows with missing A1C, grouped the same way
missing_index = temp.groupby(['Gender', pd.cut(temp['Age'], 6)]).apply(
    lambda x: x['A1C'][x['A1C'].isnull()])
for i in mean_value.index.to_series().tolist():
    if i in missing_index.index:
        temp.loc[missing_index[i].index, 'A1C'] = mean_value[i]
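The same group-mean imputation can be written more compactly with groupby/transform; a sketch of the equivalent under the same Gender x Age-bin grouping:

temp = person_data_new.copy()
# Per-group mean, broadcast back to the original row index
group_mean = temp.groupby(['Gender', pd.cut(temp['Age'], 6)])['A1C'].transform('mean')
temp['A1C'] = temp['A1C'].fillna(group_mean)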
In [13]:
mean_value
Out[13]:
In [14]:
temp[temp['A1C'].isnull()].shape[0]
Out[14]:
In [10]:
age_group = np.array([person_data_new.Age.quantile(1.0/6*i) for i in range(1,7)])
In [11]:
age_group
Out[11]:
In [12]:
# Group index = number of quantile cut points strictly below the age, plus one
person_data_new['Age_group_numeric'] = person_data_new.Age.apply(lambda x: sum(age_group < x) + 1)
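Since age_group is sorted, the same bin index can be computed in one vectorized call; this is equivalent as long as no ages are missing (np.searchsorted with the default side='left' counts the cut points strictly below each age):

person_data_new['Age_group_numeric'] = np.searchsorted(age_group, person_data_new.Age.values) + 1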
In [13]:
age_group_dict = {1: '(18, 48]', 2: '(49, 55]', 3: '(56, 60]', 4: '(61, 66]', 5: '(67, 74]', 6: '(75, 114]'}
person_data_new['Age_group'] = person_data_new.Age_group_numeric.apply(lambda x: age_group_dict.get(x))
In [14]:
person_data_new.groupby('Age_group').apply(lambda x: x.shape[0])
Out[14]:
In [15]:
temp = pd.get_dummies(person_data_new['Age_group'], prefix = 'Age_group')
person_data_new[temp.columns.values] = temp
dummy_columns.extend(temp.columns.values.tolist())
In [18]:
person_data_new.groupby('Age_group').apply(lambda x: x[quantitive_columns].isnull().sum(axis=0)/x.shape[0])
Out[18]:
In [19]:
from scipy.stats import f_oneway
In [20]:
for column in quantitive_columns:
    temp = {k: list(v[column]) for k, v in person_data_new[person_data_new[column].notnull()].groupby('Age_group_numeric')}
    print(column)
    print(f_oneway(temp[1], temp[2], temp[3], temp[4], temp[5], temp[6]))
In [21]:
for column in quantitive_columns:
    temp = {k: list(v[column]) for k, v in person_data_new[person_data_new[column].notnull()].groupby('Gender')}
    print(column)
    print(f_oneway(temp['F'], temp['M']))
In [22]:
person_data_fillup = {}
In [23]:
temp = person_data_new.copy()
for column in quantitive_columns:
    mean_value = temp.groupby('Age_group').apply(
        lambda x: x[column][x[column].notnull()].mean())
    missing_index = temp.groupby('Age_group').apply(
        lambda x: x[column][x[column].isnull()])
    for i in mean_value.index.to_series().tolist():
        if i in missing_index.index:
            temp.loc[missing_index[i].index, column] = mean_value[i]
person_data_fillup['groupbyAgegroup_mean'] = temp
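As a quick sanity check on the filled frame (a column can still contain NaNs if an entire Age_group had no observed values to average):

person_data_fillup['groupbyAgegroup_mean'][quantitive_columns].isnull().sum()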
In [24]:
person_data_new.groupby('recent_DR').apply(lambda x: x.shape[0])
Out[24]:
In [25]:
person_data_new.groupby('recent_DR').apply(lambda x: x[quantitive_columns].isnull().sum(axis=0)/x.shape[0])
Out[25]:
In [26]:
person_data_new.groupby('worst_DR').apply(lambda x: x.shape[0])
Out[26]:
In [27]:
person_data_new.groupby('worst_DR').apply(lambda x: x[quantitive_columns].isnull().sum(axis=0)/x.shape[0])
Out[27]:
In [28]:
for column in quantitive_columns:
    temp = {k: list(v[column]) for k, v in person_data_new[person_data_new[column].notnull()].groupby('recent_DR')}
    print(column)
    print(f_oneway(temp['PDR'], temp['SNPDR'], temp['MNPDR'], temp['mNPDR'], temp['no_DR']))
In [29]:
DR_diagnoses = ['PDR', 'SNPDR', 'MNPDR', 'mNPDR', 'no_DR']
In [30]:
temp = person_data_new.copy()
for column in quantitive_columns:
    mean_value = temp.groupby('recent_DR').apply(lambda x: x[column][x[column].notnull()].mean())
    missing_index = temp.groupby('recent_DR').apply(lambda x: x[column][x[column].isnull()])
    for diagnosis in DR_diagnoses:
        temp.loc[missing_index[diagnosis].index, column] = mean_value[diagnosis]
person_data_fillup['recent_groupbyDR_mean'] = temp
In [31]:
temp = person_data_new.copy()
for column in quantitive_columns:
    mean_value = temp.groupby('worst_DR').apply(lambda x: x[column][x[column].notnull()].mean())
    missing_index = temp.groupby('worst_DR').apply(lambda x: x[column][x[column].isnull()])
    for diagnosis in DR_diagnoses:
        temp.loc[missing_index[diagnosis].index, column] = mean_value[diagnosis]
person_data_fillup['worst_groupbyDR_mean'] = temp
In [32]:
dummy_columns
Out[32]:
In [33]:
quantitive_columns
Out[33]:
In [34]:
target_columns = {'recent_groupbyDR_mean': 'recent_DR',
                  'worst_groupbyDR_mean': 'worst_DR',
                  'groupbyAgegroup_mean': 'recent_DR'}
In [35]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
In [36]:
for method, temp in person_data_fillup.items():
    print(method)
    X = temp[quantitive_columns + dummy_columns]
    y = temp[target_columns[method]]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    # preds = label_encoder.inverse_transform(preds.tolist())
    # y_test = label_encoder.inverse_transform(y_test)
    print(pd.crosstab(y_test, preds))
    print(metrics.classification_report(y_true=y_test, y_pred=preds))
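For reference, confusion_matrix was imported above but never used; the sklearn equivalent of the crosstab, with the row/column order pinned by an explicit label list, would be:

print(confusion_matrix(y_test, preds, labels=DR_diagnoses))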
In [41]:
tree.export_graphviz(clf, feature_names=quantitive_columns + dummy_columns,
                     class_names=['MNPDR', 'PDR', 'SNPDR', 'mNPDR', 'no_DR'], out_file='DT.dot')
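To view the exported tree, DT.dot can be rendered to an image; a sketch assuming the python-graphviz package is installed (it invokes the dot command under the hood):

import graphviz
with open('DT.dot') as f:
    graphviz.Source(f.read()).render('DT', format='png')  # writes DT.png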
In [37]:
from sklearn.linear_model import LogisticRegression
In [38]:
for method, temp in person_data_fillup.items():
    print(method)
    X = temp[quantitive_columns + dummy_columns]
    y = temp[target_columns[method]]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    clf = LogisticRegression()
    clf = clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    # preds = label_encoder.inverse_transform(preds.tolist())
    # y_test = label_encoder.inverse_transform(y_test)
    print(pd.crosstab(y_test, preds))
    print(metrics.classification_report(y_true=y_test, y_pred=preds))
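Since LogisticRegression applies L2 regularization by default, unscaled features can dominate the penalty; a sketch of a standardized variant (my suggestion, not part of the original run):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

clf = make_pipeline(StandardScaler(), LogisticRegression())
clf.fit(X_train, y_train)
print(metrics.classification_report(y_test, clf.predict(X_test)))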
In [39]:
temp = person_data_fillup['groupbyAgegroup_mean'][quantitive_columns + dummy_columns + ['worst_DR', 'recent_DR']]
temp.describe(include='all')
Out[39]:
In [40]:
#temp.to_pickle('baseline_missingHandled_Dan_20170406.pickle')
temp.to_pickle('Morefeatures_missingHandled_Dan_20170415.pickle')
In [41]:
temp = person_data_new[quantitive_columns + dummy_columns + ['worst_DR', 'recent_DR']]
temp.describe(include='all')
Out[41]:
In [42]:
#temp.to_pickle('baseline_raw_Dan_20170406.pickle')
temp.to_pickle('Morefeatures_raw_Dan_20170415.pickle')