In [1]:
import pandas as pd
from pprint import pprint
import json
import numpy as np
In [2]:
path = 'E:\\anil\\IIT Sop\\Term02\\MATH497\\ICO_data\\original_pickle\\'
In [3]:
# ICD_list table must be re-built from, presumably, ICD_for_Enc due to some
# entries being pre-18th birthday. ICD_list entries are not timestamped!
# All source tables, split by whether they key on person or on encounter.
table_names = [
    'all_encounter_data',
    'demographics',
    'encounters',
    'family_hist_for_Enc',
    'family_hist_list',
    'ICD_for_Enc',
    'ICD_list',
    'macula_findings_for_Enc',
    'SL_Lens_for_Enc',
    'SNOMED_problem_list',
    'systemic_disease_for_Enc',
    'systemic_disease_list',
]
# tables keyed by person number only
person_data = [
    'demographics',
    'family_hist_list',
    'systemic_disease_list',
    'SNOMED_problem_list',
]
# tables keyed by encounter number
encounter_data = [
    'all_encounter_data',
    'encounters',
    'family_hist_for_Enc',
    'ICD_for_Enc',
    'macula_findings_for_Enc',
    'SL_Lens_for_Enc',
    'systemic_disease_for_Enc',
]
In [4]:
# Read every table into a dataframe; ICD_list is deliberately skipped (it has
# no timestamps and must be rebuilt from ICD_for_Enc), so its slot stays None.
dfs = []
for name in table_names:
    if name == 'ICD_list':
        dfs.append(None)
    else:
        dfs.append(pd.read_pickle(path + name + '.pickle'))
# Column names arrive as BOM-prefixed bytes; decode them to avoid unicode
# decode errors downstream.
for df in dfs:
    if df is None:
        continue
    df.columns = [col.decode("utf-8-sig") for col in df.columns]
In [5]:
# Aggregate encounter numbers under each person number, scanning every table
# that carries both identifiers.
encounter_key = 'Enc_Nbr'
person_key = 'Person_Nbr'
encounters_by_person = {}
for df in dfs:
    if df is None:
        continue
    available_columns = set(df.columns.values)
    if encounter_key not in available_columns or person_key not in available_columns:
        continue
    # pair up the two key columns row by row
    for person_nbr, encounter_nbr in zip(df[person_key], df[encounter_key]):
        encounters_by_person.setdefault(person_nbr, set()).add(encounter_nbr)
In [7]:
# Invert the person -> encounters mapping and sanity-check that every
# encounter number belongs to exactly one person.
encounter_nbr_to_person_nbr = {}
for person_nbr, encounter_nbrs in encounters_by_person.items():  # iteritems() was removed in Python 3
    for encounter_nbr in encounter_nbrs:
        encounter_nbr_to_person_nbr.setdefault(encounter_nbr, set()).add(person_nbr)
for encounter_nbr, person_nbrs in encounter_nbr_to_person_nbr.items():
    if len(person_nbrs) > 1:
        # print() with a single argument behaves identically on Python 2 and 3
        print('Encounter {} mapped to multiple persons {}'.format(encounter_nbr, str(list(person_nbrs))))
In [9]:
person_key = 'Person_Nbr'
encounter_key = 'Enc_Nbr'
# Collect person-level data: tables keyed by Person_Nbr that carry no Enc_Nbr
# column. Each table is tagged 'single' (Person_Nbr is unique -> one dict per
# person) or 'list' (several rows per person -> list of dicts).
data_by_persons = {}
data_by_persons_type = {}
for df_index, df in enumerate(dfs):
    df_name = table_names[df_index]
    data_by_persons[df_name] = {}
    if df is not None:
        df_columns = set(df.columns.values)
        if person_key in df_columns and encounter_key not in df_columns:
            print('Processing dataframe: {}'.format(df_name))
            # check if person number is the primary key in the table
            if len(df) == len(df[person_key].unique()):
                data_by_persons_type[df_name] = 'single'
                for row_index, dfrow in df.iterrows():
                    rowdict = dict(dfrow)
                    for k, v in rowdict.items():  # iteritems() was removed in Python 3
                        # Timestamps are not JSON serializable; keep ordinal day numbers
                        if isinstance(v, pd.Timestamp):  # pd.tslib was removed in pandas 0.25
                            rowdict[k] = v.toordinal()
                    person_nbr = rowdict[person_key]
                    data_by_persons[df_name][person_nbr] = rowdict
            else:
                data_by_persons_type[df_name] = 'list'
                for row_index, dfrow in df.iterrows():
                    rowdict = dict(dfrow)
                    for k, v in rowdict.items():
                        if isinstance(v, pd.Timestamp):
                            rowdict[k] = v.toordinal()
                    person_nbr = rowdict[person_key]
                    data_by_persons[df_name].setdefault(person_nbr, []).append(rowdict)
In [10]:
encounter_key = 'Enc_Nbr'
# Collect encounter-level data: tables that carry an Enc_Nbr column. Each
# table is tagged 'single' (Enc_Nbr is unique -> one dict per encounter) or
# 'list' (several rows per encounter -> list of dicts).
data_by_encounters = {}
data_by_encounters_type = {}
for df_index, df in enumerate(dfs):
    df_name = table_names[df_index]
    data_by_encounters[df_name] = {}
    if df is not None:
        df_columns = set(df.columns.values)
        if encounter_key in df_columns:
            print('Processing dataframe: {}'.format(df_name))
            # check if encounter number is the primary key in the table
            if len(df) == len(df[encounter_key].unique()):
                data_by_encounters_type[df_name] = 'single'
                for row_index, dfrow in df.iterrows():
                    rowdict = dict(dfrow)
                    for k, v in rowdict.items():  # iteritems() was removed in Python 3
                        # Timestamps are not JSON serializable; keep ordinal day numbers
                        if isinstance(v, pd.Timestamp):  # pd.tslib was removed in pandas 0.25
                            rowdict[k] = v.toordinal()
                    encounter_nbr = rowdict[encounter_key]
                    data_by_encounters[df_name][encounter_nbr] = rowdict
            else:
                data_by_encounters_type[df_name] = 'list'
                for row_index, dfrow in df.iterrows():
                    rowdict = dict(dfrow)
                    for k, v in rowdict.items():
                        if isinstance(v, pd.Timestamp):
                            rowdict[k] = v.toordinal()
                    encounter_nbr = rowdict[encounter_key]
                    data_by_encounters[df_name].setdefault(encounter_nbr, []).append(rowdict)
In [11]:
# Assemble one nested JSON-style object per person: 'single' person tables
# merged into the top level, 'list' person tables stored under the table
# name (plus a *_count field), and an 'encounters' list with the same layout
# applied per encounter.
all_persons = []
for person_nbr in encounters_by_person:
    person_object = {person_key: person_nbr, 'encounters': []}
    # insert all person related info in person_object
    for df_name in data_by_persons_type:
        if data_by_persons_type[df_name] == "single":
            if person_nbr in data_by_persons[df_name]:
                person_single_data_row = data_by_persons[df_name][person_nbr]
                for key, value in person_single_data_row.items():  # iteritems() was removed in Python 3
                    person_object[key] = value
        else:
            if person_nbr in data_by_persons[df_name]:
                person_multiple_data_row = data_by_persons[df_name][person_nbr]
                person_object[df_name] = person_multiple_data_row
                person_object[df_name + "_count"] = len(person_multiple_data_row)
    # insert all encounter related information as encounter objects
    for enc_nbr in encounters_by_person[person_nbr]:
        encounter_object = {encounter_key: enc_nbr}
        for df_name in data_by_encounters_type:
            if data_by_encounters_type[df_name] == "single":
                if enc_nbr in data_by_encounters[df_name]:
                    encounter_single_data_row = data_by_encounters[df_name][enc_nbr]
                    for key, value in encounter_single_data_row.items():
                        encounter_object[key] = value
            else:
                if enc_nbr in data_by_encounters[df_name]:
                    encounter_multiple_data_row = data_by_encounters[df_name][enc_nbr]
                    encounter_object[df_name] = encounter_multiple_data_row
                    encounter_object[df_name + "_count"] = len(encounter_multiple_data_row)
        person_object['encounters'].append(encounter_object)
    all_persons.append(person_object)
In [12]:
# Persist the aggregated person objects before duplicate-field cleanup.
# NOTE(review): values may still be numpy scalars (e.g. int64), which
# json.dump cannot serialize -- presumably fine for this data, but confirm.
with open('20170226_ICO_data_combined_before_duplicate_fields_drop.json', 'w') as fh:
    json.dump(all_persons, fh)
In [13]:
# Drop columns from the nested list records that duplicate information kept
# at the parent (person / encounter) level.
columns_to_drop = ['Enc_ID', 'Enc_Nbr', 'Enc_Date', 'Person_ID', 'Person_Nbr',
                   'Date_Created', 'Enc_Timestamp']
for person_object in all_persons:
    # person-level list tables
    for df_name in data_by_persons_type:
        if data_by_persons_type[df_name] != "single":
            if df_name in person_object and df_name != 'encounters':
                for record in person_object[df_name]:
                    for column_to_drop in columns_to_drop:
                        # pop(..., None) is a no-op when the key is absent,
                        # replacing the previous bare try/except: pass
                        record.pop(column_to_drop, None)
    # encounter-level list tables
    for enc_obj in person_object['encounters']:
        for df_name in data_by_encounters_type:
            if data_by_encounters_type[df_name] != "single":
                if df_name in enc_obj:
                    for record in enc_obj[df_name]:
                        for column_to_drop in columns_to_drop:
                            record.pop(column_to_drop, None)
In [14]:
# Drop records whose every field is null, for list-valued properties.
for person_object in all_persons:
    for df_name in data_by_persons_type:
        if data_by_persons_type[df_name] != "single":
            if df_name in person_object and df_name != 'encounters':
                records = person_object[df_name]
                # iterate in reverse so deletions don't shift pending indices
                for rp_index in reversed(range(len(records))):
                    # list(...) guards against Python 3 dict views: pd.isnull
                    # on a dict_values view yields a 0-d object array instead
                    # of an element-wise result (harmless copy under Python 2)
                    if all(pd.isnull(list(records[rp_index].values()))):
                        del records[rp_index]
    for enc_obj in person_object['encounters']:
        for df_name in data_by_encounters_type:
            if data_by_encounters_type[df_name] != "single":
                if df_name in enc_obj:
                    records = enc_obj[df_name]
                    for rp_index in reversed(range(len(records))):
                        if all(pd.isnull(list(records[rp_index].values()))):
                            del records[rp_index]
In [15]:
# Refresh the *_count fields so they reflect the lists after the
# all-null-record cleanup above.
for person_object in all_persons:
    for df_name in data_by_persons_type:
        if data_by_persons_type[df_name] == "single":
            continue
        if df_name != 'encounters' and df_name in person_object:
            person_object[df_name + "_count"] = len(person_object[df_name])
    for enc_obj in person_object['encounters']:
        for df_name in data_by_encounters_type:
            if data_by_encounters_type[df_name] == "single":
                continue
            if df_name in enc_obj:
                enc_obj[df_name + "_count"] = len(enc_obj[df_name])
In [16]:
# Persist the cleaned person objects (duplicate fields and all-null records removed).
with open('20170226_ICO_data_after_duplicate_fields_drop.json', 'w') as fh:
    json.dump(all_persons, fh)
In [21]:
# Build the combined per-person dataframe, indexed by person number.
combined_data_df = (
    pd.DataFrame.from_records(all_persons)
    .set_index(['Person_Nbr'])
)
In [23]:
# TODO Similar functions for processing nested/list data have to be written
# add std race column

# Conversion table hoisted to module level: the original rebuilt this dict on
# every call, i.e. once per dataframe row under .apply().
# (A float('nan') key was dropped: NaN != NaN, so it could never be matched by
# a dict lookup; missing values fall through to the 'Unknown' default.)
_STANDARD_RACE_MAP = {
    'African American': 'Black or African American',
    'Black or African American': 'Black or African American',
    'Black/African American (Not Hispanic)': 'Black or African American',
    'American Indian or Alaska Native': 'American Indian or Alaska Native',
    'American Indian/Alaskan Native': 'American Indian or Alaska Native',
    'American Indian': 'American Indian or Alaska Native',
    'Native American Indian': 'American Indian or Alaska Native',
    'Alaskan Native': 'American Indian or Alaska Native',
    'Asian': 'Asian',
    'Chinese': 'Asian',
    'Indian': 'Asian',
    'Caucasian': 'White',
    'White (Not Hispanic / Latino)': 'White, not Hispanic or Latino',
    'White': 'White',
    'Declined to specify': 'Unknown',
    'Unknown/Not Reported': 'Unknown',
    'Greek': 'White',
    'Native Hawaiian or Other Pacific Islander': 'Native Hawaiian and Other Pacific Islander',
    'Hawaiian': 'Native Hawaiian and Other Pacific Islander',
    'Other Pacific Islander (Not Hawaiian)': 'Native Hawaiian and Other Pacific Islander',
    'Hispanic Or Latino (All Races)': 'Hispanic or Latino',
    'Hispanic': 'Hispanic or Latino',
    'More than one race': 'Two or More Races',
    'Multiracial': 'Two or More Races',
    'Multi-racial': 'Two or More Races',
    'Moroccan': 'White',
    'Other Race': 'Other Race',
    'Other Race (Jamaican)': 'Other Race',
}


def standardize_race(race):
    """Map a raw free-text race entry to a standardized category.

    Unrecognized entries and missing values (NaN) map to 'Unknown'.
    """
    return _STANDARD_RACE_MAP.get(race, 'Unknown')
combined_data_df['Race_Std'] = combined_data_df['Race'].apply(standardize_race)
In [26]:
combined_data_df.head()
Out[26]:
In [29]:
# Access all encounters for a given Person_Nbr (109227 is presumably a sample
# person number from this dataset -- verify it exists before re-running).
# Each encounter object nests encounter-level info such as all_encounter_data,
# systemic_disease_for_Enc and family_hist_for_Enc.
combined_data_df.loc[109227, 'encounters']
Out[29]:
In [25]:
combined_data_df.to_pickle(path+'20170226_ICO_Data_combined.pickle')