In [1]:
import pandas as pd
from pprint import pprint
import json
import numpy as np

In [2]:
path = 'E:\\anil\\IIT Sop\\Term02\\MATH497\\ICO_data\\original_pickle\\'

In [3]:
# The ICD_list table must be re-built (presumably from ICD_for_Enc) because some of its
# entries predate the patient's 18th birthday, and ICD_list entries are not timestamped!
table_names = ['all_encounter_data', 'demographics', 'encounters', 'family_hist_for_Enc',
               'family_hist_list', 'ICD_for_Enc', 'ICD_list', 'macula_findings_for_Enc',
               'SL_Lens_for_Enc', 'SNOMED_problem_list', 'systemic_disease_for_Enc', 'systemic_disease_list']

person_data = ['demographics','family_hist_list', 'systemic_disease_list', 'SNOMED_problem_list']

encounter_data = ['all_encounter_data', 'encounters', 'family_hist_for_Enc', 'ICD_for_Enc', 'macula_findings_for_Enc',
                   'SL_Lens_for_Enc', 'systemic_disease_for_Enc']

In [4]:
# read tables into dataframes
dfs = [ pd.read_pickle(path + name + '.pickle') if name != 'ICD_list' else None
        for name in table_names ]

# decode byte-string column names in all dataframes (utf-8-sig also strips a leading BOM)
for df in dfs:
    if df is not None:
        df.columns = [col.decode("utf-8-sig") for col in df.columns]
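
The utf-8-sig codec strips the UTF-8 byte-order mark (BOM) that Windows exports often prepend to the first column name; a plain utf-8 decode would leave a u'\ufeff' prefix behind. A minimal sketch of the difference, using a hypothetical BOM-prefixed name:

In [ ]:
raw = '\xef\xbb\xbfPerson_Nbr'       # hypothetical byte string with a UTF-8 BOM
print repr(raw.decode('utf-8'))      # u'\ufeffPerson_Nbr' -- BOM survives
print repr(raw.decode('utf-8-sig'))  # u'Person_Nbr'       -- BOM stripped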

Grouping all encounter nbrs under respective person nbr


In [5]:
# aggregate encounter numbers under each person number, using every table that has both keys
encounter_key = 'Enc_Nbr'
person_key = 'Person_Nbr'
encounters_by_person = {}
for df in dfs:
    if df is not None:
        df_columns = set(df.columns.values)
        if encounter_key in df_columns and person_key in df_columns:
            for row_index, dfrow in df.iterrows():
                rowdict = dict(dfrow)
                person_nbr = rowdict[person_key]
                encounter_nbr = rowdict[encounter_key]
                encounters_by_person.setdefault(person_nbr, set()).add(encounter_nbr)
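
A quick sanity check on the aggregation (a sketch; the counts depend on the data):

In [ ]:
# number of distinct persons, and total distinct encounters across them
print 'persons: {}'.format(len(encounters_by_person))
print 'encounters: {}'.format(sum(len(e) for e in encounters_by_person.itervalues()))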

In [7]:
encounter_nbr_to_person_nbr = {}
for person_nbr, encounter_nbrs in encounters_by_person.iteritems():
    for encounter_nbr in encounter_nbrs:
        encounter_nbr_to_person_nbr.setdefault(encounter_nbr, set()).add(person_nbr)

for encounter_nbr, person_nbrs in encounter_nbr_to_person_nbr.iteritems():
    if len(person_nbrs)>1:
        print 'Encounter {} mapped to multiple persons {}'.format(encounter_nbr, str(list(person_nbrs)))
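
No output above means every encounter maps to exactly one person. The same invariant can be enforced with an assertion (a sketch):

In [ ]:
# fail fast if any encounter number is shared between persons
assert all(len(p) == 1 for p in encounter_nbr_to_person_nbr.itervalues())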

In [9]:
person_key = 'Person_Nbr'
encounter_key = 'Enc_Nbr'
# columns_to_ignore = [u'Person_ID', u'Person_Nbr', u'Enc_ID', u'Enc_Nbr', u'Enc_Date']
data_by_persons = {}
data_by_persons_type = {}
for df_index, df in enumerate(dfs):
    df_name = table_names[df_index]
    data_by_persons[df_name] = {}
    if df is not None:
        df_columns = set(df.columns.values)
        if person_key in df_columns and encounter_key not in df_columns:
            print 'Processing dataframe: {}'.format(df_name)
            # check whether Person_Nbr is the primary key of the table
            if len(df) == len(df[person_key].unique()):
                data_by_persons_type[df_name] = 'single'
                for row_index, dfrow in df.iterrows():
                    rowdict = dict(dfrow)
                    
                    for k, v in rowdict.iteritems():
                        if isinstance(v, pd.tslib.Timestamp):
                            rowdict[k] = v.toordinal()
                    
                    person_nbr = rowdict[person_key]
                    data_by_persons[df_name][person_nbr] = rowdict
            else:
                data_by_persons_type[df_name] = 'list'
                for row_index, dfrow in df.iterrows():
                    rowdict = dict(dfrow)
                    for k, v in rowdict.iteritems():
                        if isinstance(v, pd.tslib.Timestamp):
                            rowdict[k] = v.toordinal()
                    person_nbr = rowdict[person_key]
                    data_by_persons[df_name].setdefault(person_nbr, []).append(rowdict)


Processing dataframe: demographics
Processing dataframe: family_hist_list
Processing dataframe: SNOMED_problem_list
Processing dataframe: systemic_disease_list

In [10]:
encounter_key = 'Enc_Nbr'
# columns_to_ignore = [u'Person_ID', u'Person_Nbr', u'Enc_ID', u'Enc_Nbr', u'Enc_Date']
data_by_encounters = {}
data_by_encounters_type = {}
for df_index, df in enumerate(dfs):
    df_name = table_names[df_index]
    
    data_by_encounters[df_name] = {}
    if df is not None:
        df_columns = set(df.columns.values)
        if encounter_key in df_columns:
            print 'Processing dataframe: {}'.format(df_name)
            # check whether Enc_Nbr is the primary key of the table
            if len(df) == len(df[encounter_key].unique()):
                data_by_encounters_type[df_name] = 'single'
                for row_index, dfrow in df.iterrows():
                    rowdict = dict(dfrow)
                    
                    for k, v in rowdict.iteritems():
                        if isinstance(v, pd.tslib.Timestamp):
                            rowdict[k] = v.toordinal()
                    
                    encounter_nbr = rowdict[encounter_key]
                    data_by_encounters[df_name][encounter_nbr] = rowdict
            else:
                data_by_encounters_type[df_name] = 'list'
                for row_index, dfrow in df.iterrows():
                    rowdict = dict(dfrow)
                    for k, v in rowdict.iteritems():
                        if isinstance(v, pd.tslib.Timestamp):
                            rowdict[k] = v.toordinal()
                    encounter_nbr = rowdict[encounter_key]
                    data_by_encounters[df_name].setdefault(encounter_nbr, []).append(rowdict)


Processing dataframe: all_encounter_data
Processing dataframe: encounters
Processing dataframe: family_hist_for_Enc
Processing dataframe: ICD_for_Enc
Processing dataframe: macula_findings_for_Enc
Processing dataframe: SL_Lens_for_Enc
Processing dataframe: systemic_disease_for_Enc
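
Each table is now classified as 'single' (one row per key) or 'list' (repeated rows per key). Both type maps can be inspected with the pprint import from the first cell (a sketch):

In [ ]:
pprint(data_by_persons_type)
pprint(data_by_encounters_type)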

Aggregating encounter entities under their respective person entity


In [11]:
all_persons = []
for person_nbr in encounters_by_person:
    person_object = {person_key:person_nbr, 'encounters':[]}
    
    # insert all person-related info into person_object
    for df_name in data_by_persons_type:
        if data_by_persons_type[df_name] =="single":
            if person_nbr in data_by_persons[df_name]:
                person_single_data_row = data_by_persons[df_name][person_nbr]
                for key, value in person_single_data_row.iteritems():
                    person_object[key] = value
        else:
            if person_nbr in data_by_persons[df_name]:
                person_multiple_data_row = data_by_persons[df_name][person_nbr]
                person_object[df_name] = person_multiple_data_row 
                person_object[df_name+"_count"] = len(person_multiple_data_row)
                
                
    # insert all encounter-related information as an encounter object into the person object
    for enc_nbr in encounters_by_person[person_nbr]:
        encounter_object = {encounter_key: enc_nbr}
        for df_name in data_by_encounters_type:
            if data_by_encounters_type[df_name] =="single":
                if enc_nbr in data_by_encounters[df_name]:
                    encounter_single_data_row = data_by_encounters[df_name][enc_nbr]
                    for key, value in encounter_single_data_row.iteritems():
                        encounter_object[key] = value
            else:
                if enc_nbr in data_by_encounters[df_name]:
                    encounter_multiple_data_row = data_by_encounters[df_name][enc_nbr]
                    encounter_object[df_name] = encounter_multiple_data_row
                    encounter_object[df_name+"_count"] = len(encounter_multiple_data_row)

                    
        person_object['encounters'].append(encounter_object)

    all_persons.append(person_object)
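
One assembled person object can be inspected with pprint (a sketch; the output may be large, so it is omitted here):

In [ ]:
pprint(all_persons[0])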

Dropping duplicated columns, then fully-NA rows, across tables


In [12]:
with open('20170226_ICO_data_combined_before_duplicate_fields_drop.json', 'w') as fh:
    json.dump(all_persons, fh)
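
One caveat (an aside, not from the original analysis): json.dump serializes float NaN as the bare token NaN, which strict JSON parsers reject; allow_nan=False makes the failure explicit instead.

In [ ]:
print json.dumps(float('nan'))                 # emits 'NaN' -- not strict JSON
try:
    json.dumps(float('nan'), allow_nan=False)  # strict mode raises instead
except ValueError as e:
    print e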

In [13]:
# drop columns repeated in nested fields (they duplicate keys already on the parent object)

columns_to_drop = ['Enc_ID', 'Enc_Nbr', 'Enc_Date', 'Person_ID', 'Person_Nbr', 'Date_Created', 'Enc_Timestamp']

for person_index in range(len(all_persons)):
    
    for df_name in data_by_persons_type:
        if data_by_persons_type[df_name] != "single":
            if df_name in all_persons[person_index] and df_name!='encounters':
                for rp_index in range(len(all_persons[person_index][df_name])):
                    for column_to_drop in columns_to_drop:
                        try:
                            del all_persons[person_index][df_name][rp_index][column_to_drop]
                        except KeyError:
                            pass

    for enc_obj_index in range(len(all_persons[person_index]['encounters'])):
    
        enc_obj = all_persons[person_index]['encounters'][enc_obj_index]
    
        for df_name in data_by_encounters_type:
            if data_by_encounters_type[df_name] != "single":
                if df_name in enc_obj:
                    for rp_index in range(len(enc_obj[df_name])):
                        for column_to_drop in columns_to_drop:
                            try:
                                del enc_obj[df_name][rp_index][column_to_drop]
                            except KeyError:
                                pass

        all_persons[person_index]['encounters'][enc_obj_index] = enc_obj

In [14]:
# drop row objects whose fields are all NA from list properties
for person_index in range(len(all_persons)):
    for df_name in data_by_persons_type:
        if data_by_persons_type[df_name] != "single":
            if df_name in all_persons[person_index] and df_name!='encounters':
                for rp_index in reversed(range(len(all_persons[person_index][df_name]))):
                    if all(pd.isnull(all_persons[person_index][df_name][rp_index].values())):
                        del all_persons[person_index][df_name][rp_index]
                
    for enc_obj_index in range(len(all_persons[person_index]['encounters'])):
        enc_obj = all_persons[person_index]['encounters'][enc_obj_index]
        for df_name in data_by_encounters_type:
            if data_by_encounters_type[df_name] != "single":
                if df_name in enc_obj:
                    for rp_index in reversed(range(len(enc_obj[df_name]))):
                        if all(pd.isnull(enc_obj[df_name][rp_index].values())):
                            del enc_obj[df_name][rp_index]
        
        all_persons[person_index]['encounters'][enc_obj_index] = enc_obj
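
The drop condition works because pd.isnull over a list of values is element-wise, so all(...) is True only when every field of the row object is missing (a small sketch):

In [ ]:
print all(pd.isnull([np.nan, None]))     # True  -> the row object would be dropped
print all(pd.isnull([np.nan, 'value']))  # False -> the row object survives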

In [15]:
# update counts for list properties
for person_index in range(len(all_persons)):
    for df_name in data_by_persons_type:
        if data_by_persons_type[df_name] != "single":
            if df_name in all_persons[person_index] and df_name!='encounters':
                all_persons[person_index][df_name+"_count"] = len(all_persons[person_index][df_name])

    for enc_obj_index in range(len(all_persons[person_index]['encounters'])):
        enc_obj = all_persons[person_index]['encounters'][enc_obj_index]
        for df_name in data_by_encounters_type:
            if data_by_encounters_type[df_name] != "single":
                if df_name in enc_obj:
                    enc_obj[df_name+"_count"] = len(enc_obj[df_name])
        all_persons[person_index]['encounters'][enc_obj_index] = enc_obj

In [16]:
with open('20170226_ICO_data_after_duplicate_fields_drop.json', 'w') as fh:
    json.dump(all_persons, fh)

In [21]:
# create a dataframe from the aggregated data
combined_data_df = pd.DataFrame.from_records(all_persons)
combined_data_df.set_index(['Person_Nbr'], inplace=True)

In [23]:
# TODO: similar functions need to be written for processing nested/list data
# add std race column
def standardize_race(race):
    standard_race_conversion_dict = {'African American':'Black or African American',
     'Black or African American':'Black or African American',
     'Black/African American (Not Hispanic)':'Black or African American',
     'American Indian or Alaska Native':'American Indian or Alaska Native',
     'American Indian/Alaskan Native':'American Indian or Alaska Native',
     'American Indian':'American Indian or Alaska Native',
     'Native American Indian':'American Indian or Alaska Native',
     'Alaskan Native':'American Indian or Alaska Native',
     'Asian':'Asian','Chinese':'Asian','Indian':'Asian','Caucasian':'White',
     'White (Not Hispanic / Latino)':'White, not Hispanic or Latino',
     'White':'White','Declined to specify':'Unknown',
     'Unknown/Not Reported':'Unknown','Greek':'White',
     'Native Hawaiian or Other Pacific Islander':'Native Hawaiian and Other Pacific Islander',
     'Hawaiian':'Native Hawaiian and Other Pacific Islander',
     'Other Pacific Islander (Not Hawaiian)':'Native Hawaiian and Other Pacific Islander',
     'Hispanic Or Latino (All Races)':'Hispanic or Latino','Hispanic':'Hispanic or Latino',
     'More than one race':'Two or More Races','Multiracial':'Two or More Races',
     'Multi-racial':'Two or More Races','Moroccan':'White',
     'Other Race':'Other Race','Other Race (Jamaican)':'Other Race'
    }
    
    # NaN cannot serve as a dict key for lookup (NaN != NaN), so handle it explicitly;
    # any unmapped value also falls through to 'Unknown'
    if pd.isnull(race):
        return 'Unknown'
    return standard_race_conversion_dict.get(race, 'Unknown')
    
combined_data_df['Race_Std'] = combined_data_df['Race'].apply(standardize_race)
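
A quick check of the resulting distribution (a sketch; the counts depend on the data):

In [ ]:
# distribution of the standardized race values, including any NaN left over
print combined_data_df['Race_Std'].value_counts(dropna=False)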

In [26]:
combined_data_df.head()


Out[26]:
Age_Censored DOB Ethnicity Gender Person_ID Race SNOMED_problem_list SNOMED_problem_list_count Zip encounters family_hist_list family_hist_list_count systemic_disease_list systemic_disease_list_count Race_Std
Person_Nbr
109227 None 714287 Not Hispanic or Latino F E6121EF6-660A-4A34-B4B4-ECB513E3C916 Black/African American (Not Hispanic) [{u'Concept_ID': 38101003, u'Description': u'H... 3.0 60419 [{u'OD_Macula2_Modifer': nan, u'OS_SL_LENS': n... [{u'Code': u'416855002', u'Family_History': u'... 6.0 [{u'Systemic_Diseases': u'Diabetes type 2', u'... 5.0 Black or African American
622597 None 716389 Declined to specify F E50C7786-EA20-41F9-8308-C5590839B518 Declined to specify [{u'Concept_ID': 46152009, u'Description': u'T... 4.0 604261306 [{u'OD_Macula2_Modifer': nan, u'OS_SL_LENS': u... [{u'Code': u'416855002', u'Family_History': u'... 5.0 [{u'Systemic_Diseases': u'Diabetes', u'Snomed_... 7.0 Unknown
196620 None 717417 Not Hispanic or Latino M 8BEEFFD7-7B53-4BA1-ABC2-9B244E81ADA1 Black or African American [{u'Concept_ID': 41256004, u'Description': u'P... 4.0 60609 [{u'OD_Macula2_Modifer': nan, u'OS_SL_LENS': u... [{u'Code': u'416855002', u'Family_History': u'... 8.0 [{u'Systemic_Diseases': u'Diabetes Type II', u... 8.0 Black or African American
884751 None 714436 Unknown Ethnicity M 9CD7E60C-2E0A-4E25-84C0-2440F8C83FC3 White [{u'Concept_ID': 59276001, u'Description': u'P... 4.0 60608 [{u'OD_Macula2_Modifer': nan, u'OS_SL_LENS': u... [{u'Code': u'439724007', u'Family_History': u'... 2.0 [{u'Systemic_Diseases': u'Diabetic retinopathy... 8.0 White
819216 None 727602 Hispanic or Latino F 52C478F4-B3F2-457C-8A72-E18458BCCB96 Hispanic [{u'Concept_ID': 68478007, u'Description': u'C... 1.0 60647 [{u'OD_Macula2_Modifer': nan, u'OS_SL_LENS': u... [{u'Code': u'NULL', u'Family_History': u'Diabe... 2.0 [{u'Systemic_Diseases': u'Diabetes mellitus ty... 2.0 Hispanic or Latino

In [29]:
# access the encounters list for a given Person_Nbr
# each encounter object carries nested encounter-level info such as all_encounter_data, systemic_disease_for_Enc and family_hist_for_Enc
combined_data_df.loc[109227, 'encounters']


Out[29]:
[{u'Enc_Date': 736174,
  u'Enc_ID': '47c5d7b3-76c9-3798-7207-9adff688646a',
  'Enc_Nbr': 4086734L,
  u'Enc_Timestamp': 736174,
  'ICD_for_Enc': [{u'Description': 'Hypermetropia bilateral',
    u'Diagnosis_Code_ID': 'H52.03'},
   {u'Description': 'Type II diabetes without complications',
    u'Diagnosis_Code_ID': 'E11.9'},
   {u'Description': 'Blepharitis of rt upper eyelid',
    u'Diagnosis_Code_ID': 'H01.001'}],
  'ICD_for_Enc_count': 3,
  u'OD_Macula1_Finding': 'flat no hemorrhages exudates pigmentary changes or no macular edema',
  u'OD_Macula1_Location': nan,
  u'OD_Macula1_Modifer': nan,
  u'OD_Macula2_Finding': nan,
  u'OD_Macula2_Location': nan,
  u'OD_Macula2_Modifer': nan,
  u'OD_Macula3_Finding': nan,
  u'OD_Macula3_Location': nan,
  u'OD_Macula3_Modifer': nan,
  u'OD_Macula4_Finding': nan,
  u'OD_Macula4_Location': nan,
  u'OD_Macula4_Modifer': nan,
  u'OD_SL_LENS': nan,
  u'OS_Macula1_Finding': 'flat no hemorrhages exudates pigmentary changes or no macular edema',
  u'OS_Macula1_Location': nan,
  u'OS_Macula1_Modifer': nan,
  u'OS_Macula2_Finding': nan,
  u'OS_Macula2_Location': nan,
  u'OS_Macula2_Modifer': nan,
  u'OS_Macula3_Finding': nan,
  u'OS_Macula3_Location': nan,
  u'OS_Macula3_Modifer': nan,
  u'OS_Macula4_Finding': nan,
  u'OS_Macula4_Location': nan,
  u'OS_Macula4_Modifer': nan,
  u'OS_SL_LENS': nan,
  u'Person_ID': 'E6121EF6-660A-4A34-B4B4-ECB513E3C916',
  u'Person_Nbr': 109227L,
  'all_encounter_data': [{u'A1C': nan,
    u'BB_OD_AXIS': nan,
    u'BB_OD_CYL': nan,
    u'BB_OD_DVA': nan,
    u'BB_OD_NVA': nan,
    u'BB_OD_SPH': nan,
    u'BB_OS_AXIS': nan,
    u'BB_OS_CYL': nan,
    u'BB_OS_DVA': nan,
    u'BB_OS_NVA': nan,
    u'BB_OS_SPH': nan,
    u'BMI': nan,
    u'BP': '170/ 90',
    u'CYCLO_OD_AXIS': nan,
    u'CYCLO_OD_CYL': nan,
    u'CYCLO_OD_DVA': nan,
    u'CYCLO_OD_NVA': nan,
    u'CYCLO_OD_SPH': nan,
    u'CYCLO_OS_AXIS': nan,
    u'CYCLO_OS_CYL': nan,
    u'CYCLO_OS_DVA': nan,
    u'CYCLO_OS_NVA': nan,
    u'CYCLO_OS_SPH': nan,
    u'Glucose': nan,
    u'MR_OD_AXIS': '115',
    u'MR_OD_CYL': '-0.50',
    u'MR_OD_DVA': '20/20',
    u'MR_OD_NVA': '20/20-',
    u'MR_OD_SPH': '+0.50',
    u'MR_OS_AXIS': '100',
    u'MR_OS_CYL': '-0.25',
    u'MR_OS_DVA': '20/20',
    u'MR_OS_NVA': '20/20',
    u'MR_OS_SPH': '+0.75',
    u'Primary_Payer': 'United Health Care PPO',
    u'Smoking_Status': 'Never smoker'}],
  'all_encounter_data_count': 1,
  'family_hist_for_Enc': [{u'Code': '430679000',
    u'Code_System': 'SNOMED',
    u'Family_History': 'Diabetes mellitus type 2',
    u'Relation': 'Brother'},
   {u'Code': '160267000',
    u'Code_System': 'SNOMED',
    u'Family_History': 'No history of Glaucoma',
    u'Relation': 'Mother'},
   {u'Code': '160267000',
    u'Code_System': 'SNOMED',
    u'Family_History': 'No history of Glaucoma',
    u'Relation': 'Father'},
   {u'Code': '160357008',
    u'Code_System': 'SNOMED',
    u'Family_History': 'Hypertension',
    u'Relation': 'Mother'},
   {u'Code': '416855002',
    u'Code_System': 'SNOMED',
    u'Family_History': 'Diabetes mellitus',
    u'Relation': 'Mother'},
   {u'Code': '160274005',
    u'Code_System': 'SNOMED',
    u'Family_History': 'No history of Diabetes mellitus',
    u'Relation': 'Father'}],
  'family_hist_for_Enc_count': 6,
  'systemic_disease_for_Enc': [{u'Snomed_Code': '44054006',
    u'Systemic_Diseases': 'Diabetes type 2'},
   {u'Snomed_Code': '56265001', u'Systemic_Diseases': 'Heart disease'},
   {u'Snomed_Code': '13644009', u'Systemic_Diseases': 'Hypercholesterolaemia'},
   {u'Snomed_Code': '38341003', u'Systemic_Diseases': 'Hypertension'},
   {u'Snomed_Code': '230690007', u'Systemic_Diseases': 'Stroke'}],
  'systemic_disease_for_Enc_count': 5}]

In [25]:
combined_data_df.to_pickle(path+'20170226_ICO_Data_combined.pickle')
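
A round-trip read can confirm the pickle was written correctly (a sketch):

In [ ]:
# verify the saved pickle reloads with the same shape
reloaded = pd.read_pickle(path + '20170226_ICO_Data_combined.pickle')
assert reloaded.shape == combined_data_df.shape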