In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame
import os
import reader

In [32]:
# Data folder under the current working directory. The trailing slash matters:
# later cells build file names with plain string concatenation (path + name).
path = os.getcwd()+'/data/'
# Load the datasets through the project-local `reader` module; the result is
# indexed by dataset name in the cells below (e.g. data['demographics']).
data = reader.Data(path)


Local data read/write folder path:
	Customed path: /Users/Dan/百度云同步盘/丝打底/2017 spring/MATH 497/code and data/data/

Data: systemic_disease_list 
File: systemic_disease_list.pickle
File already exists.

Data: SNOMED_problem_list 
File: SNOMED_problem_list.pickle
File already exists.

Data: macula_findings_for_Enc 
File: macula_findings_for_Enc.pickle
File already exists.

Data: SL_Lens_for_Enc 
File: SL_Lens_for_Enc.pickle
File already exists.

Data: family_hist_list 
File: family_hist_list.pickle
File already exists.

Data: systemic_disease_for_Enc 
File: systemic_disease_for_Enc.pickle
File already exists.

Data: family_hist_for_Enc 
File: family_hist_for_Enc.pickle
File already exists.

Data: all_encounter_data 
File: all_encounter_data.pickle
File already exists.

Data: encounters 
File: encounters.pickle
File already exists.

Data: demographics 
File: demographics.pickle
File already exists.

Data: ICD_for_Enc 
File: ICD_for_Enc.pickle
File already exists.

In [33]:
# Preview the first rows of the demographics table.
data['demographics'].head()


Out[33]:
Person_ID Person_Nbr DOB Gender Race Ethnicity Zip Age_Censored
14132 18405351-AC64-46A2-A003-8F7889351A13 33 1948-10-15 F Black/African American (Not Hispanic) Declined to specify 60616 None
8747 4ACD51E8-4A9B-4AA6-B635-166ADA5EF02E 89 1955-09-04 M Declined to specify Declined to specify 60652 None
2129 DF25CE0F-E45A-44D2-BE43-F49FE4F5E280 112 1932-11-19 F NaN NaN 606372549 None
6591 7027DD58-6465-496C-880A-A6E825C155A8 146 1948-12-19 F Declined to specify Declined to specify 60615 None
16909 395DD5E7-45F9-456E-A127-9AB929E872F7 196 1955-02-21 F Declined to specify Declined to specify 60615 None

In [4]:
# Preview the first rows of the family history table.
data['family_hist_list'].head()


Out[4]:
Person_ID Person_Nbr Date_Created Code Code_System Family_History Relation
47872 80d3df88-dddf-5ad3-7cc1-b7b1ac6151fa 33 2014-12-18 14:47:15.980 160347007 SNOMED Glaucoma Father
47869 80d3df88-dddf-5ad3-7cc1-b7b1ac6151fa 33 2014-12-18 14:47:26.897 160274005 SNOMED No history of Diabetes mellitus Mother
47867 80d3df88-dddf-5ad3-7cc1-b7b1ac6151fa 33 2014-12-18 14:47:28.590 160274005 SNOMED No history of Diabetes mellitus Father
47871 80d3df88-dddf-5ad3-7cc1-b7b1ac6151fa 33 2014-12-18 14:47:29.563 160267000 SNOMED No history of Glaucoma Mother
47868 80d3df88-dddf-5ad3-7cc1-b7b1ac6151fa 33 2014-12-18 15:55:35.837 439724007 SNOMED Heart disease Brother

There are 443 different relationships


In [53]:
# Distinct non-null values of the Relation column, in order of first appearance
temp = list(data['family_hist_list'].Relation.dropna().drop_duplicates())
len(temp)


Out[53]:
443

In [6]:
# Preview the first rows of the encounters table.
data['encounters'].head()


Out[6]:
Person_ID Person_Nbr Enc_ID Enc_Nbr Enc_Timestamp
10335 18405351-AC64-46A2-A003-8F7889351A13 33 97825c51-4462-eade-1c1d-2baa3400c033 123227 2014-12-18 14:00:00
59383 4ACD51E8-4A9B-4AA6-B635-166ADA5EF02E 89 a6d9d991-4ff4-79b5-304d-f1b2bff9d6d0 12870648 2012-10-16 03:45:00
56011 4ACD51E8-4A9B-4AA6-B635-166ADA5EF02E 89 fe5eb87c-39b3-4314-0172-4642d51de417 4126172 2014-03-12 15:30:00
78800 4ACD51E8-4A9B-4AA6-B635-166ADA5EF02E 89 ea3c2622-6d1d-53ab-ddf0-9764927f1507 9217142 2014-04-17 18:45:00
3870 4ACD51E8-4A9B-4AA6-B635-166ADA5EF02E 89 71d10fa4-3b0d-fa52-cfbd-ffd0f3d3532d 14968450 2014-05-25 10:45:00

Process family history


In [7]:
# Create Date variable
# The import stays in this cell because later cells use `datetime` as well.
from datetime import datetime
# Derive a 'YYYY-MM-DD' string column from the Date_Created timestamps.
# The vectorised .dt accessor replaces the per-row datetime.strftime call and
# yields NaN for a missing timestamp instead of raising.
data['family_hist_list']['Date'] = pd.to_datetime(data['family_hist_list']['Date_Created']).dt.strftime('%Y-%m-%d')

In [8]:
# Individual family history grouped by the relationship
# Date of collecting could be omitted
def _history_by_relation(person_rows):
    """Return one {'Relation', 'History'} dict per relation found in a person's rows."""
    grouped = []
    for relation, relation_rows in person_rows.groupby('Relation'):
        entries = [{'Code': code, 'Family_history': description}
                   for code, description in zip(relation_rows.Code, relation_rows.Family_History)]
        grouped.append({'Relation': relation, 'History': entries})
    return grouped

# Person_Nbr -> list of per-relation history dicts
family_hist_list = {person_nbr: _history_by_relation(person_rows)
                    for person_nbr, person_rows in data['family_hist_list'].groupby('Person_Nbr')}

In [9]:
# Inspect the grouped family history of one patient (Person_Nbr 109227).
family_hist_list[109227]


Out[9]:
[{'History': [{'Code': '430679000',
    'Family_history': 'Diabetes mellitus type 2'}],
  'Relation': 'Brother'},
 {'History': [{'Code': '160274005',
    'Family_history': 'No history of Diabetes mellitus'},
   {'Code': '160267000', 'Family_history': 'No history of Glaucoma'}],
  'Relation': 'Father'},
 {'History': [{'Code': '416855002', 'Family_history': 'Diabetes mellitus'},
   {'Code': '160267000', 'Family_history': 'No history of Glaucoma'},
   {'Code': '160357008', 'Family_history': 'Hypertension'}],
  'Relation': 'Mother'}]

Process demographics


In [10]:
# Check that no Person_Nbr appears more than once in demographics
data['demographics'].Person_Nbr.is_unique


Out[10]:
True

In [34]:
# Normalize zip code with only 5 digits
def clean_zip(zip):
    """Reduce a zip code to its first five characters.

    Parameters
    ----------
    zip : str
        Raw zip code, possibly longer than five characters (e.g. '606372549').
        The name shadows the built-in ``zip`` inside this function only; it is
        kept so that existing callers are unaffected.

    Returns
    -------
    str
        The first five characters of ``zip``, or the placeholder 'Null' when
        the value is shorter than five characters or has no length at all
        (None, NaN, numbers).
    """
    try:
        too_short = len(zip) < 5
    except TypeError:
        # None, float NaN and plain numbers have no len(): treat them as missing
        return 'Null'
    return 'Null' if too_short else zip[:5]
# Apply clean_zip to every row of the Zip column. The column is read from
# data['demographics']: the bare name `demographics` is only bound in a later
# cell (and to a dict), so it cannot be used here on a fresh kernel.
data['demographics']['Zip'] = data['demographics']['Zip'].map(clean_zip)

In [35]:
# Preview demographics again to check the normalized Zip column.
data['demographics'].head()


Out[35]:
Person_ID Person_Nbr DOB Gender Race Ethnicity Zip Age_Censored
14132 18405351-AC64-46A2-A003-8F7889351A13 33 1948-10-15 F Black/African American (Not Hispanic) Declined to specify 60616 None
8747 4ACD51E8-4A9B-4AA6-B635-166ADA5EF02E 89 1955-09-04 M Declined to specify Declined to specify 60652 None
2129 DF25CE0F-E45A-44D2-BE43-F49FE4F5E280 112 1932-11-19 F NaN NaN 60637 None
6591 7027DD58-6465-496C-880A-A6E825C155A8 146 1948-12-19 F Declined to specify Declined to specify 60615 None
16909 395DD5E7-45F9-456E-A127-9AB929E872F7 196 1955-02-21 F Declined to specify Declined to specify 60615 None

In [43]:
# Rows whose zip code was replaced by the 'Null' placeholder
data['demographics'][data['demographics'].Zip=='Null']


Out[43]:
Person_ID Person_Nbr DOB Gender Race Ethnicity Zip Age_Censored
7038 DEC00BF4-F6F8-46F2-87D9-B1F94E16EB6B 16320 1939-04-09 F Declined to specify Declined to specify Null None
12820 7286E85A-473F-4C94-A684-489E6932E02A 211308 1975-01-26 M Declined to specify Unknown Ethnicity Null None
10604 663B54A6-C6DD-4858-AE78-8AFDA98E78F4 496337 1959-08-05 F NaN Unknown Ethnicity Null None
2065 4CCDD77B-12DF-4943-BA78-F387C3AD2182 542734 1927-05-07 M Black/African American (Not Hispanic) Not Hispanic or Latino Null None
10147 0B7013D0-89BD-45C4-B615-E8DE442CC984 663472 1980-01-05 M Alaskan Native Hispanic or Latino Null None
11573 1D4E4067-929B-4687-A38F-C69D0AD4DF2D 782993 1951-07-26 F Black or African American Not Hispanic or Latino Null None
11355 6000BEE0-6407-4EA2-A88E-53E7C6C796CF 849189 1971-10-02 M Black or African American Not Hispanic or Latino Null None
7264 6556347D-22F7-4D17-9917-976441C5FBCD 850308 1901-01-06 F White Not Hispanic or Latino Null None
15440 50C8AC9E-D3D4-4279-BC5C-50EC7CEEC2F6 1014237 1966-10-02 M Declined to specify Declined to specify Null None
9073 DA5EECE7-B999-4FBA-AB93-C4E5F0CB8BA1 1041124 2011-10-30 M Asian Not Hispanic or Latino Null All

In [44]:
# Save the demographics table as a pickle file in the data folder.
# NOTE(review): the Age column is added in a later cell, so it is presumably
# not part of this file — confirm that is intended.
data['demographics'].to_pickle(path+'demographics_processed_Dan_20170304.pickle')

In [49]:
# Age as current calendar year minus birth year.
# NOTE(review): month and day are ignored and the result depends on the date
# this cell is run — confirm this precision is sufficient.
data['demographics']['Age']=data['demographics']['DOB'].map(lambda x: datetime.now().year - x.year)

In [50]:
# Person_Nbr -> {field: value} for the selected demographic columns
demographics = (
    data['demographics']
    .set_index('Person_Nbr')[['Age', 'Gender', 'Race', 'Ethnicity', 'Zip', 'Age_Censored']]
    .to_dict(orient='index')
)

In [52]:
# Inspect the demographic fields of one patient (Person_Nbr 109227).
demographics[109227]


Out[52]:
{'Age': 61,
 'Age_Censored': 'None',
 'Ethnicity': 'Not Hispanic or Latino',
 'Gender': 'F',
 'Race': 'Black/African American (Not Hispanic)',
 'Zip': '60419'}

In [53]:
# People in demographics have fully covered people in family_hist_list
# True when every person with a family history also has a demographics record
set(family_hist_list).issubset(demographics)


Out[53]:
True

Process encounter list


In [20]:
# Create Date variable
#data['encounters']['Enc_Date'] = pd.to_datetime([datetime.strftime(item, '%Y-%m-%d') for item in data['encounters']['Enc_Timestamp']])

In [54]:
def _encounters_by_date(person_rows):
    """Return a person's encounters as {'Enc_Nbr', 'Enc_Date'} dicts, earliest first."""
    records = [{'Enc_Nbr': enc_nbr, 'Enc_Date': timestamp}
               for enc_nbr, timestamp in zip(person_rows.Enc_Nbr, person_rows.Enc_Timestamp)]
    # list.sort is stable, so encounters sharing a timestamp keep their original order
    records.sort(key=lambda record: record['Enc_Date'])
    return records

# Person_Nbr -> date-sorted list of encounters
Enc_list = {person_nbr: _encounters_by_date(person_rows)
            for person_nbr, person_rows in data['encounters'].groupby('Person_Nbr')}

In [55]:
# Inspect the encounter list of one patient (Person_Nbr 109227).
Enc_list[109227]


Out[55]:
[{'Enc_Date': Timestamp('2016-07-29 12:30:00'), 'Enc_Nbr': 4086734}]

In [56]:
# True when every person with encounter records also has a demographics record
set(Enc_list).issubset(demographics)


Out[56]:
True

In [57]:
# True when every person with a family history record also has encounter records
set(family_hist_list).issubset(Enc_list)


Out[57]:
True

Merge into a dictionary of profile


In [58]:
# Profile of every patient in demographics, with family history and encounter
# information attached when available.
profile_full = {}
for person_nbr, person in demographics.items():
    # NOTE: `person` is stored without copying, so the keys added below also
    # show up in demographics[person_nbr].
    profile_full[person_nbr] = person

    # patient may or may not have a family history
    if person_nbr in family_hist_list:
        person['family_hist_list'] = family_hist_list[person_nbr]
        person['family_hist_list_count'] = len(family_hist_list[person_nbr])
    else:
        person['family_hist_list'] = {}
        person['family_hist_list_count'] = 0

    # patient may or may not have encounter records
    if person_nbr in Enc_list:
        # Enc_list entries are sorted by date, so the first one is the earliest
        first_encounter = Enc_list[person_nbr][0]['Enc_Date']
        person['Enc_list'] = Enc_list[person_nbr]
        person['Enc_list_count'] = len(Enc_list[person_nbr])
        # years elapsed between the earliest encounter and the current year
        person['Enc_list_span'] = datetime.now().year - first_encounter.year
    else:
        person['Enc_list'] = {}
        person['Enc_list_count'] = 0
        person['Enc_list_span'] = 0

In [59]:
# Inspect the merged profile of one patient (Person_Nbr 109227).
profile_full[109227]


Out[59]:
{'Age': 61,
 'Age_Censored': 'None',
 'Enc_list': [{'Enc_Date': Timestamp('2016-07-29 12:30:00'),
   'Enc_Nbr': 4086734}],
 'Enc_list_count': 1,
 'Enc_list_span': 1,
 'Ethnicity': 'Not Hispanic or Latino',
 'Gender': 'F',
 'Race': 'Black/African American (Not Hispanic)',
 'Zip': '60419',
 'family_hist_list': [{'History': [{'Code': '430679000',
     'Family_history': 'Diabetes mellitus type 2'}],
   'Relation': 'Brother'},
  {'History': [{'Code': '160274005',
     'Family_history': 'No history of Diabetes mellitus'},
    {'Code': '160267000', 'Family_history': 'No history of Glaucoma'}],
   'Relation': 'Father'},
  {'History': [{'Code': '416855002', 'Family_history': 'Diabetes mellitus'},
    {'Code': '160267000', 'Family_history': 'No history of Glaucoma'},
    {'Code': '160357008', 'Family_history': 'Hypertension'}],
   'Relation': 'Mother'}],
 'family_hist_list_count': 3}

In [58]:
# Number of patients in the full profile dictionary.
len(profile_full)


Out[58]:
17019

Shall we remove the profiles that have no family history and no encounters?


In [26]:
len(set(demographics)-set(family_hist_list))
# Patients in demographics without any family history record;
# requiring a family history would remove these 2975 patients


Out[26]:
2975

In [27]:
len(set(family_hist_list)&set(Enc_list))
# Patients that have both family history and encounter records (14044 remain)


Out[27]:
14044

In [28]:
# Patients in demographics without any encounter record.
len(set(demographics)-set(Enc_list))


Out[28]:
510

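We have 17019 patients in total. 510 of them have no encounter records (and, since every patient with a family history also has encounter records, no family history either). Another 2465 have encounter records but no family history. If we want a profile in which everyone has both kinds of records, we need to remove all 2975 (= 510 + 2465) patients.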


In [60]:
# Remove patients have no family or no encounter records
profile={}
for k,v in demographics.items():
    if k in set(family_hist_list)&set(Enc_list):
        profile[k]=v
        
        profile[k]['family_hist_list'] = family_hist_list[k]
        profile[k]['family_hist_list_count'] = len(family_hist_list[k])

        profile[k]['Enc_list'] = Enc_list[k]
        profile[k]['Enc_list_count'] = len(Enc_list[k])
        profile_full[k]['Enc_list_span'] = datetime.now().year - int(datetime.strftime(datetime.date(Enc_list[k][0]['Enc_Date']), '%Y'))
    else:
        continue

In [61]:
# Inspect the filtered profile of one patient (Person_Nbr 109227).
profile[109227]


Out[61]:
{'Age': 61,
 'Age_Censored': 'None',
 'Enc_list': [{'Enc_Date': Timestamp('2016-07-29 12:30:00'),
   'Enc_Nbr': 4086734}],
 'Enc_list_count': 1,
 'Enc_list_span': 1,
 'Ethnicity': 'Not Hispanic or Latino',
 'Gender': 'F',
 'Race': 'Black/African American (Not Hispanic)',
 'Zip': '60419',
 'family_hist_list': [{'History': [{'Code': '430679000',
     'Family_history': 'Diabetes mellitus type 2'}],
   'Relation': 'Brother'},
  {'History': [{'Code': '160274005',
     'Family_history': 'No history of Diabetes mellitus'},
    {'Code': '160267000', 'Family_history': 'No history of Glaucoma'}],
   'Relation': 'Father'},
  {'History': [{'Code': '416855002', 'Family_history': 'Diabetes mellitus'},
    {'Code': '160267000', 'Family_history': 'No history of Glaucoma'},
    {'Code': '160357008', 'Family_history': 'Hypertension'}],
   'Relation': 'Mother'}],
 'family_hist_list_count': 3}

In [31]:
# Number of patients kept when both record types are required.
len(profile)


Out[31]:
14044

(Or we can remove only the 510 patients that lack encounter records and ignore the family-history requirement for now.)


In [62]:
# Remove patients have no encounter records
profile1={}
for k,v in demographics.items():
    if k in set(Enc_list):
        profile1[k]=v

        profile1[k]['Enc_list'] = Enc_list[k]
        profile1[k]['Enc_list_count'] = len(Enc_list[k])
        profile_full[k]['Enc_list_span'] = datetime.now().year - int(datetime.strftime(datetime.date(Enc_list[k][0]['Enc_Date']), '%Y'))
        
        profile1[k]['family_hist_list'] = {}
        profile1[k]['family_hist_list_count'] = 0
        
        if k in family_hist_list.keys():
            profile1[k]['family_hist_list'] = family_hist_list[k]
            profile1[k]['family_hist_list_count'] = len(family_hist_list[k])
    else:
        continue

In [63]:
# Inspect the encounter-filtered profile of one patient (Person_Nbr 109227).
profile1[109227]


Out[63]:
{'Age': 61,
 'Age_Censored': 'None',
 'Enc_list': [{'Enc_Date': Timestamp('2016-07-29 12:30:00'),
   'Enc_Nbr': 4086734}],
 'Enc_list_count': 1,
 'Enc_list_span': 1,
 'Ethnicity': 'Not Hispanic or Latino',
 'Gender': 'F',
 'Race': 'Black/African American (Not Hispanic)',
 'Zip': '60419',
 'family_hist_list': [{'History': [{'Code': '430679000',
     'Family_history': 'Diabetes mellitus type 2'}],
   'Relation': 'Brother'},
  {'History': [{'Code': '160274005',
     'Family_history': 'No history of Diabetes mellitus'},
    {'Code': '160267000', 'Family_history': 'No history of Glaucoma'}],
   'Relation': 'Father'},
  {'History': [{'Code': '416855002', 'Family_history': 'Diabetes mellitus'},
    {'Code': '160267000', 'Family_history': 'No history of Glaucoma'},
    {'Code': '160357008', 'Family_history': 'Hypertension'}],
   'Relation': 'Mother'}],
 'family_hist_list_count': 3}

In [34]:
# Number of patients kept when only encounter records are required.
len(profile1)


Out[34]:
16509

In [66]:
# Turn the profile dictionary into a DataFrame: one row per Person_Nbr (the
# index), one column per profile field.
# Note: `temp` is reused here; an earlier cell bound it to the list of relations.
temp=pd.DataFrame.from_dict(profile1, orient='index')
temp.head()


Out[66]:
Zip Gender Age Enc_list Age_Censored family_hist_list Enc_list_count Race family_hist_list_count Enc_list_span Ethnicity
33 60616 F 69 [{u'Enc_Date': 2014-12-18 14:00:00, u'Enc_Nbr'... None [{u'Relation': u'Brother', u'History': [{'Code... 1 Black/African American (Not Hispanic) 4 3 Declined to specify
89 60652 M 62 [{u'Enc_Date': 2012-10-16 03:45:00, u'Enc_Nbr'... None [{u'Relation': u'Father', u'History': [{'Code'... 9 Declined to specify 3 5 Declined to specify
146 60615 F 69 [{u'Enc_Date': 2012-11-04 19:45:00, u'Enc_Nbr'... None [{u'Relation': u'Father', u'History': [{'Code'... 5 Declined to specify 2 5 Declined to specify
196 60615 F 62 [{u'Enc_Date': 2016-10-05 00:45:00, u'Enc_Nbr'... None [{u'Relation': u'Father', u'History': [{'Code'... 2 Declined to specify 2 1 Declined to specify
327 60411 F 73 [{u'Enc_Date': 2011-12-04 11:00:00, u'Enc_Nbr'... None {} 1 Black or African American 0 6 African American

In [67]:
# Save the per-person profile DataFrame as a pickle file in the data folder.
temp.to_pickle(path+'person_profile_df.pickle')

Tried to process SNOMED code list for person


In [36]:
# Preview the first rows of the SNOMED problem list.
data['SNOMED_problem_list'].head()


Out[36]:
Person_ID Person_Nbr Date_Created Concept_ID Description
69610 80d3df88-dddf-5ad3-7cc1-b7b1ac6151fa 33 2014-12-18 15:51:19.607 41256004 Presbyopia
69608 80d3df88-dddf-5ad3-7cc1-b7b1ac6151fa 33 2014-12-18 15:51:28.043 41446000 Blepharitis
69609 80d3df88-dddf-5ad3-7cc1-b7b1ac6151fa 33 2014-12-18 16:36:28.083 313436004 Type 2 diabetes mellitus without complication
46510 adca6fa4-e7d4-d7f8-cf41-27056662d84b 89 2014-08-12 03:04:55.010 81416004 Open angle with borderline findings
46511 adca6fa4-e7d4-d7f8-cf41-27056662d84b 89 2014-08-12 03:04:55.010 28998008 Retinal hemorrhage

In [64]:
# Snomed_Code values recorded in systemic_disease_list for patient 109227
list(data['systemic_disease_list'].groupby('Person_Nbr')['Snomed_Code'].get_group(109227))


Out[64]:
['44054006', '56265001', '38341003', '230690007', '13644009']

In [65]:
# Concept_ID values recorded in SNOMED_problem_list for patient 109227
list(data['SNOMED_problem_list'].groupby('Person_Nbr')['Concept_ID'].get_group(109227))


Out[65]:
[38101003, 111552007, 41446000]

In [ ]: