In [1]:
    
import pandas as pd
import numpy as np
from pandas import DataFrame
import os
import reader
    
In [32]:
    
# Raw tables are read from a "data" folder under the directory the kernel was started in.
path = '{}/data/'.format(os.getcwd())
# reader is a project-local module; the cells below index its result by table name.
data = reader.Data(path)
    
    
In [33]:
    
# Preview the first rows of the demographics table.
data['demographics'].head()
    
    Out[33]:
In [4]:
    
# Preview the first rows of the family history table.
data['family_hist_list'].head()
    
    Out[4]:
There are 443 different relationships
In [53]:
    
# Distinct non-null values of the Relation column in the family history table.
relation_col = data['family_hist_list'].Relation
temp = list(relation_col[relation_col.notnull()].drop_duplicates())
len(temp)
    
    Out[53]:
In [6]:
    
# Preview the first rows of the encounters table.
data['encounters'].head()
    
    Out[6]:
In [7]:
    
# Add a 'Date' column holding Date_Created formatted as a 'YYYY-MM-DD' string.
from datetime import datetime  # later cells in this notebook also use this name

date_strings = []
for created in data['family_hist_list']['Date_Created']:
    date_strings.append(datetime.strftime(created, '%Y-%m-%d'))
data['family_hist_list']['Date'] = date_strings
    
In [8]:
    
# Family history per patient, grouped by relation:
#   {Person_Nbr: [{'Relation': ..., 'History': [{'Code': ..., 'Family_history': ...}, ...]}, ...]}
# The collection date is not carried over.
family_hist_list = {}
for person_nbr, person_rows in data['family_hist_list'].groupby('Person_Nbr'):
    by_relation = []
    for relation, relation_rows in person_rows.groupby('Relation'):
        history = [{'Code': code, 'Family_history': text}
                   for code, text in zip(relation_rows.Code, relation_rows.Family_History)]
        by_relation.append({'Relation': relation, 'History': history})
    family_hist_list[person_nbr] = by_relation
    
In [9]:
    
# Inspect the grouped family history of one example patient.
family_hist_list[109227]
    
    Out[9]:
In [10]:
    
# True when every Person_Nbr occurs exactly once in demographics (no duplicated person).
data['demographics'].Person_Nbr.is_unique
    
    Out[10]:
In [34]:
    
# Normalize zip code with only 5 digits
def clean_zip(zip):
    """Normalize a zip code to its first five characters.

    Parameters
    ----------
    zip : str or None
        Raw zip code, e.g. '12345' or '12345-6789'. Missing values
        (None / float NaN) and non-string values are tolerated.

    Returns
    -------
    str
        The first five characters of the zip code, or the string 'Null'
        when the value is missing or shorter than five characters.
    """
    # Missing entries arrive as None or float NaN; len() on those would raise TypeError.
    if zip is None or (isinstance(zip, float) and zip != zip):
        return 'Null'
    text = zip if isinstance(zip, str) else str(zip)
    return text[:5] if len(text) >= 5 else 'Null'
# Overwrite Zip with its normalized 5-character form ('Null' when unusable).
# The source column is read from data['demographics']: the bare name `demographics`
# is only bound later in the notebook (to a dict), so it cannot be used here.
data['demographics']['Zip'] = data['demographics'].Zip.map(clean_zip)
    
In [35]:
    
# Preview demographics again to check the normalized Zip column.
data['demographics'].head()
    
    Out[35]:
In [43]:
    
# Rows whose zip code carries the 'Null' marker.
null_zip_mask = data['demographics'].Zip == 'Null'
data['demographics'][null_zip_mask]
    
    Out[43]:
In [44]:
    
# Persist the demographics table into the data folder.
# NOTE: the Age column is added in a later cell, so it is not part of this pickle.
data['demographics'].to_pickle(path+'demographics_processed_Dan_20170304.pickle')
    
In [49]:
    
# Age is approximated as the difference in calendar years (month and day are ignored).
# The reference year is read once so that every row uses the same value;
# note that the result depends on the date the notebook is run.
current_year = datetime.now().year
data['demographics']['Age'] = data['demographics']['DOB'].map(lambda dob: current_year - dob.year)
    
In [50]:
    
# Per-patient lookup: {Person_Nbr: {'Age': ..., 'Gender': ..., 'Race': ..., ...}}.
# orient='index' builds the row-keyed dict directly instead of transposing the
# whole frame first; it raises if Person_Nbr is not unique rather than silently
# dropping rows.
demographic_cols = ['Age', 'Gender', 'Race', 'Ethnicity', 'Zip', 'Age_Censored']
demographics = data['demographics'].set_index('Person_Nbr')[demographic_cols].to_dict(orient='index')
    
In [52]:
    
# Inspect the demographics entry of one example patient.
demographics[109227]
    
    Out[52]:
In [53]:
    
# True when every patient with a family history also appears in demographics.
set(family_hist_list).issubset(demographics)
    
    Out[53]:
In [20]:
    
# Create Date variable
#data['encounters']['Enc_Date'] = pd.to_datetime([datetime.strftime(item, '%Y-%m-%d') for item in data['encounters']['Enc_Timestamp']])
    
In [54]:
    
# Encounters per patient: {Person_Nbr: [{'Enc_Nbr': ..., 'Enc_Date': ...}, ...]},
# each list sorted ascending by Enc_Date.
Enc_list = {}
for person_nbr, visits in data['encounters'].groupby('Person_Nbr'):
    records = [{'Enc_Nbr': enc_nbr, 'Enc_Date': stamp}
               for enc_nbr, stamp in zip(visits.Enc_Nbr, visits.Enc_Timestamp)]
    records.sort(key=lambda rec: rec['Enc_Date'])
    Enc_list[person_nbr] = records
    
In [55]:
    
# Inspect the sorted encounter list of one example patient.
Enc_list[109227]
    
    Out[55]:
In [56]:
    
# True when every patient with encounters also appears in demographics.
set(Enc_list).issubset(demographics)
    
    Out[56]:
In [57]:
    
# True when every patient with a family history also has encounter records,
# i.e. encounters fully cover family_hist_list.
set(family_hist_list).issubset(Enc_list)
    
    Out[57]:
In [58]:
    
# Build one combined profile for every patient in demographics, with empty
# defaults for patients that lack a family history or encounter records.
this_year = datetime.now().year  # read once so every span uses the same reference year
profile_full = {}
for k, v in demographics.items():
    # NOTE: v is stored without copying, so the keys added below also show up
    # in demographics[k].
    profile_full[k] = v
    entry = profile_full[k]

    # A patient may or may not have a family history.
    if k in family_hist_list:
        entry['family_hist_list'] = family_hist_list[k]
        entry['family_hist_list_count'] = len(family_hist_list[k])
    else:
        entry['family_hist_list'] = {}
        entry['family_hist_list_count'] = 0

    # A patient may or may not have encounter records.
    if k in Enc_list:
        encounters = Enc_list[k]
        entry['Enc_list'] = encounters
        entry['Enc_list_count'] = len(encounters)
        # The list is sorted by Enc_Date, so element 0 is the earliest encounter;
        # the span is the number of calendar years from it to the current year.
        entry['Enc_list_span'] = this_year - encounters[0]['Enc_Date'].year
    else:
        entry['Enc_list'] = {}
        entry['Enc_list_count'] = 0
        entry['Enc_list_span'] = 0
    
In [59]:
    
# Inspect the combined profile of one example patient.
profile_full[109227]
    
    Out[59]:
In [58]:
    
# Number of patients in the full profile (one entry per key of demographics).
len(profile_full)
    
    Out[58]:
Shall we remove the profiles that have no family history or no encounter records?
In [26]:
    
# Patients in demographics without a family history; dropping them removes 2975 patients.
len(set(demographics).difference(family_hist_list))
    
    Out[26]:
In [27]:
    
# Patients that have both a family history and encounter records (14044 remain).
len(set(family_hist_list).intersection(Enc_list))
    
    Out[27]:
In [28]:
    
# Patients in demographics without any encounter record.
len(set(demographics).difference(Enc_list))
    
    Out[28]:
We have 17019 patients in total. 510 of them have no encounter records (and therefore no family history either, since every patient with a family history also has encounters). Another 2465 have encounter records but no family history. If we want a profile in which everyone has both kinds of records, we need to remove all 2975 patients.
In [60]:
    
# Keep only the patients that have BOTH a family history and encounter records.
this_year = datetime.now().year  # read once so every span uses the same reference year
# Build the set of eligible ids once instead of once per patient.
with_both_records = set(family_hist_list) & set(Enc_list)

profile = {}
for k, v in demographics.items():
    if k not in with_both_records:
        continue
    # Work on a copy so that filling in the profile does not mutate demographics[k].
    entry = dict(v)
    entry['family_hist_list'] = family_hist_list[k]
    entry['family_hist_list_count'] = len(family_hist_list[k])
    entry['Enc_list'] = Enc_list[k]
    entry['Enc_list_count'] = len(Enc_list[k])
    # Enc_list[k] is sorted by Enc_Date, so element 0 is the earliest encounter.
    # The span is stored on this profile's own entry.
    entry['Enc_list_span'] = this_year - Enc_list[k][0]['Enc_Date'].year
    profile[k] = entry
    
In [61]:
    
# Inspect the filtered profile of one example patient.
profile[109227]
    
    Out[61]:
In [31]:
    
# Number of patients kept in the filtered profile.
len(profile)
    
    Out[31]:
(Alternatively, we can remove only the 510 patients that lack encounter records and treat the family history as optional for now.)
In [62]:
    
# Keep every patient that has encounter records; the family history stays optional.
this_year = datetime.now().year  # read once so every span uses the same reference year
# Build the set of eligible ids once instead of once per patient.
with_encounters = set(Enc_list)

profile1 = {}
for k, v in demographics.items():
    if k not in with_encounters:
        continue
    # Work on a copy so that filling in the profile does not mutate demographics[k].
    entry = dict(v)
    entry['Enc_list'] = Enc_list[k]
    entry['Enc_list_count'] = len(Enc_list[k])
    # Enc_list[k] is sorted by Enc_Date, so element 0 is the earliest encounter.
    # The span is stored on this profile's own entry.
    entry['Enc_list_span'] = this_year - Enc_list[k][0]['Enc_Date'].year

    # Empty defaults when the patient has no family history.
    if k in family_hist_list:
        entry['family_hist_list'] = family_hist_list[k]
        entry['family_hist_list_count'] = len(family_hist_list[k])
    else:
        entry['family_hist_list'] = {}
        entry['family_hist_list_count'] = 0
    profile1[k] = entry
    
In [63]:
    
# Inspect the encounter-filtered profile of one example patient.
profile1[109227]
    
    Out[63]:
In [34]:
    
# Number of patients kept in the encounter-filtered profile.
len(profile1)
    
    Out[34]:
In [66]:
    
# Tabular view of profile1: one row per patient id, one column per profile key.
temp = DataFrame.from_dict(profile1, orient='index')
temp.head()
    
    Out[66]:
In [67]:
    
# Persist the profile table built from profile1 into the data folder.
temp.to_pickle(path+'person_profile_df.pickle')
    
In [36]:
    
# Preview the first rows of the SNOMED problem list table.
data['SNOMED_problem_list'].head()
    
    Out[36]:
In [64]:
    
# Snomed codes recorded for one example patient. get_group pulls only that
# patient's rows instead of building a list for every patient first.
list(data['systemic_disease_list'].groupby('Person_Nbr')['Snomed_Code'].get_group(109227))
    
    Out[64]:
In [65]:
    
# Concept ids on the problem list of one example patient. get_group pulls only
# that patient's rows instead of building a list for every patient first.
list(data['SNOMED_problem_list'].groupby('Person_Nbr')['Concept_ID'].get_group(109227))
    
    Out[65]:
In [ ]: