In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame
import os
import reader
In [32]:
# Root folder for the input tables. The trailing slash matters: later cells
# build output file names by concatenating directly onto `path`.
path = os.getcwd()+'/data/'
# `reader` is a project-local module; presumably Data(path) loads the tables
# found under `path` and exposes them by name (it is indexed like a dict in
# the cells below) -- TODO confirm against reader.py.
data = reader.Data(path)
In [33]:
data['demographics'].head()
Out[33]:
In [4]:
data['family_hist_list'].head()
Out[4]:
There are 443 different relationships
In [53]:
# Distinct, non-missing relationship labels found in the family-history table,
# in order of first appearance.
temp = (
    data['family_hist_list']
    .Relation
    .dropna()
    .drop_duplicates()
    .tolist()
)
len(temp)
Out[53]:
In [6]:
data['encounters'].head()
Out[6]:
In [7]:
# Create a day-resolution 'Date' string column ('YYYY-MM-DD') from 'Date_Created'.
# The import stays in this cell because later cells call datetime.now().
from datetime import datetime

# Vectorised formatting instead of a per-row Python loop. pd.to_datetime leaves
# a datetime64 column unchanged and also accepts an object column holding
# datetime values. Missing timestamps (NaT) come out as NaN.
data['family_hist_list']['Date'] = (
    pd.to_datetime(data['family_hist_list']['Date_Created']).dt.strftime('%Y-%m-%d')
)
In [8]:
# Family history per patient, grouped by relationship:
#   {Person_Nbr: [{'Relation': ..., 'History': [{'Code': ..., 'Family_history': ...}, ...]}, ...]}
# The collection date is not carried over.
def _history_by_relation(person_rows):
    """Return one {'Relation', 'History'} record per relation in `person_rows`."""
    records = []
    for relation, relation_rows in person_rows.groupby('Relation'):
        history = [
            {'Code': code, 'Family_history': text}
            for code, text in zip(relation_rows.Code, relation_rows.Family_History)
        ]
        records.append({'Relation': relation, 'History': history})
    return records


family_hist_list = {
    person_nbr: _history_by_relation(person_rows)
    for person_nbr, person_rows in data['family_hist_list'].groupby('Person_Nbr')
}
In [9]:
family_hist_list[109227]
Out[9]:
In [10]:
# Sanity check: every Person_Nbr appears at most once in demographics
# (True means no duplicated person).
data['demographics'].Person_Nbr.is_unique
Out[10]:
In [34]:
# Normalize zip code with only 5 digits
def clean_zip(zip):
    """Return the first five characters of a zip code, or 'Null' if unusable.

    Parameters
    ----------
    zip : str or missing value
        Raw zip code, e.g. '12345' or '12345-6789'. The parameter name shadows
        the builtin ``zip`` inside this function only; it is kept so existing
        callers are unaffected.

    Returns
    -------
    str
        The leading five characters when at least five are present, otherwise
        the marker string 'Null'. Missing values (None, NaN, pd.NA) also map
        to 'Null'.
    """
    # Missing values have no len(); treat them like a too-short code instead of
    # raising TypeError in the middle of a column-wide map.
    if pd.isna(zip):
        return 'Null'
    if len(zip) < 5:
        return 'Null'
    return zip[:5]
# Normalize the Zip column of the demographics table. Read from
# data['demographics'] directly: no standalone `demographics` frame exists at
# this point in a top-to-bottom run of the notebook.
data['demographics']['Zip'] = data['demographics'].Zip.map(clean_zip)
In [35]:
data['demographics'].head()
Out[35]:
In [43]:
# Rows whose zip code was replaced by the 'Null' marker during normalization.
data['demographics'].loc[lambda frame: frame.Zip == 'Null']
Out[43]:
In [44]:
# Snapshot of the demographics table, written under `path`.
# NOTE: in file order the 'Age' column is only added in a later cell, so a
# top-to-bottom run does not include it in this pickle.
data['demographics'].to_pickle(path+'demographics_processed_Dan_20170304.pickle')
In [49]:
def _age_in_years(dob, today):
    """Return completed years between `dob` and `today`.

    A plain ``today.year - dob.year`` overstates the age by one for anyone
    whose birthday has not yet occurred this year, so one year is subtracted
    in that case. A missing `dob` (NaT) yields NaN.
    """
    had_birthday = (today.month, today.day) >= (dob.month, dob.day)
    return today.year - dob.year - (0 if had_birthday else 1)


# Read the clock once so every row is measured against the same reference date.
_today = datetime.now()
data['demographics']['Age'] = data['demographics']['DOB'].map(lambda x: _age_in_years(x, _today))
In [50]:
# Per-patient demographic record keyed by Person_Nbr:
#   {Person_Nbr: {'Age': ..., 'Gender': ..., 'Race': ..., ...}}
# orient='index' builds the row-wise mapping directly, without transposing the
# whole frame first. It also raises if Person_Nbr is not unique, rather than
# dropping duplicated ids with only a warning.
demographics = (
    data['demographics']
    .set_index('Person_Nbr')[['Age', 'Gender', 'Race', 'Ethnicity', 'Zip', 'Age_Censored']]
    .to_dict(orient='index')
)
In [52]:
demographics[109227]
Out[52]:
In [53]:
# Every patient with a family-history record also appears in demographics
# (True means demographics fully covers family_hist_list).
set(family_hist_list).issubset(demographics)
Out[53]:
In [20]:
# Create Date variable
#data['encounters']['Enc_Date'] = pd.to_datetime([datetime.strftime(item, '%Y-%m-%d') for item in data['encounters']['Enc_Timestamp']])
In [54]:
def _sorted_encounters(person_rows):
    """Return one patient's encounters as dicts, ordered by encounter timestamp."""
    records = [
        {'Enc_Nbr': enc_nbr, 'Enc_Date': stamp}
        for enc_nbr, stamp in zip(person_rows.Enc_Nbr, person_rows.Enc_Timestamp)
    ]
    records.sort(key=lambda record: record['Enc_Date'])
    return records


# Encounters per patient, earliest first:
#   {Person_Nbr: [{'Enc_Nbr': ..., 'Enc_Date': ...}, ...]}
Enc_list = {
    person_nbr: _sorted_encounters(person_rows)
    for person_nbr, person_rows in data['encounters'].groupby('Person_Nbr')
}
In [55]:
Enc_list[109227]
Out[55]:
In [56]:
# Every patient with encounter records also appears in demographics
# (True means demographics fully covers the encounters).
set(Enc_list) <= set(demographics)
Out[56]:
In [57]:
# Every patient with a family-history record also has encounter records
# (True means the encounters fully cover family_hist_list).
set(family_hist_list).issubset(Enc_list)
Out[57]:
In [58]:
# Build one profile per patient in demographics, attaching family history and
# encounter records where they exist.
# NOTE: profile_full[k] is the very same dict object as demographics[k] (no
# copy is made), so every key added here is also visible through `demographics`.
current_year = datetime.now().year  # read the clock once, not once per patient
profile_full = {}
for k, v in demographics.items():
    profile_full[k] = v

    # A patient may or may not have a family history.
    # NOTE(review): the "no records" default is an empty dict although real
    # values are lists; kept as-is so stored values do not change.
    history = family_hist_list.get(k, {})
    v['family_hist_list'] = history
    v['family_hist_list_count'] = len(history)

    # A patient may or may not have encounter records.
    encounters = Enc_list.get(k, {})
    v['Enc_list'] = encounters
    v['Enc_list_count'] = len(encounters)
    # Calendar years between the earliest encounter and now; Enc_list[k] is
    # sorted by Enc_Date, so element 0 is the earliest. 0 when there are none.
    v['Enc_list_span'] = current_year - encounters[0]['Enc_Date'].year if encounters else 0
In [59]:
profile_full[109227]
Out[59]:
In [58]:
len(profile_full)
Out[58]:
Shall we remove the profiles that have no family history and no encounters?
In [26]:
# Patients in demographics without any family-history record.
# That is to remove 2975 patients
len(set(demographics).difference(family_hist_list))
Out[26]:
In [27]:
# Patients that have both family-history and encounter records.
# So that only 14044 patients left have both records
len(set(family_hist_list).intersection(Enc_list))
Out[27]:
In [28]:
# Patients in demographics without any encounter record.
len(set(demographics).difference(Enc_list))
Out[28]:
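We have 17,019 patients in total. 510 of them have no encounter records (and, because everyone with a family history also has encounters, no family history either). A further 2,465 have encounter records but no family history. If we want a profile in which everyone has both kinds of record, we need to remove all 2,975 (510 + 2,465) patients.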
In [60]:
# Keep only patients that have BOTH family-history and encounter records.
both_ids = set(family_hist_list) & set(Enc_list)  # built once, not once per patient
current_year = datetime.now().year
profile = {}
for k, v in demographics.items():
    if k not in both_ids:
        continue
    # Shallow copy so this profile does not share (and mutate) the dict held in
    # `demographics`.
    entry = dict(v)
    entry['family_hist_list'] = family_hist_list[k]
    entry['family_hist_list_count'] = len(family_hist_list[k])
    entry['Enc_list'] = Enc_list[k]
    entry['Enc_list_count'] = len(Enc_list[k])
    # Calendar years between the earliest encounter and now; Enc_list[k] is
    # sorted by Enc_Date, so element 0 is the earliest. The span is stored on
    # this profile itself rather than on profile_full.
    entry['Enc_list_span'] = current_year - Enc_list[k][0]['Enc_Date'].year
    profile[k] = entry
In [61]:
profile[109227]
Out[61]:
In [31]:
len(profile)
Out[31]:
(Alternatively, we can remove only the 510 patients that lack encounter records and leave the family-history requirement aside for now.)
In [62]:
# Keep every patient that has encounter records; family history is optional.
current_year = datetime.now().year
profile1 = {}
for k, v in demographics.items():
    if k not in Enc_list:  # direct dict lookup; no set rebuilt per patient
        continue
    # Shallow copy so this profile does not share (and mutate) the dict held in
    # `demographics`.
    entry = dict(v)
    entry['Enc_list'] = Enc_list[k]
    entry['Enc_list_count'] = len(Enc_list[k])
    # Calendar years between the earliest encounter and now; Enc_list[k] is
    # sorted by Enc_Date, so element 0 is the earliest. The span is stored on
    # this profile itself rather than on profile_full.
    entry['Enc_list_span'] = current_year - Enc_list[k][0]['Enc_Date'].year
    # NOTE(review): the "no family history" default is an empty dict although
    # real values are lists; kept as-is so stored values do not change.
    history = family_hist_list.get(k, {})
    entry['family_hist_list'] = history
    entry['family_hist_list_count'] = len(history)
    profile1[k] = entry
In [63]:
profile1[109227]
Out[63]:
In [34]:
len(profile1)
Out[34]:
In [66]:
# Tabular view of profile1: one row per key of profile1 (orient='index'), one
# column per field of the per-patient dicts.
temp=pd.DataFrame.from_dict(profile1, orient='index')
temp.head()
Out[66]:
In [67]:
# Persist the per-patient profile table under `path`.
temp.to_pickle(path+'person_profile_df.pickle')
In [36]:
data['SNOMED_problem_list'].head()
Out[36]:
In [64]:
# Snomed codes recorded for one example patient. get_group pulls just that
# patient's rows instead of materializing a list for every Person_Nbr; an
# unknown id still raises KeyError.
list(data['systemic_disease_list'].groupby('Person_Nbr')['Snomed_Code'].get_group(109227))
Out[64]:
In [65]:
# Concept IDs on the problem list of one example patient. get_group pulls just
# that patient's rows instead of materializing a list for every Person_Nbr; an
# unknown id still raises KeyError.
list(data['SNOMED_problem_list'].groupby('Person_Nbr')['Concept_ID'].get_group(109227))
Out[65]:
In [ ]: