In [2]:
%matplotlib inline
import pandas as pd
from pandas import DataFrame
import datetime
import numpy as np
from collections import Counter
In [3]:
table_names = ['all_encounter_data', 'demographics', 'encounters',
'family_hist_for_Enc','family_hist_list',
'ICD_for_Enc',
'macula_findings_for_Enc','SL_Lens_for_Enc',
'SNOMED_problem_list', 'systemic_disease_for_Enc', 'systemic_disease_list']
In [5]:
path = 'ICO_data/'
dfs = [pd.read_pickle(path + name + '.pickle') if name != 'ICD_list' else None
for name in table_names]
In [6]:
duplicated_enc=sorted({k:list(v) for k,v in dfs[0].groupby('Enc_Nbr')['Person_Nbr'] if len(v)>1}.items(),
key=lambda x:len(x[1]), reverse=True)
print('Duplicated encounter amount:', len(duplicated_enc))
print('Most frequency: ', len(duplicated_enc[0][1]))
print('Top 10 frequent encounter: \n', 'Enc_Nbr\tduplicated Person_Nbr list')
import pprint
pprint.pprint(duplicated_enc[0:10])
print('Example of the encounter with most frequent duplicated occurence.')
dfs[0][dfs[0].Enc_Nbr==duplicated_enc[0][0]].head()
Out[6]:
In [7]:
print('Person_ID column name of', table_names[1], ': ',dfs[1].columns.values[0])
print('Person_ID column name of', table_names[2], ': ',dfs[2].columns.values[0])
person_dict_correct={k:list(v) for k,v in
dfs[0][['Person_ID','Person_Nbr']].drop_duplicates().groupby('Person_Nbr')['Person_ID']}
person_dict_demographics={k:list(v) for k,v in
dfs[1].ix[:,0:2].drop_duplicates().groupby('Person_Nbr')[dfs[1].columns.values[0]]}
compare_demographics={k:v==person_dict_demographics[k] for k,v in
person_dict_correct.items() if k in person_dict_demographics.keys()}
print('Look up Person_ID in all_encounter_data and demographics, and compare its related Person_Nbr to see if equal')
from collections import Counter
print('result is:')
print(Counter(compare_demographics.values()))
person_dict_encounters={k:list(v) for k,v in
dfs[2].ix[:,0:2].drop_duplicates().groupby('Person_Nbr')[dfs[2].columns.values[0]]}
compare_encounters={k:v==person_dict_encounters[k] for k,v in
person_dict_correct.items() if k in person_dict_encounters.keys()}
print('Look up Person_ID in all_encounter_data and encounters, and compare its related Person_Nbr to see if equal')
print('result is:')
print(Counter(compare_encounters.values()))
print('It indicates that the whole column of person id in two tables are wrong')
In [9]:
# Check all of the encounter ID's that show up multiple times in the table for where values are unique.
d_enc = dfs[0]
unique_key_count = {}
for key in d_enc[d_enc.duplicated(subset="Enc_ID")]["Enc_ID"].unique():
table=d_enc[d_enc["Enc_ID"]==key]
for column in list(d_enc):
if table[column].unique().size != 1:
unique_key_count[column] = unique_key_count.setdefault(column,0)+1
#print d_enc[d_enc["Enc_ID"]==key].loc[:,["Enc_ID",column]]
unique_key_count
Out[9]:
In [22]:
import re
pattern = re.compile("\d+\s*\/\s*\d+")
dfs[0][dfs[0]['Glucose'].str.contains(pattern, na=False)].head()
Out[22]: