In [2]:
%matplotlib inline

import pandas as pd
from pandas import DataFrame
import datetime
import numpy as np
from collections import Counter

In [3]:
table_names = ['all_encounter_data', 'demographics', 'encounters', 
               'family_hist_for_Enc','family_hist_list', 
               'ICD_for_Enc', 
               'macula_findings_for_Enc','SL_Lens_for_Enc', 
               'SNOMED_problem_list', 'systemic_disease_for_Enc', 'systemic_disease_list']

In [5]:
path = 'ICO_data/'
dfs = [pd.read_pickle(path + name + '.pickle') if name != 'ICD_list' else None 
       for name in table_names]

1. Duplicated-case bug in all_encounter_data


In [6]:
duplicated_enc=sorted({k:list(v) for k,v in dfs[0].groupby('Enc_Nbr')['Person_Nbr'] if len(v)>1}.items(), 
                      key=lambda x:len(x[1]), reverse=True)
print('Duplicated encounter amount:', len(duplicated_enc))
print('Most frequency: ', len(duplicated_enc[0][1]))
print('Top 10 frequent encounter: \n', 'Enc_Nbr\tduplicated Person_Nbr list')
import pprint
pprint.pprint(duplicated_enc[0:10])
print('Example of the encounter with most frequent duplicated occurence.')
dfs[0][dfs[0].Enc_Nbr==duplicated_enc[0][0]].head()


('Duplicated encounter amount:', 1320)
('Most frequency: ', 6)
('Top 10 frequent encounter: \n', 'Enc_Nbr\tduplicated Person_Nbr list')
[(3323961, [642646, 642646, 642646, 642646, 642646, 642646]),
 (13193361, [938790, 938790, 938790, 938790, 938790, 938790]),
 (10023437, [600601, 600601, 600601, 600601, 600601, 600601]),
 (83734, [416597, 416597, 416597, 416597, 416597, 416597]),
 (1859931, [555680, 555680, 555680, 555680]),
 (3348565, [150929, 150929, 150929, 150929]),
 (7155803, [814160, 814160, 814160, 814160]),
 (6969441, [416558, 416558, 416558, 416558]),
 (3711105, [539050, 539050, 539050, 539050]),
 (5511351, [884266, 884266, 884266, 884266])]
Example of the encounter with most frequent duplicated occurence.
Out[6]:
Enc_ID Enc_Nbr Enc_Date Person_ID Person_Nbr Primary_Payer Smoking_Status BMI BP Glucose ... CYCLO_OD_SPH CYCLO_OD_CYL CYCLO_OD_AXIS CYCLO_OD_DVA CYCLO_OD_NVA CYCLO_OS_SPH CYCLO_OS_CYL CYCLO_OS_AXIS CYCLO_OS_DVA CYCLO_OS_NVA
70045 6f1f8e83-4d29-15cb-e38b-2371c66bed8b 3323961 2015-02-14 14:30:00 2d903e5a-3701-141e-7e6e-78dd4a00cd65 642646 Davis CCN Vision Plan Former smoker NaN NaN 168 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
70046 6f1f8e83-4d29-15cb-e38b-2371c66bed8b 3323961 2015-02-14 14:30:00 2d903e5a-3701-141e-7e6e-78dd4a00cd65 642646 Davis CCN Vision Plan Former smoker NaN NaN 190 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
70047 6f1f8e83-4d29-15cb-e38b-2371c66bed8b 3323961 2015-02-14 14:30:00 2d903e5a-3701-141e-7e6e-78dd4a00cd65 642646 Davis CCN Vision Plan Former smoker NaN 84 / 47 168 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
70070 6f1f8e83-4d29-15cb-e38b-2371c66bed8b 3323961 2015-02-14 14:30:00 2d903e5a-3701-141e-7e6e-78dd4a00cd65 642646 Davis CCN Vision Plan Former smoker NaN 84 / 47 190 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
70071 6f1f8e83-4d29-15cb-e38b-2371c66bed8b 3323961 2015-02-14 14:30:00 2d903e5a-3701-141e-7e6e-78dd4a00cd65 642646 Davis CCN Vision Plan Former smoker NaN NaN 168 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 41 columns

2. Unicode bug in Person_id of encounters and demographics


In [7]:
print('Person_ID column name of', table_names[1], ': ',dfs[1].columns.values[0])
print('Person_ID column name of', table_names[2], ': ',dfs[2].columns.values[0])
person_dict_correct={k:list(v) for k,v in 
                     dfs[0][['Person_ID','Person_Nbr']].drop_duplicates().groupby('Person_Nbr')['Person_ID']}
person_dict_demographics={k:list(v) for k,v in 
                          dfs[1].ix[:,0:2].drop_duplicates().groupby('Person_Nbr')[dfs[1].columns.values[0]]}
compare_demographics={k:v==person_dict_demographics[k] for k,v in 
                      person_dict_correct.items() if k in person_dict_demographics.keys()}
print('Look up Person_ID in all_encounter_data and demographics, and compare its related Person_Nbr to see if equal')
from collections import Counter
print('result is:')
print(Counter(compare_demographics.values()))

person_dict_encounters={k:list(v) for k,v in 
                          dfs[2].ix[:,0:2].drop_duplicates().groupby('Person_Nbr')[dfs[2].columns.values[0]]}
compare_encounters={k:v==person_dict_encounters[k] for k,v in 
                      person_dict_correct.items() if k in person_dict_encounters.keys()}
print('Look up Person_ID in all_encounter_data and encounters, and compare its related Person_Nbr to see if equal')
print('result is:')
print(Counter(compare_encounters.values()))

print('It indicates that the whole column of person id in two tables are wrong')


('Person_ID column name of', 'demographics', ': ', '\xef\xbb\xbfPerson_ID')
('Person_ID column name of', 'encounters', ': ', '\xef\xbb\xbfPerson_ID')
Look up Person_ID in all_encounter_data and demographics, and compare its related Person_Nbr to see if equal
result is:
Counter({False: 16087})
Look up Person_ID in all_encounter_data and encounters, and compare its related Person_Nbr to see if equal
result is:
Counter({False: 16087})
It indicates that the whole column of person id in two tables are wrong

3. Multiple Values of A1C, BP, BMI, and Glucose per Enc_ID


In [9]:
# Check all of the encounter ID's that show up multiple times in the table for where values are unique.

d_enc = dfs[0]
unique_key_count = {}

for key in d_enc[d_enc.duplicated(subset="Enc_ID")]["Enc_ID"].unique():
    table=d_enc[d_enc["Enc_ID"]==key]
    for column in list(d_enc):
        if table[column].unique().size != 1:
            unique_key_count[column] = unique_key_count.setdefault(column,0)+1
            #print d_enc[d_enc["Enc_ID"]==key].loc[:,["Enc_ID",column]]
unique_key_count


Out[9]:
{'A1C': 136, 'BMI': 280, 'BP': 991, 'Glucose': 260}

4. Glucose Contains BP Measurements


In [22]:
import re

pattern = re.compile("\d+\s*\/\s*\d+")
dfs[0][dfs[0]['Glucose'].str.contains(pattern, na=False)].head()


Out[22]:
Enc_ID Enc_Nbr Enc_Date Person_ID Person_Nbr Primary_Payer Smoking_Status BMI BP Glucose ... CYCLO_OD_SPH CYCLO_OD_CYL CYCLO_OD_AXIS CYCLO_OD_DVA CYCLO_OD_NVA CYCLO_OS_SPH CYCLO_OS_CYL CYCLO_OS_AXIS CYCLO_OS_DVA CYCLO_OS_NVA
11230 8e40e970-f373-6331-80cb-beed755c62a8 2682098 2014-04-20 02:15:00 31e96813-e876-c8d1-bfce-99b23e93943e 12980 Adv Cigna Health Spring HMO Smoker current status unknown NaN 160 / 100 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
11231 7a3a342f-b558-f7d9-b7a7-b7542f1c2790 1320272 2015-07-06 21:30:00 31e96813-e876-c8d1-bfce-99b23e93943e 12980 Adv Cigna Health Spring HMO Smoker current status unknown NaN 135/ 80 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
46729 8fbd4328-a243-1d87-90ed-6c88a4ec9bde 14365451 2012-03-14 15:45:00 c0fc026c-3aa5-7d9d-39cf-e282a562b797 148299 Adv AHC Wellcare Smoker current status unknown 33.14 140 / 85 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
46735 9c59ee1a-23c1-97e4-dcc8-a8905582677a 7335450 2013-09-16 15:00:00 c0fc026c-3aa5-7d9d-39cf-e282a562b797 148299 Adv AHC Wellcare Smoker current status unknown 32.08 170 / 72 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
80145 9b77cea6-6901-81a5-4559-878d46e52308 9938118 2015-12-18 08:15:00 99e74f8c-b02d-753a-2986-de1e0ff7c453 154404 Opticare IlliniCare ICP Smoker current status unknown NaN 93/ 63 ... NaN UTT NaN NaN NaN NaN +3.50 -4.50 123 NaN

5 rows × 41 columns