In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2
import getpass
import pdvega
# for configuring connection
from configobj import ConfigObj
import os
%matplotlib inline
In [2]:
# Create a database connection using settings from config file
config='../db/config.ini'
# connection info
conn_info = dict()
if os.path.isfile(config):
config = ConfigObj(config)
conn_info["sqluser"] = config['username']
conn_info["sqlpass"] = config['password']
conn_info["sqlhost"] = config['host']
conn_info["sqlport"] = config['port']
conn_info["dbname"] = config['dbname']
conn_info["schema_name"] = config['schema_name']
else:
conn_info["sqluser"] = 'postgres'
conn_info["sqlpass"] = ''
conn_info["sqlhost"] = 'localhost'
conn_info["sqlport"] = 5432
conn_info["dbname"] = 'eicu'
conn_info["schema_name"] = 'public,eicu_crd'
# Connect to the eICU database
print('Database: {}'.format(conn_info['dbname']))
print('Username: {}'.format(conn_info["sqluser"]))
if conn_info["sqlpass"] == '':
# try connecting without password, i.e. peer or OS authentication
try:
if (conn_info["sqlhost"] == 'localhost') & (conn_info["sqlport"]=='5432'):
con = psycopg2.connect(dbname=conn_info["dbname"],
user=conn_info["sqluser"])
else:
con = psycopg2.connect(dbname=conn_info["dbname"],
host=conn_info["sqlhost"],
port=conn_info["sqlport"],
user=conn_info["sqluser"])
except:
conn_info["sqlpass"] = getpass.getpass('Password: ')
con = psycopg2.connect(dbname=conn_info["dbname"],
host=conn_info["sqlhost"],
port=conn_info["sqlport"],
user=conn_info["sqluser"],
password=conn_info["sqlpass"])
query_schema = 'set search_path to ' + conn_info['schema_name'] + ';'
In [3]:
patientunitstayid = 287822
In [4]:
query = query_schema + """
select *
from allergy
where patientunitstayid = {}
""".format(patientunitstayid)
df = pd.read_sql_query(query, con)
df.head()
Out[4]:
In [5]:
# Look at a subset of columns
cols = ['allergyid','patientunitstayid','allergyoffset','allergyenteredoffset',
'allergynotetype', 'usertype', 'writtenineicu',
'drugname','allergytype','allergyname']
df[cols].head()
Out[5]:
Here we can see that this patient had an admission note highlighting they were allergic to nickel, tetracycline, ampicillin, and clindamycin.
Let's look for patients who were allergic to tetracycline.
In [6]:
drug = 'Tetracycline'
query = query_schema + """
select
allergyid, patientunitstayid
, allergyoffset, allergyenteredoffset
, allergytype, allergyname
, drugname, drughiclseqno
from allergy
where allergyname = '{}'
""".format(drug)
df = pd.read_sql_query(query, con)
df.set_index('allergyid',inplace=True)
print('{} unit stays with allergy to {}.'.format(df['patientunitstayid'].nunique(), drug))
df.head()
Out[6]:
However, it's also possible that they used mixed case, pluralization, or specified more than that specific string. We can use a string comparison to look for allergies like tetracycline.
In [7]:
drug = 'Tetracycline'
query = query_schema + """
select
allergyid, patientunitstayid
, allergyoffset, allergyenteredoffset
, allergytype, allergyname
, drugname, drughiclseqno
from allergy
where lower(allergyname) like '%{}%'
""".format(drug.lower())
df = pd.read_sql_query(query, con)
df.set_index('allergyid',inplace=True)
print('{} unit stays with allergy to {}.'.format(df['patientunitstayid'].nunique(), drug))
df['allergyname'].value_counts()
Out[7]:
It's also possible that they specified the allergy under the drugname
column.
In [8]:
drug = 'Tetracycline'
query = query_schema + """
select
allergyid, patientunitstayid
, allergyoffset, allergyenteredoffset
, allergytype, allergyname
, drugname, drughiclseqno
from allergy
where lower(drugname) like '%{}%'
""".format(drug.lower())
df = pd.read_sql_query(query, con)
df.set_index('allergyid',inplace=True)
print('{} unit stays with allergy to {} specified in drugname.'.format(df['patientunitstayid'].nunique(), drug))
print(df['drugname'].value_counts())
df.head()
Out[8]:
Since we may not capture all spellings or brands for tetracycline, we can try to use the HICL code to identify other observations. Above, we can see the HICL for tetracycline is 5236.
In [9]:
hicl = 5236
query = query_schema + """
select
allergyid, patientunitstayid
, allergyoffset, allergyenteredoffset
, allergytype, allergyname
, drugname, drughiclseqno
from allergy
where drughiclseqno = {}
""".format(hicl)
df = pd.read_sql_query(query, con)
df.set_index('allergyid',inplace=True)
print('{} unit stays with allergy to HICL {} specified in drugname.'.format(df['patientunitstayid'].nunique(), hicl))
print(df['drugname'].value_counts())
df.head()
Out[9]:
Let's combine all these methods and summarize how many patients are identified using each method.
In [10]:
hicl = 5236
drugname = 'Tetracycline'
allergyname = 'Tetracycline'
query = query_schema + """
select
patientunitstayid
, max(case when drughiclseqno = {} then 1 else 0 end) as hicl_match
, max(case when lower(drugname) like '%{}%' then 1 else 0 end) as drug_match
, max(case when lower(allergyname) like '%{}%' then 1 else 0 end) as allergy_match
from allergy
group by patientunitstayid
""".format(hicl, drugname.lower(), allergyname.lower())
df = pd.read_sql_query(query, con)
# drop non-matching rows
idx = (df['hicl_match'] == 1) | (df['drug_match'] == 1) | (df['allergy_match'] == 1)
df = df.loc[idx, :]
df.groupby(['hicl_match', 'drug_match', 'allergy_match']).count()
Out[10]:
As we can see, using the allergyname
column was the most effective, and always identified patients allergic to tetracycline. Unfortunately we know this is an incomplete search, as likely providers will document brand names rather than generic names from time to time. For example, tetracyclin is marketed under the name Sumycin among others. We can search for Sumycin in the data:
In [11]:
drug = 'Sumycin'
query = query_schema + """
select
allergyid, patientunitstayid
, allergyoffset, allergyenteredoffset
, allergytype, allergyname
, drugname, drughiclseqno
from allergy
where lower(allergyname) like '%{}%'
""".format(drug.lower())
df = pd.read_sql_query(query, con)
df.set_index('allergyid',inplace=True)
print('{} unit stays with allergy to {}.'.format(df['patientunitstayid'].nunique(), drug))
df['allergyname'].value_counts()
Out[11]:
Happily, in this case, only 1 patient is excluded by not searching for Sumycin, but in general it may be more.
In [12]:
query = query_schema + """
select
pt.hospitalid
, count(distinct pt.patientunitstayid) as number_of_patients
, count(distinct a.patientunitstayid) as number_of_patients_with_tbl
from patient pt
left join allergy a
on pt.patientunitstayid = a.patientunitstayid
group by pt.hospitalid
""".format(patientunitstayid)
df = pd.read_sql_query(query, con)
df['data completion'] = df['number_of_patients_with_tbl'] / df['number_of_patients'] * 100.0
df.sort_values('number_of_patients_with_tbl', ascending=False, inplace=True)
df.head(n=10)
Out[12]:
In [13]:
df[['data completion']].vgplot.hist(bins=10,
var_name='Number of hospitals',
value_name='Percent of patients with data')
As is common in eICU-CRD, there are a subset of hospitals who routinely utilize this portion of the medical record (and thus have high data completion), while there are other hospitals who rarely use this interface and thus have poor data completion (0-10%). Data completion is conflated with the fact that not all patients will have an allergy, and thus it is expected that documentation not be >90%.