The following columns are available:
We recommend configuring the config.ini
file to allow for connection to the database without specifying your password each time.
In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2
import getpass
import pdvega
# for configuring connection
from configobj import ConfigObj
import os
%matplotlib inline
In [2]:
# Create a database connection using settings from config file
config='../db/config.ini'
# connection info
conn_info = dict()
if os.path.isfile(config):
config = ConfigObj(config)
conn_info["sqluser"] = config['username']
conn_info["sqlpass"] = config['password']
conn_info["sqlhost"] = config['host']
conn_info["sqlport"] = config['port']
conn_info["dbname"] = config['dbname']
conn_info["schema_name"] = config['schema_name']
else:
conn_info["sqluser"] = 'postgres'
conn_info["sqlpass"] = ''
conn_info["sqlhost"] = 'localhost'
conn_info["sqlport"] = 5432
conn_info["dbname"] = 'eicu'
conn_info["schema_name"] = 'public,eicu_crd'
# Connect to the eICU database
print('Database: {}'.format(conn_info['dbname']))
print('Username: {}'.format(conn_info["sqluser"]))
if conn_info["sqlpass"] == '':
# try connecting without password, i.e. peer or OS authentication
try:
if (conn_info["sqlhost"] == 'localhost') & (conn_info["sqlport"]=='5432'):
con = psycopg2.connect(dbname=conn_info["dbname"],
user=conn_info["sqluser"])
else:
con = psycopg2.connect(dbname=conn_info["dbname"],
host=conn_info["sqlhost"],
port=conn_info["sqlport"],
user=conn_info["sqluser"])
except:
conn_info["sqlpass"] = getpass.getpass('Password: ')
con = psycopg2.connect(dbname=conn_info["dbname"],
host=conn_info["sqlhost"],
port=conn_info["sqlport"],
user=conn_info["sqluser"],
password=conn_info["sqlpass"])
query_schema = 'set search_path to ' + conn_info['schema_name'] + ';'
In [3]:
patientunitstayid = 2704494
In [4]:
query = query_schema + """
select *
from admissiondrug
where patientunitstayid = {}
order by drugoffset
""".format(patientunitstayid)
df = pd.read_sql_query(query, con)
df.head()
Out[4]:
In [5]:
# Look at a subset of columns
cols = ['admissiondrugid','patientunitstayid','drugoffset','drugenteredoffset','drugname','drughiclseqno']
df[cols].head()
Out[5]:
Here we can see that these drugs were documented 2153 minutes (1.5 days) after ICU admission, but administered 87132 minutes (60 days) before ICU admission (thus, the negative offset). Since it's reasonable to assume the patient is still taking the drug (as this is the admissiondrug table), drugoffset
can likely be treated as a start time for a prescription of the drug.
Let's look for patients who were admitted on Zaroxolyn.
In [6]:
drug = 'ZAROXOLYN'
query = query_schema + """
select
admissiondrugid, patientunitstayid
, drugoffset, drugenteredoffset
, drugname, drughiclseqno
from admissiondrug
where drugname = '{}'
""".format(drug)
df_drug = pd.read_sql_query(query, con)
df_drug.set_index('admissiondrugid',inplace=True)
print('{} unit stays with {}.'.format(df_drug['patientunitstayid'].nunique(), drug))
Instead of using the drug name, we could try to use the HICL code.
In [7]:
hicl = 3663
query = query_schema + """
select
admissiondrugid, patientunitstayid
, drugoffset, drugenteredoffset
, drugname, drughiclseqno
from admissiondrug
where drughiclseqno = {}
""".format(hicl)
df_hicl = pd.read_sql_query(query, con)
df_hicl.set_index('admissiondrugid',inplace=True)
print('{} unit stays with HICL = {}.'.format(df_hicl['patientunitstayid'].nunique(), hicl))
As we can see, using the HICL returned many more observations. Let's take a look at a few:
In [8]:
# rows in HICL which are *not* in the drug dataframe
idx = ~df_hicl.index.isin(df_drug.index)
# count the drug names
df_hicl.loc[idx, 'drugname'].value_counts()
Out[8]:
All the rows use the drug name "Metolazone". Metolazone is the generic name for the brand Zaroxolyn. This demonstrates the utility of using HICL codes to identify drugs - synonyms like these are very common and can be tedious to find.
In [9]:
query = query_schema + """
select
pt.hospitalid
, count(pt.patientunitstayid) as number_of_patients
, count(ad.patientunitstayid) as number_of_patients_with_admdrug
from patient pt
left join admissiondrug ad
on pt.patientunitstayid = ad.patientunitstayid
group by pt.hospitalid
""".format(patientunitstayid)
df = pd.read_sql_query(query, con)
df['data completion'] = df['number_of_patients_with_admdrug'] / df['number_of_patients'] * 100.0
df.sort_values('number_of_patients_with_admdrug', ascending=False, inplace=True)
df.head(n=10)
Out[9]:
In [10]:
df[['data completion']].vgplot.hist(bins=10,
var_name='Number of hospitals',
value_name='Percent of patients with data')
As is common in eICU-CRD, there are a subset of hospitals who routinely utilize this portion of the medical record (and thus have 90-100% data completion), while there are other hospitals who rarely use this interface and thus have poor data completion (0-10%).