In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2
import getpass
import pdvega
# for configuring connection
from configobj import ConfigObj
import os
%matplotlib inline
In [2]:
# Create a database connection using settings from config file
config='../db/config.ini'
# connection info
conn_info = dict()
if os.path.isfile(config):
config = ConfigObj(config)
conn_info["sqluser"] = config['username']
conn_info["sqlpass"] = config['password']
conn_info["sqlhost"] = config['host']
conn_info["sqlport"] = config['port']
conn_info["dbname"] = config['dbname']
conn_info["schema_name"] = config['schema_name']
else:
conn_info["sqluser"] = 'postgres'
conn_info["sqlpass"] = ''
conn_info["sqlhost"] = 'localhost'
conn_info["sqlport"] = 5432
conn_info["dbname"] = 'eicu'
conn_info["schema_name"] = 'public,eicu_crd'
# Connect to the eICU database
print('Database: {}'.format(conn_info['dbname']))
print('Username: {}'.format(conn_info["sqluser"]))
if conn_info["sqlpass"] == '':
# try connecting without password, i.e. peer or OS authentication
try:
if (conn_info["sqlhost"] == 'localhost') & (conn_info["sqlport"]=='5432'):
con = psycopg2.connect(dbname=conn_info["dbname"],
user=conn_info["sqluser"])
else:
con = psycopg2.connect(dbname=conn_info["dbname"],
host=conn_info["sqlhost"],
port=conn_info["sqlport"],
user=conn_info["sqluser"])
except:
conn_info["sqlpass"] = getpass.getpass('Password: ')
con = psycopg2.connect(dbname=conn_info["dbname"],
host=conn_info["sqlhost"],
port=conn_info["sqlport"],
user=conn_info["sqluser"],
password=conn_info["sqlpass"])
query_schema = 'set search_path to ' + conn_info['schema_name'] + ';'
In [3]:
patientunitstayid = 2704494
In [4]:
query = query_schema + """
select *
from admissiondx
where patientunitstayid = {}
""".format(patientunitstayid)
df = pd.read_sql_query(query, con)
df.head()
Out[4]:
We can see there are only 3 records for this patient. Let's examine the admitdxpath
, admitdxname
, and admitdxtext
values in detail.
In [5]:
for i, row in df.iterrows():
print('Row {}'.format(i+1))
for c in ['admitdxpath', 'admitdxname', 'admitdxtext']:
print(' {}: {}'.format(c, row[c]))
We can see that these rows are providing us: (1) the APACHE diagnosis, (2) auxiliary admission diagnosis information. Row 1 informs us of the organ system (Hematology), row 2 informs us that the patient was not admitted to/from the operating room (O.R.), and row 3 gives us the full APACHE IV admission diagnosis (thrombocytopenia).
In [6]:
query = query_schema + """
select admitdxpath, count(*) as numobs
from admissiondx
group by admitdxpath
order by numobs desc
"""
df_grp = pd.read_sql_query(query, con)
for i, row in df_grp.head(n=10).iterrows():
print('{:6d} {}'.format(row['numobs'], row['admitdxpath']))
Here we can see the most popular information present in the table: admission from OR, admission for elective surgery, popular body systems, and popular diagnoses.
In [7]:
query = query_schema + """
with dx as
(
select distinct patientunitstayid from admissiondx
where admitdxpath like '%|All Diagnosis|%'
)
, apv as
(
select patientunitstayid
from apachepatientresult
where apacheversion = 'IVa'
and apachescore <> '-1'
)
select count(pt.patientunitstayid) as n
, count(dx.patientunitstayid) as n_with_dx
, count(apv.patientunitstayid) as n_with_apachepred
, sum(
case when apv.patientunitstayid is not null
and dx.patientunitstayid is null
then 1 else 0 end) as n_pred_with_no_dx
from patient pt
left join dx
on pt.patientunitstayid = dx.patientunitstayid
left join apv
on pt.patientunitstayid = apv.patientunitstayid
"""
df_ct = pd.read_sql_query(query, con)
df_ct
Out[7]:
Let's look for patients who were admitted with sepsis. First, we search for these diagnoses to manually curate them.
In [8]:
import re
# remove prefixes from admission dx strings for simplicity
def remove_prefix(text, prefix):
return re.sub(r'^{0}'.format(re.escape(prefix)), '', text)
In [9]:
dx = 'sepsis'
query = query_schema + """
select
admitdxpath
, count(*) as numobs
, max(case when lower(admitdxpath) like '%{}%' then 1 else 0 end) as valid_dx
from admissiondx
-- isolate to just diagnoses (not auxiliary info like elective admission)
where admitdxpath like '%|All Diagnosis|%'
group by admitdxpath
order by numobs desc
""".format(dx)
df_dx = pd.read_sql_query(query, con)
print('{} diagnoses in total.'.format(df_dx.shape[0]))
print('{} contain the dx string {}.'.format(df_dx['valid_dx'].sum(), dx))
# trim some of the text from the admitdxpath
df_dx['dx'] = df_dx['admitdxpath'].apply(lambda x: remove_prefix(x, 'admission diagnosis|All Diagnosis|'))
# add operative/non-operative
df_dx['op'] = df_dx['dx'].apply(lambda x: x[0:13] == 'Non-operative')
# strip operative/non-operative text
df_dx['dx'] = df_dx['dx'].apply(lambda x: remove_prefix(x, 'Non-operative|Diagnosis|'))
df_dx['dx'] = df_dx['dx'].apply(lambda x: remove_prefix(x, 'Operative|Diagnosis|'))
df_dx[['dx','numobs','valid_dx']].head()
Out[9]:
Since there are only 426 diagnoses, we can manually review and confirm that we cover all possible sepsis diagnoses. However, even still, keep in mind that these diagnoses are chosen according to the APACHE scoring criteria. For example, if a patient was admitted with sepsis secondary to a surgery, then the admission diagnosis would need to be the surgery they were originally hospitalized for. Thus we can consider these diagnoses specific, but not necessarily sensitive.
In [12]:
sepsis_dx = df_dx.loc[df_dx['valid_dx']==1, 'admitdxpath'].values
sepsis_dx = ["'" + x + "'" for x in sepsis_dx]
print('Using the following sepsis diagnoses:')
print('{}'.format('\n'.join(sepsis_dx)))
sepsis_dx = ','.join(sepsis_dx)
print()
query = query_schema + """
select
patientunitstayid, admitdxpath
from admissiondx
where admitdxpath in
(
{}
)
""".format(sepsis_dx)
df = pd.read_sql_query(query, con)
print('{} admissions found with an admission diagnosis of {}.'.format(df.shape[0], dx))