In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2
import getpass
import pdvega
# for configuring connection
from configobj import ConfigObj
import os
%matplotlib inline
In [2]:
# Create a database connection using settings from config file
config='../db/config.ini'
# connection info
conn_info = dict()
if os.path.isfile(config):
config = ConfigObj(config)
conn_info["sqluser"] = config['username']
conn_info["sqlpass"] = config['password']
conn_info["sqlhost"] = config['host']
conn_info["sqlport"] = config['port']
conn_info["dbname"] = config['dbname']
conn_info["schema_name"] = config['schema_name']
else:
conn_info["sqluser"] = 'postgres'
conn_info["sqlpass"] = ''
conn_info["sqlhost"] = 'localhost'
conn_info["sqlport"] = 5432
conn_info["dbname"] = 'eicu'
conn_info["schema_name"] = 'public,eicu_crd'
# Connect to the eICU database
print('Database: {}'.format(conn_info['dbname']))
print('Username: {}'.format(conn_info["sqluser"]))
if conn_info["sqlpass"] == '':
# try connecting without password, i.e. peer or OS authentication
try:
if (conn_info["sqlhost"] == 'localhost') & (conn_info["sqlport"]=='5432'):
con = psycopg2.connect(dbname=conn_info["dbname"],
user=conn_info["sqluser"])
else:
con = psycopg2.connect(dbname=conn_info["dbname"],
host=conn_info["sqlhost"],
port=conn_info["sqlport"],
user=conn_info["sqluser"])
except:
conn_info["sqlpass"] = getpass.getpass('Password: ')
con = psycopg2.connect(dbname=conn_info["dbname"],
host=conn_info["sqlhost"],
port=conn_info["sqlport"],
user=conn_info["sqluser"],
password=conn_info["sqlpass"])
query_schema = 'set search_path to ' + conn_info['schema_name'] + ';'
In [4]:
patientunitstayid = 242380
In [11]:
query = query_schema + """
select *
from intakeoutput
where patientunitstayid = {}
order by intakeoutputoffset
""".format(patientunitstayid)
df = pd.read_sql_query(query, con)
df.head()
Out[11]:
Above we can see that the type of data recorded is described by the cellpath
. cellpath
is hierarchical, with pipes (|
) separating hierarchies. As expected, most data here will fall under the I&O hierarchy. We can see the patient body weight is recorded in both pounds (lbs) and kilograms (kg). The patient's urine output is also documented.
In [13]:
df_uo = df.loc[df['celllabel'].str.contains('Urine'), :].copy()
df_uo['uo'] = pd.to_numeric(df_uo['cellvaluenumeric'], errors='coerce')
df_uo['uo'] = df_uo['uo'].cumsum()
cols = ['uo']
df_uo.set_index('intakeoutputoffset')[cols].vgplot()
In [14]:
cols = ['intaketotal', 'outputtotal', 'dialysistotal', 'nettotal']
df.set_index('intakeoutputoffset')[cols].vgplot()
Of course, it is unlikely that the patient has good urine output for almost 20 hours with no corresponding fluid intake - and even less likely that urine output is the only factor affecting patient fluid balance and likely the nettotal
column is a naive aggregation of information documented in the intakeOutput table. Indeed, we can see from the infusionDrug table that the patient is receiving both heparin and nitroglycerin, which should be factored in as inputs when calculating patient fluid balance.
In [8]:
query = query_schema + """
select
pt.hospitalid
, count(distinct pt.patientunitstayid) as number_of_patients
, count(distinct a.patientunitstayid) as number_of_patients_with_tbl
from patient pt
left join intakeoutput a
on pt.patientunitstayid = a.patientunitstayid
group by pt.hospitalid
""".format(patientunitstayid)
df = pd.read_sql_query(query, con)
df['data completion'] = df['number_of_patients_with_tbl'] / df['number_of_patients'] * 100.0
df.sort_values('number_of_patients_with_tbl', ascending=False, inplace=True)
df.head(n=10)
Out[8]:
In [9]:
df[['data completion']].vgplot.hist(bins=10,
var_name='Number of hospitals',
value_name='Percent of patients with data')
Above we can see that around 10 hospitals have very few to no patients documented in the intakeOutput table (left side of histogram, 0-10% bin), while over 120 hospitals have 90-100% of patients with data in intakeOutput (right side of histogram, 0-90% bin).