In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2
import getpass
import pdvega
# for configuring connection
from configobj import ConfigObj
import os
%matplotlib inline
In [2]:
# Create a database connection using settings from config file
config='../db/config.ini'
# connection info
conn_info = dict()
if os.path.isfile(config):
config = ConfigObj(config)
conn_info["sqluser"] = config['username']
conn_info["sqlpass"] = config['password']
conn_info["sqlhost"] = config['host']
conn_info["sqlport"] = config['port']
conn_info["dbname"] = config['dbname']
conn_info["schema_name"] = config['schema_name']
else:
conn_info["sqluser"] = 'postgres'
conn_info["sqlpass"] = ''
conn_info["sqlhost"] = 'localhost'
conn_info["sqlport"] = 5432
conn_info["dbname"] = 'eicu'
conn_info["schema_name"] = 'public,eicu_crd'
# Connect to the eICU database
print('Database: {}'.format(conn_info['dbname']))
print('Username: {}'.format(conn_info["sqluser"]))
if conn_info["sqlpass"] == '':
# try connecting without password, i.e. peer or OS authentication
try:
if (conn_info["sqlhost"] == 'localhost') & (conn_info["sqlport"]=='5432'):
con = psycopg2.connect(dbname=conn_info["dbname"],
user=conn_info["sqluser"])
else:
con = psycopg2.connect(dbname=conn_info["dbname"],
host=conn_info["sqlhost"],
port=conn_info["sqlport"],
user=conn_info["sqluser"])
except:
conn_info["sqlpass"] = getpass.getpass('Password: ')
con = psycopg2.connect(dbname=conn_info["dbname"],
host=conn_info["sqlhost"],
port=conn_info["sqlport"],
user=conn_info["sqluser"],
password=conn_info["sqlpass"])
query_schema = 'set search_path to ' + conn_info['schema_name'] + ';'
In [3]:
query = query_schema + """
with cpc as ( select distinct patientunitstayid from careplancareprovider )
, cpeol as ( select distinct patientunitstayid from careplaneol )
, cpge as ( select distinct patientunitstayid from careplangeneral )
, cpgo as ( select distinct patientunitstayid from careplangoal )
, cpid as ( select distinct patientunitstayid from careplaninfectiousdisease )
select
pt.hospitalid
, count(distinct pt.patientunitstayid) as number_of_patients
, count(distinct cpc.patientunitstayid) as n_cpc
, count(distinct cpeol.patientunitstayid) as n_cpeol
, count(distinct cpge.patientunitstayid) as n_cpge
, count(distinct cpgo.patientunitstayid) as n_cpgo
, count(distinct cpid.patientunitstayid) as n_cpid
from patient pt
left join cpc
on pt.patientunitstayid = cpc.patientunitstayid
left join cpeol
on pt.patientunitstayid = cpeol.patientunitstayid
left join cpge
on pt.patientunitstayid = cpge.patientunitstayid
left join cpgo
on pt.patientunitstayid = cpgo.patientunitstayid
left join cpid
on pt.patientunitstayid = cpid.patientunitstayid
group by pt.hospitalid
""".format()
df = pd.read_sql_query(query, con)
# convert to percent
cols = ['n_cpc', 'n_cpeol', 'n_cpge', 'n_cpgo', 'n_cpid']
for c in cols:
df[c] = df[c].astype(float) / df['number_of_patients'] * 100.0
df.sort_values('number_of_patients', ascending=False, inplace=True)
df.head(n=10)
Out[3]:
In [5]:
# rename columns within the call for clarity in the figure
column_label = {'n_cpc': 'Care provider',
'n_cpeol': 'EOL',
'n_cpge': 'General',
'n_cpgo': 'Goal',
'n_cpid': 'Infectious disease'}
df[cols].rename(columns=column_label).vgplot.hist(bins=10, stacked=True,
var_name='Number of hospitals',
value_name='Percent of patients with data')
As we can see, the majority of hospitals do not have data for the goal, infectious disease, and EOL tables. Conversely, the care provider and general tables have good coverage.