The hospital table is a very simple table that summarizes high level hospital information. Unlike other tables, it does not contain any patient identifiers, and instead can only be joined to the patient table using hospitalid
.
Note: many hospitals described in the hospital table have 0 patient admissions in the patient table.
In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2
import getpass
import pdvega
# for configuring connection
from configobj import ConfigObj
import os
%matplotlib inline
In [2]:
# Create a database connection using settings from config file
config='../db/config.ini'
# connection info
conn_info = dict()
if os.path.isfile(config):
config = ConfigObj(config)
conn_info["sqluser"] = config['username']
conn_info["sqlpass"] = config['password']
conn_info["sqlhost"] = config['host']
conn_info["sqlport"] = config['port']
conn_info["dbname"] = config['dbname']
conn_info["schema_name"] = config['schema_name']
else:
conn_info["sqluser"] = 'postgres'
conn_info["sqlpass"] = ''
conn_info["sqlhost"] = 'localhost'
conn_info["sqlport"] = 5432
conn_info["dbname"] = 'eicu'
conn_info["schema_name"] = 'public,eicu_crd'
# Connect to the eICU database
print('Database: {}'.format(conn_info['dbname']))
print('Username: {}'.format(conn_info["sqluser"]))
if conn_info["sqlpass"] == '':
# try connecting without password, i.e. peer or OS authentication
try:
if (conn_info["sqlhost"] == 'localhost') & (conn_info["sqlport"]=='5432'):
con = psycopg2.connect(dbname=conn_info["dbname"],
user=conn_info["sqluser"])
else:
con = psycopg2.connect(dbname=conn_info["dbname"],
host=conn_info["sqlhost"],
port=conn_info["sqlport"],
user=conn_info["sqluser"])
except:
conn_info["sqlpass"] = getpass.getpass('Password: ')
con = psycopg2.connect(dbname=conn_info["dbname"],
host=conn_info["sqlhost"],
port=conn_info["sqlport"],
user=conn_info["sqluser"],
password=conn_info["sqlpass"])
query_schema = 'set search_path to ' + conn_info['schema_name'] + ';'
In [10]:
query = query_schema + """
with tt as
(
select hospitalid, count(*) as n
from patient
group by hospitalid
)
select h.*, coalesce(tt.n, 0) as n
from hospital h
left join tt
on h.hospitalid = tt.hospitalid
"""
df = pd.read_sql_query(query, con)
df.head()
Out[10]:
In [11]:
print('{} hospitals have 0 admissions.'.format(df.loc[df['n']==0,'hospitalid'].nunique()))
In [23]:
def count_with_percent(x, N):
return '{:3d} ({:5.2f}%)'.format(x.count(), x.count()*100.0/N)
In [25]:
for c in ['region','numbedscategory','teachingstatus','region']:
df[c].fillna('Missing',inplace=True)
In [29]:
idx = df['n']!=0
N = np.sum(idx)
for c in ['region','numbedscategory','teachingstatus']:
grp = df.loc[idx, :].groupby(c)['hospitalid']
print('')
print(grp.apply(count_with_percent, N))
Above we can see that the frequency of missing data ranges between 11-18% in the hospital table.