This notebook is to load tables that appear relevant to my project into 1 sqlite database for ease of querying


In [24]:
import csv
import os
import sys
import sqlite3


# the files to read
mimic_files = ('DIAGNOSES_ICD_DATA_TABLE.csv',
               'D_ICD_PROCEDURES_DATA_TABLE.csv',
               'PATIENTS_DATA_TABLE.csv',
               'D_ICD_DIAGNOSES_DATA_TABLE.csv', 
               'NOTEEVENTS_DATA_TABLE.csv',
               'DRGCODES_DATA_TABLE.csv',
               'D_CPT_DATA_TABLE.csv',
               'CPTEVENTS_DATA_TABLE.csv')

In [25]:
sqlitedb = os.path.join(os.path.expanduser('~'),'Box Sync', 'GradSchoolStuff', 'MastersProject', 'mimic3', 'mimic3.sqlite')
if (os.path.exists(sqlitedb)):
    print("Database already exists - proceed with caution!")
    sys.exit()

These next few cells create the required tables using sql from https://github.com/MIT-LCP/mimic-code/blob/master/buildmimic/postgres/postgres_create_tables.sql


In [26]:
connection = sqlite3.connect(sqlitedb)
with connection:
    cursor = connection.cursor()
    cursor.executescript('''
    drop table if exists DIAGNOSES_ICD;
    CREATE TABLE DIAGNOSES_ICD
    (   ROW_ID INT NOT NULL,
        SUBJECT_ID INT NOT NULL,
        HADM_ID INT NOT NULL,
        SEQ_NUM INT,
        ICD9_CODE VARCHAR(20),
        CONSTRAINT diagnosesicd_rowid_pk PRIMARY KEY (ROW_ID)
    );''')

In [27]:
connection = sqlite3.connect(sqlitedb)
with connection:
    cursor = connection.cursor()
    cursor.executescript('''
    drop table if exists D_ICD_PROCEDURES;
    CREATE TABLE D_ICD_PROCEDURES
    (   ROW_ID INT NOT NULL,
        ICD9_CODE VARCHAR(10) NOT NULL,
        SHORT_TITLE VARCHAR(50) NOT NULL,
        LONG_TITLE VARCHAR(255) NOT NULL,
        CONSTRAINT d_icd_proc_code_unique UNIQUE (ICD9_CODE),
        CONSTRAINT d_icd_proc_rowid_pk PRIMARY KEY (ROW_ID)
    );''')

In [28]:
connection = sqlite3.connect(sqlitedb)
with connection:
    cursor = connection.cursor()
    cursor.executescript('''
    drop table if exists PATIENTS;
    CREATE TABLE PATIENTS
    (   ROW_ID INT NOT NULL,
        SUBJECT_ID INT NOT NULL,
        GENDER VARCHAR(5) NOT NULL,
        DOB TIMESTAMP(0) NOT NULL,
        DOD TIMESTAMP(0),
        DOD_HOSP TIMESTAMP(0),
        DOD_SSN TIMESTAMP(0),
        EXPIRE_FLAG INT NOT NULL,
        CONSTRAINT pat_subid_unique UNIQUE (SUBJECT_ID),
        CONSTRAINT pat_rowid_pk PRIMARY KEY (ROW_ID)
    );''')

In [29]:
connection = sqlite3.connect(sqlitedb)
with connection:
    cursor = connection.cursor()
    cursor.executescript('''
    drop table if exists D_ICD_DIAGNOSES;
    CREATE TABLE D_ICD_DIAGNOSES
    (   ROW_ID INT NOT NULL,
        ICD9_CODE VARCHAR(10) NOT NULL,
        SHORT_TITLE VARCHAR(50) NOT NULL,
        LONG_TITLE VARCHAR(255) NOT NULL,
        CONSTRAINT d_icd_diag_code_unique UNIQUE (ICD9_CODE),
        CONSTRAINT d_icd_diag_rowid_pk PRIMARY KEY (ROW_ID)
    );''')

In [30]:
connection = sqlite3.connect(sqlitedb)
with connection:
    cursor = connection.cursor()
    cursor.executescript('''
    drop table if exists NOTEEVENTS;
    CREATE TABLE NOTEEVENTS
    (   ROW_ID INT NOT NULL,
        SUBJECT_ID INT NOT NULL,
        HADM_ID INT,
        CHARTDATE TIMESTAMP(0),
        CHARTTIME TIMESTAMP(0),
        STORETIME TIMESTAMP(0),
        CATEGORY VARCHAR(50),
        DESCRIPTION VARCHAR(255),
        CGID INT,
        ISERROR CHAR(1),
        TEXT TEXT,
        CONSTRAINT noteevents_rowid_pk PRIMARY KEY (ROW_ID)
    );''')

In [31]:
connection = sqlite3.connect(sqlitedb)
with connection:
    cursor = connection.cursor()
    cursor.executescript('''
    drop table if exists DRGCODES;
    CREATE TABLE DRGCODES
    (   ROW_ID INT NOT NULL,
        SUBJECT_ID INT NOT NULL,
        HADM_ID INT NOT NULL,
        DRG_TYPE VARCHAR(20) NOT NULL,
        DRG_CODE VARCHAR(20) NOT NULL,
        DESCRIPTION VARCHAR(255),
        DRG_SEVERITY SMALLINT,
        DRG_MORTALITY SMALLINT,
        CONSTRAINT drg_rowid_pk PRIMARY KEY (ROW_ID)
    );''')

In [32]:
connection = sqlite3.connect(sqlitedb)
with connection:
    cursor = connection.cursor()
    cursor.executescript('''
    drop table if exists D_CPT;
    CREATE TABLE D_CPT
    (   ROW_ID INT NOT NULL,
        CATEGORY SMALLINT NOT NULL,
        SECTIONRANGE VARCHAR(100) NOT NULL,
        SECTIONHEADER VARCHAR(50) NOT NULL,
        SUBSECTIONRANGE VARCHAR(100) NOT NULL,
        SUBSECTIONHEADER VARCHAR(255) NOT NULL,
        CODESUFFIX VARCHAR(5),
        MINCODEINSUBSECTION INT NOT NULL,
        MAXCODEINSUBSECTION INT NOT NULL,
        CONSTRAINT dcpt_ssrange_unique UNIQUE (SUBSECTIONRANGE),
        CONSTRAINT dcpt_rowid_pk PRIMARY KEY (ROW_ID)
    );''')

In [33]:
connection = sqlite3.connect(sqlitedb)
with connection:
    cursor = connection.cursor()
    cursor.executescript('''
    drop table if exists CPTEVENTS;
    CREATE TABLE CPTEVENTS
    (   ROW_ID INT NOT NULL,
        SUBJECT_ID INT NOT NULL,
        HADM_ID INT NOT NULL,
        COSTCENTER VARCHAR(10) NOT NULL,
        CHARTDATE TIMESTAMP(0),
        CPT_CD VARCHAR(10) NOT NULL,
        CPT_NUMBER INT,
        CPT_SUFFIX VARCHAR(5),
        TICKET_ID_SEQ INT,
        SECTIONHEADER VARCHAR(50),
        SUBSECTIONHEADER VARCHAR(255),
        DESCRIPTION VARCHAR(200),
        CONSTRAINT cpt_rowid_pk PRIMARY KEY (ROW_ID)
    );''')

In [34]:
connection = sqlite3.connect(sqlitedb)
with connection:
    cursor = connection.cursor()
    cursor.execute('select * from sqlite_master')
    row = cursor.fetchone()
    while row:
        print(row)
        row = cursor.fetchone()


('table', 'DIAGNOSES_ICD', 'DIAGNOSES_ICD', 2, 'CREATE TABLE DIAGNOSES_ICD\n    (   ROW_ID INT NOT NULL,\n        SUBJECT_ID INT NOT NULL,\n        HADM_ID INT NOT NULL,\n        SEQ_NUM INT,\n        ICD9_CODE VARCHAR(20),\n        CONSTRAINT diagnosesicd_rowid_pk PRIMARY KEY (ROW_ID)\n    )')
('index', 'sqlite_autoindex_DIAGNOSES_ICD_1', 'DIAGNOSES_ICD', 3, None)
('table', 'D_ICD_PROCEDURES', 'D_ICD_PROCEDURES', 4, 'CREATE TABLE D_ICD_PROCEDURES\n    (   ROW_ID INT NOT NULL,\n        ICD9_CODE VARCHAR(10) NOT NULL,\n        SHORT_TITLE VARCHAR(50) NOT NULL,\n        LONG_TITLE VARCHAR(255) NOT NULL,\n        CONSTRAINT d_icd_proc_code_unique UNIQUE (ICD9_CODE),\n        CONSTRAINT d_icd_proc_rowid_pk PRIMARY KEY (ROW_ID)\n    )')
('index', 'sqlite_autoindex_D_ICD_PROCEDURES_1', 'D_ICD_PROCEDURES', 5, None)
('index', 'sqlite_autoindex_D_ICD_PROCEDURES_2', 'D_ICD_PROCEDURES', 6, None)
('table', 'PATIENTS', 'PATIENTS', 7, 'CREATE TABLE PATIENTS\n    (   ROW_ID INT NOT NULL,\n        SUBJECT_ID INT NOT NULL,\n        GENDER VARCHAR(5) NOT NULL,\n        DOB TIMESTAMP(0) NOT NULL,\n        DOD TIMESTAMP(0),\n        DOD_HOSP TIMESTAMP(0),\n        DOD_SSN TIMESTAMP(0),\n        EXPIRE_FLAG INT NOT NULL,\n        CONSTRAINT pat_subid_unique UNIQUE (SUBJECT_ID),\n        CONSTRAINT pat_rowid_pk PRIMARY KEY (ROW_ID)\n    )')
('index', 'sqlite_autoindex_PATIENTS_1', 'PATIENTS', 8, None)
('index', 'sqlite_autoindex_PATIENTS_2', 'PATIENTS', 9, None)
('table', 'D_ICD_DIAGNOSES', 'D_ICD_DIAGNOSES', 12, 'CREATE TABLE D_ICD_DIAGNOSES\n    (   ROW_ID INT NOT NULL,\n        ICD9_CODE VARCHAR(10) NOT NULL,\n        SHORT_TITLE VARCHAR(50) NOT NULL,\n        LONG_TITLE VARCHAR(255) NOT NULL,\n        CONSTRAINT d_icd_diag_code_unique UNIQUE (ICD9_CODE),\n        CONSTRAINT d_icd_diag_rowid_pk PRIMARY KEY (ROW_ID)\n    )')
('index', 'sqlite_autoindex_D_ICD_DIAGNOSES_1', 'D_ICD_DIAGNOSES', 13, None)
('index', 'sqlite_autoindex_D_ICD_DIAGNOSES_2', 'D_ICD_DIAGNOSES', 14, None)
('table', 'NOTEEVENTS', 'NOTEEVENTS', 15, 'CREATE TABLE NOTEEVENTS\n    (   ROW_ID INT NOT NULL,\n        SUBJECT_ID INT NOT NULL,\n        HADM_ID INT,\n        CHARTDATE TIMESTAMP(0),\n        CHARTTIME TIMESTAMP(0),\n        STORETIME TIMESTAMP(0),\n        CATEGORY VARCHAR(50),\n        DESCRIPTION VARCHAR(255),\n        CGID INT,\n        ISERROR CHAR(1),\n        TEXT TEXT,\n        CONSTRAINT noteevents_rowid_pk PRIMARY KEY (ROW_ID)\n    )')
('index', 'sqlite_autoindex_NOTEEVENTS_1', 'NOTEEVENTS', 17, None)
('table', 'DRGCODES', 'DRGCODES', 18, 'CREATE TABLE DRGCODES\n    (   ROW_ID INT NOT NULL,\n        SUBJECT_ID INT NOT NULL,\n        HADM_ID INT NOT NULL,\n        DRG_TYPE VARCHAR(20) NOT NULL,\n        DRG_CODE VARCHAR(20) NOT NULL,\n        DESCRIPTION VARCHAR(255),\n        DRG_SEVERITY SMALLINT,\n        DRG_MORTALITY SMALLINT,\n        CONSTRAINT drg_rowid_pk PRIMARY KEY (ROW_ID)\n    )')
('index', 'sqlite_autoindex_DRGCODES_1', 'DRGCODES', 19, None)
('table', 'D_CPT', 'D_CPT', 20, 'CREATE TABLE D_CPT\n    (   ROW_ID INT NOT NULL,\n        CATEGORY SMALLINT NOT NULL,\n        SECTIONRANGE VARCHAR(100) NOT NULL,\n        SECTIONHEADER VARCHAR(50) NOT NULL,\n        SUBSECTIONRANGE VARCHAR(100) NOT NULL,\n        SUBSECTIONHEADER VARCHAR(255) NOT NULL,\n        CODESUFFIX VARCHAR(5),\n        MINCODEINSUBSECTION INT NOT NULL,\n        MAXCODEINSUBSECTION INT NOT NULL,\n        CONSTRAINT dcpt_ssrange_unique UNIQUE (SUBSECTIONRANGE),\n        CONSTRAINT dcpt_rowid_pk PRIMARY KEY (ROW_ID)\n    )')
('index', 'sqlite_autoindex_D_CPT_1', 'D_CPT', 21, None)
('index', 'sqlite_autoindex_D_CPT_2', 'D_CPT', 22, None)
('table', 'CPTEVENTS', 'CPTEVENTS', 24, 'CREATE TABLE CPTEVENTS\n    (   ROW_ID INT NOT NULL,\n        SUBJECT_ID INT NOT NULL,\n        HADM_ID INT NOT NULL,\n        COSTCENTER VARCHAR(10) NOT NULL,\n        CHARTDATE TIMESTAMP(0),\n        CPT_CD VARCHAR(10) NOT NULL,\n        CPT_NUMBER INT,\n        CPT_SUFFIX VARCHAR(5),\n        TICKET_ID_SEQ INT,\n        SECTIONHEADER VARCHAR(50),\n        SUBSECTIONHEADER VARCHAR(255),\n        DESCRIPTION VARCHAR(200),\n        CONSTRAINT cpt_rowid_pk PRIMARY KEY (ROW_ID)\n    )')
('index', 'sqlite_autoindex_CPTEVENTS_1', 'CPTEVENTS', 25, None)

In [35]:
for mf in mimic_files:
    file = os.path.join(os.path.expanduser('~'), 'Box Sync', 'GradSchoolStuff', 'MastersProject', 'mimic3', mf)

    if not (os.path.exists(file)):
        print("Specified file does not exist")
        sys.exit()

    csvReader = csv.reader(open(file, newline=''))
    header = next(csvReader)
    print('Columns read from ', mf, ':', header)

    table_name = mf.replace('_DATA_TABLE.csv', '')
    print('Loading to ', table_name)
    
    value_placeholder = ('?,'*len(header))[:-1]
    
    ## load each line from CSV into appropriate table
    connection = sqlite3.connect(sqlitedb)
    with connection:
        for row in csvReader:
            cursor = connection.cursor()
            cursor.execute('insert into ' + table_name + ' values (' + value_placeholder + ')', row)


Columns read from  DIAGNOSES_ICD_DATA_TABLE.csv : ['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'SEQ_NUM', 'ICD9_CODE']
Loading to  DIAGNOSES_ICD
Columns read from  D_ICD_PROCEDURES_DATA_TABLE.csv : ['ROW_ID', 'ICD9_CODE', 'SHORT_TITLE', 'LONG_TITLE']
Loading to  D_ICD_PROCEDURES
Columns read from  PATIENTS_DATA_TABLE.csv : ['ROW_ID', 'SUBJECT_ID', 'GENDER', 'DOB', 'DOD', 'DOD_HOSP', 'DOD_SSN', 'EXPIRE_FLAG']
Loading to  PATIENTS
Columns read from  D_ICD_DIAGNOSES_DATA_TABLE.csv : ['ROW_ID', 'ICD9_CODE', 'SHORT_TITLE', 'LONG_TITLE']
Loading to  D_ICD_DIAGNOSES
Columns read from  NOTEEVENTS_DATA_TABLE.csv : ['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'CHARTTIME', 'STORETIME', 'CATEGORY', 'DESCRIPTION', 'CGID', 'ISERROR', 'TEXT']
Loading to  NOTEEVENTS
Columns read from  DRGCODES_DATA_TABLE.csv : ['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'DRG_TYPE', 'DRG_CODE', 'DESCRIPTION', 'DRG_SEVERITY', 'DRG_MORTALITY']
Loading to  DRGCODES
Columns read from  D_CPT_DATA_TABLE.csv : ['ROW_ID', 'CATEGORY', 'SECTIONRANGE', 'SECTIONHEADER', 'SUBSECTIONRANGE', 'SUBSECTIONHEADER', 'CODESUFFIX', 'MINCODEINSUBSECTION', 'MAXCODEINSUBSECTION']
Loading to  D_CPT
Columns read from  CPTEVENTS_DATA_TABLE.csv : ['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'COSTCENTER', 'CHARTDATE', 'CPT_CD', 'CPT_NUMBER', 'CPT_SUFFIX', 'TICKET_ID_SEQ', 'SECTIONHEADER', 'SUBSECTIONHEADER', 'DESCRIPTION']
Loading to  CPTEVENTS

In [36]:
# now that all the tables are created and data loaded - create indexes
# Got these indexes from 
#    https://github.com/MIT-LCP/mimic-code/blob/master/buildmimic/postgres/postgres_add_indexes.sql
connection = sqlite3.connect(sqlitedb)
with connection:
    cursor = connection.cursor()
    cursor.executescript('''
    drop index IF EXISTS NOTEEVENTS_idx01;
    CREATE INDEX NOTEEVENTS_idx01
        ON NOTEEVENTS (SUBJECT_ID);
    drop index IF EXISTS NOTEEVENTS_idx02;
    CREATE INDEX NOTEEVENTS_idx02
        ON NOTEEVENTS (HADM_ID);
    drop index IF EXISTS NOTEEVENTS_idx05;
    CREATE INDEX NOTEEVENTS_idx05
        ON NOTEEVENTS (CATEGORY);
    
    drop index IF EXISTS DIAGNOSES_ICD_idx01;
    CREATE INDEX DIAGNOSES_ICD_idx01
        ON DIAGNOSES_ICD (SUBJECT_ID);
    drop index IF EXISTS DIAGNOSES_ICD_idx02;
    CREATE INDEX DIAGNOSES_ICD_idx02
        ON DIAGNOSES_ICD (ICD9_CODE);
    drop index IF EXISTS DIAGNOSES_ICD_idx03;
    CREATE INDEX DIAGNOSES_ICD_idx03
        ON DIAGNOSES_ICD (HADM_ID);
        
    drop index IF EXISTS CPTEVENTS_idx01;
    CREATE INDEX CPTEVENTS_idx01
        ON CPTEVENTS (SUBJECT_ID);
    drop index IF EXISTS CPTEVENTS_idx02;
    CREATE INDEX CPTEVENTS_idx02
        ON CPTEVENTS (CPT_CD);
        
    drop index IF EXISTS DRGCODES_idx01;
    CREATE INDEX DRGCODES_idx01
        ON DRGCODES (SUBJECT_ID);
    drop index IF EXISTS DRGCODES_idx02;
    CREATE INDEX DRGCODES_idx02
        ON DRGCODES (DRG_CODE);
    drop index IF EXISTS DRGCODES_idx03;
    CREATE INDEX DRGCODES_idx03
        ON DRGCODES (DESCRIPTION);
    ''')

In [ ]: