In [ ]:
import sqlite3
import os
import pysolr
import re
import sys

sqlitedb = os.path.join(os.path.expanduser('~'), 'Box Sync', 'GradSchoolStuff',
                        'MastersProject', 'mimic3', 'mimic3.sqlite')
if not (os.path.exists(sqlitedb)):
    print("Specified database does not exist")
    sys.exit()

connection = sqlite3.connect(sqlitedb)

# Connect to Solr instance.
solr = pysolr.Solr('http://192.168.99.100:8983/solr/radiology')

with connection:
    cur = connection.cursor()
    recordset = cur.execute("""
        SELECT * FROM noteevents
        WHERE description LIKE '%CT%'
            AND description NOT LIKE '%ERECT%'
            AND description NOT LIKE '%PLCT%'
            AND category = 'Radiology'
        ORDER BY row_id
        limit 6000""")
    col_names = [cn[0] for cn in cur.description]

    # general match for section start
    #section_re = re.compile('^\s+([A-Z]+\s*){2,}:*')
    section_re = re.compile('^\s+([A-Z]+\s*){2,}(:|$)+')

    # patterns for reason section
    reason_1 = re.compile('^\s*REASON FOR THIS EXAMINATION:\s*$')
    reason_2 = re.compile('^\s*Reason:')

    # patterns for the impression section
    impression_1 = re.compile('.*(IMPRESSION|IMPRSSION|IMPRESSON|IMRESSION|Impression)(s:|S:|;|:|$)*')
    impression_2 = re.compile('^\s*(OVERALL)?\s*CONCLUSION[S]?[:]?')
    impression_3 = re.compile('^\s*S(UMMARY|ummary):?\s*')

    # pattern for findings section - only using this if none of the above impression
    # sections matched
    finding_1 = re.compile('.*\sFINDING(S)?(:)?')
    finding_2 = re.compile('.*\sPFI(:)?(\sREPORT)?')

    missing_count = 0
    found_count = 0
    total_count = 0
    finding_count = 0
    
    for row in recordset:
        print('evaluating: ', row[col_names.index("ROW_ID")])
        document = {}
        #print('++++++', col_names[idx], ': ', col)
        # want row_id, subject_id, hadm_id, description, text
        document['id'] = row[col_names.index("ROW_ID")]
        document['SUBJECT_ID_i'] = row[col_names.index("SUBJECT_ID")]
        # not all reports have a hospital admission id, so this may not be in Solr
        document['HADM_ID_i'] = row[col_names.index("HADM_ID")]
        # store description as a single string, but also allow for searching parts
        document['DESCRIPTION_s'] = row[col_names.index("DESCRIPTION")]
        document['DESCRIPTION_t'] = row[col_names.index("DESCRIPTION")]
        document['TEXT_t'] = row[col_names.index("TEXT")]

        """
        Out of 108,896 reports, only 4 don't use 'REASON FOR THIS EXAMINATION:'
        or 'Reason:' for the reason for exam section. By manual inspection, those 4
        reports are not ones related to PE nor PAH.

        Reports can have multiple reason sections. From inspection of the data,
          * ^'Reason:' (if it exists) is just a one line section and is a portion of the
            'REASON FOR THIS EXAMINATION:' section (if it exists)
          * ^'REASON FOR THIS EXAMINATION:' (if it exists) is a multi-line section and the actual
            reason starts on the next line. There are 3 cases where this section was repeated and
            in some of the repeated locations the reason may partially be on the same line.

        """
        reason_short = ''
        reason_long = ''
        if ('REASON FOR THIS EXAMINATION:' in row[col_names.index("TEXT")]
            or 'Reason:' in row[col_names.index("TEXT")]
           ):
            found_multiline = ''
            for line in row[col_names.index("TEXT")].rstrip().split('\n'):
                # multi-line reason section start -------------
                if reason_1.match(line) is not None:
                    found_multiline = True
                # single line reason section -------------
                elif reason_2.match(line) is not None:
                    reason_short = line.replace('Reason:', '').strip()
                # multi-line reason section continuation -------------
                elif found_multiline:
                    # should add to reason_long until end of section
                    if re.match('^\s\s\w*', line) is not None:
                        reason_long += line
                    else:
                        found_multiline = False
            if len(reason_long.strip()) > 0:
                document['REASON_t'] = reason_long.strip()
                #print(row[col_names.index("ROW_ID")], 'reason_long', reason_long.strip())
            else:
                document['REASON_t'] = reason_short
                #print(row[col_names.index("ROW_ID")], 'reason_short', reason_short)
        else:
            print('------------reason missing for', row[col_names.index("ROW_ID")])

        '''
        Identify impression section. Generally starts with 'IMPRESSION:' followed by newline, then text
        found some that have colon on next line...

        Found some have 'CONCLUSION' instead of 'IMPRESSION'

        Some of those that didn't have either IMPRESSION nor CONCLUSION had 'Summary'

        12 out of 500 selected by ordered row_id did not match the above sections - most of those have a
        findings section that appears to cover both the detailed radiological findings and communication
        to physician

        Some documents have a 'PROVISIONAL FINDINGS IMPRESSION (PFI):' section that we want to avoid.

        Found some reports that have 2 impression sections - an original and an updated one. Probably need both

        Some missing IMPRESSION and CONCLUSION, just have FINDINGS section. Some have 'FINDINGS' section
        then FINDINGS again when it should probably be IMPRESSION. Should that be a substitute? Only if
        none of the other 'IMPRESSION' like sections have been found. When looking for FINDINGS section,
        start at bottom and read up.
        '''
        found_impression = False
        found_summary = False
        found_conclusion = False
        found_findings = False
        skip_section = False
        if ('IMPRESSION' in row[col_names.index("TEXT")]
            or 'IMPRSSION' in row[col_names.index("TEXT")]
            or 'IMPRESSON' in row[col_names.index("TEXT")]
            or 'IMRESSION' in row[col_names.index("TEXT")]
            or 'Impression' in row[col_names.index("TEXT")]
            or 'CONCLUSION' in row[col_names.index("TEXT")]
            or 'SUMMARY' in row[col_names.index("TEXT")]
            or 'Summary' in row[col_names.index("TEXT")]
           ):
            found_count += 1
            found_match_per_record = 0
            section = ''
            found_section = False
            for line in row[col_names.index("TEXT")].rstrip().split('\n'):
                # it is possible that there are multiple 'impression' or impression-like sections
                # if there are multiple sections, keep all of them.
                if impression_1.match(line) is not None:
                    found_impression = True
                    found_match_per_record += 1
                    section += impression_1.sub('', line)
                    found_section = True
                    #print(row[col_names.index("ROW_ID")], line)
                    continue

                if impression_2.match(line) is not None:
                    found_conclusion = True
                    found_match_per_record += 1
                    section += impression_2.sub('', line)
                    found_section = True
                    #print(row[col_names.index("ROW_ID")], line)
                    continue

                if impression_3.match(line) is not None:
                    found_summary = True
                    found_match_per_record += 1
                    section += impression_3.sub('', line)
                    found_section = True
                    #print(row[col_names.index("ROW_ID")], line)
                    continue

                # should add to reason_long until end of section
                if found_section:
                    if '(Over)' in line:
                        skip_section = True
                        found_section = False
                    elif section_re.match(line) is None:
                        #print(row[col_names.index("ROW_ID")], 'continuing section: ', line)
                        section += line
                    else:
                        #print(row[col_names.index("ROW_ID")], 'ending section: ', line)
                        found_section = False

                # page breaks are normally wrapped in (Over)...(Cont)
                if skip_section:
                    if '(Cont)' in line:
                        skip_section = False
                        found_section = True

            document['IMPRESSION_t'] = section
            print(row[col_names.index("ROW_ID")], 'IMPRESSION')

            # this section for debugging purposes to look at documents with multiple impression-like
            # sections
            #if ((found_impression and (found_conclusion or found_summary)) or
            #(found_conclusion and (found_impression or found_summary)) or
            #(found_summary and (found_conclusion or found_impression))):
            #    print(row[col_names.index("ROW_ID")], 'has multiple matches.')
            #    print ('    section found:', section)


        ####
        #### need to identify and extract findings section...
        ####

        # Identify Findings section only if no impression like section found
        total_count += 1
        found_findings = False
        found_section = False
        skip_section = False
        section = ''
        if ('IMPRESSION_t' not in document and
            ('FINDING' in row[col_names.index("TEXT")] or
            'PFI' in row[col_names.index("TEXT")])
           ):
            for line in row[col_names.index("TEXT")].rstrip().split('\n'):
                if finding_1.match(line) is not None:
                    found_findings = True
                    section += finding_1.sub('', line)
                    found_section = True
                    #print(row[col_names.index("ROW_ID")], line)
                    continue
                elif finding_2.match(line) is not None:
                    found_findings = True
                    section += finding_2.sub('', line)
                    found_section = True
                    #print(row[col_names.index("ROW_ID")], line)
                    continue

                # should add to finding like section until end of section
                if found_section:
                    if '(Over)' in line:
                        skip_section = True
                        found_section = False
                    elif section_re.match(line) is None:
                        section += line
                    else:
                        found_section = False

                # page breaks are normally wrapped in (Over)...(Cont)
                if skip_section:
                    if '(Cont)' in line:
                        skip_section = False
                        found_section = True

            if found_findings == True:
                if ('IMPRESSION_t' in document):
                    print('THIS SHOULD NOT BE POSSIBLE')
                document['IMPRESSION_t'] = section
                print(row[col_names.index("ROW_ID")], 'FINDINGS')
                #print(row[col_names.index("ROW_ID")], 'section found:', section )
                finding_count += 1

        elif 'IMPRESSION_t' not in document:
            print(row[col_names.index("ROW_ID")], '------------no impression nor finding sections')
            _ = 1

        if (found_impression == False and
            found_conclusion == False and
            found_summary == False and
            found_findings == False
           ):
            missing_count += 1

    print('Found:', found_count)
    print('No impression but findings: ', finding_count)
    print('Missing:', missing_count)
    print('Total:', total_count)
        #print('Adding row_id', row[col_names.index("ROW_ID")], 'to Solr.')
        #solr.add([document])

print()

In [ ]:
#Now need to devide reports into sections...
# Or should this be done before importing into Solr???

# Right now I think that section segmentation should be done before solr insertion as the section
# can be inserted as another data field.
# So far, Reason and Impression sections are those that I think most relevant for looking for PE/PAH
# of course need to see how many reports actually have an impressions section.

'''
Now need to do some code cleanup.
* Move repeated code to functions
* Convert commented out print statements to logging statements that can be enabled when desired

Load data into Solr

Start figuring out Solr queries
'''

In [ ]:
#
# This code block checks for items missed in the impression section
#
import sqlite3
import os
import pysolr
import re

sqlitedb = os.path.join(os.path.expanduser('~'),'Box Sync', 'GradSchoolStuff', 'MastersProject', 'mimic3',
                        'mimic3.sqlite')
if not (os.path.exists(sqlitedb)):
    print("Specified database does not exist")
    sys.exit()

connection = sqlite3.connect(sqlitedb)

# Connect to Solr instance.
solr = pysolr.Solr('http://192.168.99.100:8983/solr/radiology')

with connection:
    cur = connection.cursor()
    recordset = cur.execute("""
        SELECT * FROM noteevents
        WHERE description LIKE '%CT%'
            AND description NOT LIKE '%ERECT%'
            AND description NOT LIKE '%PLCT%'
            AND category = 'Radiology'
        ORDER BY row_id""")
    col_names = [cn[0] for cn in cur.description]
    
    #regular expressions for impression section
    impression_1 = re.compile('.*(IMPRESSION|IMPRSSION)')
    impression_2 = re.compile('^\s*(OVERALL)?\s*CONCLUSION[S]?[:]?')
    impression_3 = re.compile('^\s*S(UMMARY|ummary)\s*')
    impression_4 = re.compile('^\s*:\s*$')
    
    missing_count = 0
    found_count = 0
    count = 0
    total_count = 0
    
    for row in recordset:
        total_count += 1
        found_impression = False
        found_conclusion = False
        found_summary = False
        for line in row[col_names.index("TEXT")].rstrip().split('\n'):
            if impression_1.match(line) is not None:
                found_impression = True
            if impression_2.match(line) is not None:
                found_conclusion = True
            if impression_3.match(line) is not None:
                found_summary = True
        if found_impression == False and found_conclusion == False and found_summary == False:
            print(row[col_names.index("ROW_ID")], ' has no matches ----------')
            missing_count += 1
        else:
            found_count +=1

    print('Found:', found_count)
    print('Missing:', missing_count)
    print('Total:', total_count)