In [1]:
import os, sys
sys.path.append('..')
import fcFinder as fc
from fcFinder import markup_conditions, modifiers, targets
import helpers
import pyConTextNLP as pyConText
from pyConTextNLP.pyConTextGraph import ConTextMarkup

About this notebook

This notebook offers a walk-through of the fcFinder package. fcFinder offers tools that utilize the pyConText library and are geared towards the discovery of fluid collections in radiology reports of surgical patients. fcFinder will eventually be enveloped in a larger tool called HAI-Detect, which expands this functionality to pneumonia, UTIs, and other forms of surgical site infections.

Before using fcFinder, it is suggested that the user understand the pyConTextNLP library. Notebooks are avilable for download at https://github.com/chapmanbe/pyConTextNLP.

What fcFinder offers

fcFinder is meant to offer tools that utilize some basic functionalities of pyConText for an end user. It is specifically geared towards the identification of fluid collections, but can be expanded to other uses. For an example of a use case, see the PDF contained in this folder.

Targets and Modifiers

In pyConText, targets and modifiers are explicilty defined using .tsv files. In this example, we will use these default targets and modifiers files:

'../targets.tsv'
'../modifiers.tsv'

, that define the following classes:

  • Targets:

    • FLUID_COLLECTION
    • PSEUDOANCHOR - terms that could be caught as false positives and are explicitly differentiated from the targets
  • Modifiers:

    • ANATOMY - anatomical locations that allow the user to identify targets that occur only in the same sentence as certain locations
    • INDICATION, DEFINITE_NEGATED_EXISTENCE, PROBABLE_EXISTENCE, AMBIVALENT_EXISTENCE, etc. - lexical modifiers
    • PSEUDOANATOMY, PSEUDONEG, PSEUDOINDICATION

Demo

Here is a demonstration of how fcFinder was used in the use case scenario.


In [2]:
file = 'sample_report.txt' #Sample Radiology report from MIMIC-III dataset
report = ''
with open(file,'r') as f:
    report += f.read()

In [3]:
print(report[:1000])


 
 CT ABD & PELVIS WITH CONTRAST 
 ~ Reason: Evaluate for abscess/fluid collection and drain position.
 ~ Admitting Diagnosis: CHOLECYSTITIS
  Contrast: OPTIRAY Amt: 130
 

 ~ UNDERLYING MEDICAL CONDITION:
  82M s/p open CCY and biloma drainage with  drains in place--now draining
  purulent fluid.
 ~ REASON FOR THIS EXAMINATION:
  Evaluate for abscess/fluid collection and drain position.
 No contraindications for IV contrast
 

 ~ WET READ: MLHh 
  Multiloculated communicating fluid collections in GB fossa (w/ Surgicel),
  perihepatic space, and R abd/pelvis.  Some of these have characteristics
  concerning for hematoma/superinfx.
  R rectus abdominis edema and small IM fluid collections, cannot r/o infx.
  Malpositioned  drains, which course through collections, but tips end
  outside at hepatic dome and inferior liver tip.
  Bilat small pleural effusions, R>L.
 WET READ VERSION #1 MLHh 
  Multiloculated fluid collections in GB fossa, perihepatic space, and R
  abd/pelvis, with charac

In [4]:
# Here is a pipeline provided in the fcFinder module
# A custom function can be made by the user
# This section will walk through this process
def fc_pipeline(report, preprocess=lambda x:x.lower(), 
                splitter=helpers.my_sentence_splitter):
    report = preprocess(report) 
    sentences = splitter(report) 
    markups = create_list_of_markups(sentences,spans=True) #Create markups and set markup_conditions
    markups = [m for m in markups if m.markupClass] #Exclude any markups that aren't assigned a class
    return markups

In [5]:
#preprocess 
# Any preprocessing done for report. By default is all lowercase.
preprocess = lambda x:x.lower()
print(preprocess(report)[:300])


 
 ct abd & pelvis with contrast 
 ~ reason: evaluate for abscess/fluid collection and drain position.
 ~ admitting diagnosis: cholecystitis
  contrast: optiray amt: 130
 

 ~ underlying medical condition:
  82m s/p open ccy and biloma drainage with  drains in place--now draining
  purulent fluid.
 

In [6]:
# A function that will split sentences.
# Default is to split on '.'
# This example uses a custom-made function found in ../helpers.py
# This function returns named tuples that contain the text and span of each sentence
# You can also just return a list of sentences without keeping track of the sentence spans in the document.

splitter = helpers.my_sentence_splitter
sentences = splitter(report)
for named_tup in sentences[:5]:
    print(sentences.index(named_tup))
    print(named_tup)
    print()


0
SentenceSpanPair(text=' \n CT ABD & PELVIS WITH CONTRAST \n ~', span=(0, 36))

1
SentenceSpanPair(text='Reason: Evaluate for abscess/fluid collection and drain position.', span=(37, 102))

2
SentenceSpanPair(text=' ~', span=(103, 105))

3
SentenceSpanPair(text='Admitting Diagnosis: CHOLECYSTITIS\n  Contrast: OPTIRAY Amt: 130\n \n\n ~', span=(106, 175))

4
SentenceSpanPair(text='UNDERLYING MEDICAL CONDITION:\n  82M s/p open CCY and biloma drainage with  drains in place--now draining\n  purulent fluid.', span=(176, 298))


In [7]:
# We'll create a markup using this SentenceSpanPair

s = sentences[8]
print(s)


SentenceSpanPair(text='WET READ: MLHh \n  Multiloculated communicating fluid collections in GB fossa (w/ Surgicel),\n  perihepatic space, and R abd/pelvis.', span=(435, 565))

In [8]:
span = s.span
markup = ConTextMarkup()
markup.setRawText(s.text)
if not span: # optional for if you do not use a SentenceSpanPair
    span = (0,len(s))
markup.docSpan = span

#The following functionality is explained in pyConText's notebook
markup.cleanText() 
markup.markItems(modifiers, mode="modifier")
markup.markItems(targets, mode="target")
markup.pruneMarks()
markup.dropMarks('Exclusion')
markup.applyModifiers()
markup.pruneSelfModifyingRelationships()
markup.dropInactiveModifiers()

In [9]:
# To keep track of relevant data found by pyConText, we'll use a class called markup_conditions
markup.conditions = markup_conditions(markup) #add this as an attribute to the markup object
markup.target = markup.conditions.target
markup.target = markup.conditions.target
markup.modifiers = markup.conditions.modifiers #markup_classifier(markup)

print(markup.conditions.indication) # Check if it is modified by indication, negated_existence or anatomy
print(markup.conditions.negated)
print(markup.conditions.anatomy)
print(markup.conditions) # Print the object


False
False
True
['fluid_collection'] modified by: ['anatomy']

In [10]:
#Now that we have specified what information is relevant, we can classify the markup using this function.
def markup_classifier(markup):
    conditions = markup.conditions
    markup_class = None

    if not conditions.target:
        pass
    #positive
    elif (conditions.anatomy and not conditions.negated and not conditions.indication)\
        or (conditions.anatomy and conditions.definitive):
        markup_class = "Fluid collection-positive"
        
    #negated
    elif conditions.negated and not conditions.definitive:
        markup_class = "Fluid collection-negated"
    
    #indication
    elif conditions.indication and not (conditions.negated or conditions.definitive
                                or conditions.historical or conditions.probable):
        markup_class = "fluid collection-indication"
        
    #check for pseudoanatomy
    if conditions.pseudoanatomy and not conditions.anatomy:
        markup_class = None
    return markup_class

In [11]:
#The markup has the target with a category 'Fluid Collection' and is modified by 'anatomy'. 
#According to our function, that means it should be classified as 'Fluid collection-positive'
markup.markupClass = markup_classifier(markup)
print(markup.markupClass)


Fluid collection-positive

In [12]:
#Note that if it were not modified by anatomy, it would not be assigned a markupClass
markup2 = markup
markup2.conditions = markup_conditions(markup2)
markup2.conditions.anatomy = False
markup2.markupClass = markup_classifier(markup2)
print(markup2.markupClass)


None

In [13]:
#We've now classified our markup. The pipeline above applies this process to an entire report and returns a list of markups.
markups = fc.fc_pipeline(report,splitter=helpers.my_sentence_splitter,spans=True)
for m in markups[:5]:
    print(m.getRawText())
    print(m.markupClass)
    print()


reason: evaluate for abscess/fluid collection and drain position.
fluid collection-indication

underlying medical condition:
  82m s/p open ccy and biloma drainage with  drains in place--now draining
  purulent fluid.
fluid collection-indication

reason for this examination:
  evaluate for abscess/fluid collection and drain position.
fluid collection-indication

wet read: mlhh 
  multiloculated communicating fluid collections in gb fossa (w/ surgicel),
  perihepatic space, and r abd/pelvis.
Fluid collection-positive

  r rectus abdominis edema and small im fluid collections, cannot r/o infx.
Fluid collection-positive


In [ ]:

Document Classification

Once you've classified each sentence in the document, you can classify the document as a whole.

We'll categorize a document as 'present' if at least 1 markup is classified as positive


In [14]:
def classify_document(markups):
    if any([m.markupClass for m in markups if m.markupClass == 'Fluid collection-positive']):
        doc_class = 'Fluid collection-present'
    doc_class = 'Fluid collection-not present'
    return doc_class

classify_document(markups)


Out[14]:
'Fluid collection-not present'

Output

There are several options for output, which are defined in ./input.output.py . For the use case, we converted the markups into knowtator XML files that could be mapped into eHOST. This could then be used to adjudicate fcFinder's findings with a reference standard of manually annotated reports.

Other possibilities include .csv files, JSON files, or arrays with counts of the different classes of markup.


In [15]:
import input_output as io
import pandas as pd

annotations = [io.createAnnotation(m, file) for m in markups]
XMLstring = io.write_knowtator(annotations, file)
print(XMLstring[:1000])


<?xml version="1.0" ?>
<annotations textSource="sample_report.txt">
  <annotation>
    <mention id="258750673486544177623941054921840308712"/>
    <annotator id="eHOST_2010">FC_FINDER</annotator>
    <span end="102" start="37"/>
    <spannedText>reason: evaluate for abscess/fluid collection and drain position.</spannedText>
    <creationDate>Wed May 10 17:02:40 2017</creationDate>
  </annotation>
  <classMention id="258750673486544177623941054921840308712">
    <mentionClass id="fluid collection-indication">reason: evaluate for abscess/fluid collection and drain position.</mentionClass>
  </classMention>
  <annotation>
    <mention id="258768433271453375116854817241587823080"/>
    <annotator id="eHOST_2010">FC_FINDER</annotator>
    <span end="298" start="176"/>
    <spannedText>underlying medical condition: 82m s/p open ccy and biloma drainage with drains in place--now draining purulent fluid.</spannedText>
    <creationDate>Wed May 10 17:02:40 2017</creationDate>
  </annotation>
  <

In [16]:
with open('sample_reference_standard.knowtator.xml') as f: #manual-annotated results
    XMLreference = f.read()
print(XMLreference[:1000])


<?xml version="1.0" encoding="UTF-8"?>
<annotations textSource="Yes_74976_148937_02-28-66.txt">
    <annotation>
        <mention id="EHOST_Instance_67907" />
        <annotator id="eHOST_2010">REFERENCE</annotator>
        <span start="45" end="102" />
        <spannedText>Evaluate for abscess/fluid collection and drain position.</spannedText>
        <creationDate>Tue Feb 28 10:54:30 EST 2017</creationDate>
    </annotation>
    <classMention id="EHOST_Instance_67907">
        <mentionClass id="fluid collection-indication">Evaluate for abscess/fluid collection and drain position.</mentionClass>
    </classMention>
    <annotation>
        <mention id="EHOST_Instance_67908" />
        <annotator id="eHOST_2010">REFERENCE</annotator>
        <span start="333" end="390" />
        <spannedText>Evaluate for abscess/fluid collection and drain position.</spannedText>
        <creationDate>Tue Feb 28 10:54:37 EST 2017</creationDate>
    </annotation>
    <classMention id="EHOST_Instance_679

In [17]:
columns=['file_name','original_text','reference_standard','fcFinder_results']
data = [os.path.splitext(file)[0],report,XMLreference,XMLstring]
df = pd.DataFrame(columns=columns)
df = df.append(pd.DataFrame([data],columns=columns),ignore_index=True)
df


Out[17]:
file_name original_text reference_standard fcFinder_results
0 sample_report \n CT ABD & PELVIS WITH CONTRAST \n ~ Reason:... <?xml version="1.0" encoding="UTF-8"?>\n<annot... <?xml version="1.0" ?>\n<annotations textSourc...

In [18]:
#Similar functionality with JSON and .csv will be added at a future date

In [19]:
#Array with markupClass counts
io.fc_vectorizer(annotations,["Fluid collection-positive",'fluid collection-indication','Fluid collection-negated'])


Out[19]:
array([ 19.,   4.,   0.])

In [20]:
import numpy as np
def markups_vectorizer(markups,classes=["Fluid collection-positive",
                                        'fluid collection-indication','Fluid collection-negated']):
    arr = np.ravel(np.zeros((len(classes),1)))
    for i in range(len(classes)):
        for a in markups:
            if a.markupClass == classes[i]:
                arr[i] += 1
    return arr
print('positive, indication, negated')
print(markups_vectorizer(markups))


positive, indication, negated
[ 19.   4.   0.]

In [21]:
#You can use pandas to export to .xlsx, .csv, sqlite or .json
import pandas as pd
def markups_to_pandas(markups,file_name):
    columns=['file_name','text','span','class']
    df = pd.DataFrame(columns=columns)
    for m in markups:
        data = [file, m.getRawText(), m.docSpan, m.markupClass]
        row = pd.Series(data,columns)
        df = df.append(pd.DataFrame([row],columns=columns),ignore_index=True)
    return df

df = markups_to_pandas(markups,file)
df


Out[21]:
file_name text span class
0 sample_report.txt reason: evaluate for abscess/fluid collection ... (37, 102) fluid collection-indication
1 sample_report.txt underlying medical condition:\n 82m s/p open ... (176, 298) fluid collection-indication
2 sample_report.txt reason for this examination:\n evaluate for a... (302, 390) fluid collection-indication
3 sample_report.txt wet read: mlhh \n multiloculated communicatin... (435, 565) Fluid collection-positive
4 sample_report.txt r rectus abdominis edema and small im fluid ... (639, 714) Fluid collection-positive
5 sample_report.txt malpositioned drains, which course through ... (715, 836) Fluid collection-positive
6 sample_report.txt wet read version #1 mlhh \n multiloculated f... (875, 1035) Fluid collection-positive
7 sample_report.txt r rectus abdominis edema and small im fluid ... (1036, 1111) Fluid collection-positive
8 sample_report.txt indication: 82-year-old male with acute on ch... (1230, 1372) fluid collection-indication
9 sample_report.txt in the gallbladder fossa, there is a relative... (2256, 2514) Fluid collection-positive
10 sample_report.txt this appears to demonstrate faint\n periphera... (2515, 2670) Fluid collection-positive
11 sample_report.txt the subcapsular fluid\n collection surroundin... (2671, 2831) Fluid collection-positive
12 sample_report.txt this collection and a contiguous posterior co... (3009, 3185) Fluid collection-positive
13 sample_report.txt more inferiorly, a loculated 5 cm tv x -8 cm ... (3296, 3468) Fluid collection-positive
14 sample_report.txt a focal intramuscular fluid collection in the... (3709, 3830) Fluid collection-positive
15 sample_report.txt the lateral\n drain courses along the right l... (3996, 4240) Fluid collection-positive
16 sample_report.txt the medial drain courses anteriorly and super... (4241, 4443) Fluid collection-positive
17 sample_report.txt the liver parenchyma appears slightly heterog... (4444, 4554) Fluid collection-positive
18 sample_report.txt \n pelvis: the appendix is completely filled... (5274, 5460) Fluid collection-positive
19 sample_report.txt mild reactive thickening is also noted at the... (5461, 5592) Fluid collection-positive
20 sample_report.txt impression:\n - interval open cholecystectomy... (6535, 6764) Fluid collection-positive
21 sample_report.txt \n drains are located peripherally within som... (6765, 6941) Fluid collection-positive
22 sample_report.txt - small right rectus abdominis collection alo... (6942, 7061) Fluid collection-positive

In [22]:
#df.to_csv('example.csv')
#df.to_json('example.json')
#df.to_excel('example.xlsx')

In [ ]:

Example of how to expand

The use case and the above example is a very narrow application of pyConText. Future work should expand this functionality to other problems, such as pneumonia and UTIs. Here is an example of how the tools defined in fcFinder can be adapted to a broader domain of problems.

Here's what you'll need to do to make your own implementation:

- New targets and modifiers
- To use the markup_conditions.add_target() method
- A new classifying function

In [125]:
import pyConTextNLP.itemData as itemData
import pyConTextNLP.pyConTextGraph as pyConText
new_modifiers = itemData.instantiateFromCSVtoitemData(os.path.join(os.path.abspath('.'),'sample_modifiers.tsv'))
new_targets = itemData.instantiateFromCSVtoitemData(os.path.join(os.path.abspath('.'),'sample_targets.tsv'))

In [126]:
print(new_modifiers)
print(new_targets)


itemData: 3 items [can be ruled out, cannot see, at risk, ]
itemData: 3 items [pneumonia, fluid collection, urinary tract infection, ]

In [158]:
new_report = """Pneumonia can be ruled out. I cannot see any fluid collection in the abdomen.
The patient is at risk of developing a urinary tract infection."""

In [161]:
def new_markup_classifier(m):
    markup_class = None
    
    if not m.conditions.target:
        return None
    
    markup_target = m.conditions.target.getCategory()[0]
    if m.conditions.negated:
        lex = 'negated existence'
    elif m.conditions.future:
        lex = 'future existence'
    else:
        lex = 'definite existence'
    markup_class = '{target}-{lex}'.format(target=markup_target, lex=lex)
    return markup_class

def new_pipeline(report):
    report = report.lower()
    spans = False
    sentences = helpers.my_sentence_splitter(report)
    markups = fc.create_list_of_markups(sentences,modifiers=new_modifiers,targets=new_targets,spans=True)
    for m in markups:
        m.conditions = fc.markup_conditions(markup=m)
        for t in new_targets: #add new targets
            m.conditions.add_target(t.getCategory())
            m.target = m.conditions.target
        if m.isModifiedByCategory(m.conditions.target,'future_existence'): #Add new classes
            m.conditions.future = True
        else:
            m.conditions.future = False
        m.markupClass = new_markup_classifier(m)
    return [m for m in markups if m.markupClass]
markups = new_pipeline(new_report)

In [162]:
markups = new_pipeline(new_report)
for m in markups:
    print(m.getRawText())
    print(m.markupClass)
    print()


pneumonia can be ruled out.
pneumonia-negated existence

i cannot see any fluid collection in the abdomen.
fluid_collection-negated existence

the patient is at risk of developing a urinary tract infection.
uti-future existence


In [163]:
annotations = [io.createAnnotation(m, 'new_report') for m in markups]
XMLstring = io.write_knowtator(annotations, 'new_report')
print(XMLstring)


<?xml version="1.0" ?>
<annotations textSource="new_report">
  <annotation>
    <mention id="41417455595668713246398992713458954728"/>
    <annotator id="eHOST_2010">FC_FINDER</annotator>
    <span end="27" start="0"/>
    <spannedText>pneumonia can be ruled out.</spannedText>
    <creationDate>Wed May 10 18:02:38 2017</creationDate>
  </annotation>
  <classMention id="41417455595668713246398992713458954728">
    <mentionClass id="pneumonia-negated existence">pneumonia can be ruled out.</mentionClass>
  </classMention>
  <annotation>
    <mention id="41418223316563476470976038494056002024"/>
    <annotator id="eHOST_2010">FC_FINDER</annotator>
    <span end="77" start="28"/>
    <spannedText>i cannot see any fluid collection in the abdomen.</spannedText>
    <creationDate>Wed May 10 18:02:38 2017</creationDate>
  </annotation>
  <classMention id="41418223316563476470976038494056002024">
    <mentionClass id="fluid_collection-negated existence">i cannot see any fluid collection in the abdomen.</mentionClass>
  </classMention>
  <annotation>
    <mention id="41418789797925453458001412505563869672"/>
    <annotator id="eHOST_2010">FC_FINDER</annotator>
    <span end="141" start="78"/>
    <spannedText>the patient is at risk of developing a urinary tract infection.</spannedText>
    <creationDate>Wed May 10 18:02:38 2017</creationDate>
  </annotation>
  <classMention id="41418789797925453458001412505563869672">
    <mentionClass id="uti-future existence">the patient is at risk of developing a urinary tract infection.</mentionClass>
  </classMention>
  <eHOST_Adjudication_Status version="1.0">
    <Adjudication_Selected_Annotators version="1.0"/>
    <Adjudication_Selected_Classes version="1.0"/>
    <Adjudication_Others>
      <CHECK_OVERLAPPED_SPANS>false</CHECK_OVERLAPPED_SPANS>
      <CHECK_ATTRIBUTES>false</CHECK_ATTRIBUTES>
      <CHECK_RELATIONSHIP>false</CHECK_RELATIONSHIP>
      <CHECK_CLASS>false</CHECK_CLASS>
      <CHECK_COMMENT>false</CHECK_COMMENT>
    </Adjudication_Others>
  </eHOST_Adjudication_Status>
</annotations>


In [ ]: