In [1]:
import os, sys
sys.path.append('..')  # make the fcFinder package (one directory up) importable
import fcFinder as fc
from fcFinder import markup_conditions, modifiers, targets
import helpers
import pyConTextNLP as pyConText
from pyConTextNLP.pyConTextGraph import ConTextMarkup
This notebook offers a walk-through of the fcFinder package. fcFinder provides tools built on the pyConTextNLP library and geared towards the discovery of fluid collections in radiology reports of surgical patients. fcFinder will eventually be incorporated into a larger tool called HAI-Detect, which extends this functionality to pneumonia, UTIs, and other forms of surgical site infection.
Before using fcFinder, it is suggested that the user become familiar with the pyConTextNLP library. Notebooks are available for download at https://github.com/chapmanbe/pyConTextNLP.
fcFinder wraps some of pyConText's basic functionality for an end user. It is specifically geared towards the identification of fluid collections, but can be extended to other uses. For an example of a use case, see the PDF contained in this folder.
In pyConText, targets and modifiers are explicitly defined in .tsv files. In this example, we will use the default targets and modifiers files:

- '../targets.tsv'
- '../modifiers.tsv'

These define the following classes:

Targets:
- FLUID_COLLECTION
- PSEUDOANCHOR - terms that could be caught as false positives and are explicitly differentiated from the targets

Modifiers:
- ANATOMY - anatomical locations that allow the user to identify targets that occur only in the same sentence as certain locations
- INDICATION, DEFINITE_NEGATED_EXISTENCE, PROBABLE_EXISTENCE, AMBIVALENT_EXISTENCE, etc. - lexical modifiers
- PSEUDOANATOMY, PSEUDONEG, PSEUDOINDICATION - pseudo-terms that prevent false matches on the corresponding categories
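For orientation, each row of these tab-separated files uses pyConText's itemData layout: a literal, a category (Type), an optional regular expression, and a rule (Direction). The rows below are illustrative stand-ins, not excerpts from the actual files:

Lex	Type	Regex	Direction
fluid collection	FLUID_COLLECTION
no evidence of	DEFINITE_NEGATED_EXISTENCE		forward
in the pelvis	ANATOMY		bidirectional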
In [2]:
file = 'sample_report.txt'  # sample radiology report from the MIMIC-III dataset
with open(file, 'r') as f:
    report = f.read()
In [3]:
print(report[:1000])
In [4]:
# Here is the pipeline provided by the fcFinder module.
# A custom function can be written by the user;
# this section walks through that process.
def fc_pipeline(report, preprocess=lambda x: x.lower(),
                splitter=helpers.my_sentence_splitter):
    report = preprocess(report)
    sentences = splitter(report)
    # Create markups and attach markup_conditions to each
    markups = fc.create_list_of_markups(sentences, spans=True)
    # Exclude any markups that weren't assigned a class
    markups = [m for m in markups if m.markupClass]
    return markups
In [5]:
# preprocess
# Any preprocessing applied to the report; the default lowercases everything.
preprocess = lambda x: x.lower()
print(preprocess(report)[:300])
In [6]:
# A function that will split sentences.
# The default is to split on '.'
# This example uses a custom-made function found in ../helpers.py
# It returns named tuples that contain the text and span of each sentence.
# You can also return a plain list of sentences without tracking spans in the document.
splitter = helpers.my_sentence_splitter
sentences = splitter(report)
for i, named_tup in enumerate(sentences[:5]):
    print(i)
    print(named_tup)
    print()
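If you want to write your own splitter with the same interface, the sketch below shows the general idea. It is a minimal illustration, not how helpers.my_sentence_splitter is actually implemented; all the pipeline needs is objects that expose .text and .span attributes.

from collections import namedtuple
import re

SentenceSpanPair = namedtuple('SentenceSpanPair', ['text', 'span'])

def simple_sentence_splitter(report):
    # Split on '.' while recording each sentence's (start, end) character span
    pairs = []
    for match in re.finditer(r'[^.]+\.?', report):
        text = match.group().strip()
        if not text:
            continue
        offset = len(match.group()) - len(match.group().lstrip())
        start = match.start() + offset
        pairs.append(SentenceSpanPair(text=text, span=(start, start + len(text))))
    return pairs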
In [7]:
# We'll create a markup using this SentenceSpanPair
s = sentences[8]
print(s)
In [8]:
span = s.span
markup = ConTextMarkup()
markup.setRawText(s.text)
if not span:  # fall back when the splitter does not provide a span
    span = (0, len(s.text))
markup.docSpan = span
#The following functionality is explained in pyConText's notebook
markup.cleanText()
markup.markItems(modifiers, mode="modifier")
markup.markItems(targets, mode="target")
markup.pruneMarks()
markup.dropMarks('Exclusion')
markup.applyModifiers()
markup.pruneSelfModifyingRelationships()
markup.dropInactiveModifiers()
In [9]:
# To keep track of relevant data found by pyConText, we'll use a class called markup_conditions
markup.conditions = markup_conditions(markup)  # add this as an attribute to the markup object
markup.target = markup.conditions.target
markup.modifiers = markup.conditions.modifiers
print(markup.conditions.indication)  # check whether it is modified by indication, negated existence, or anatomy
print(markup.conditions.negated)
print(markup.conditions.anatomy)
print(markup.conditions)  # print the object
In [10]:
# Now that we have specified what information is relevant, we can classify the markup using this function.
def markup_classifier(markup):
    conditions = markup.conditions
    markup_class = None
    if not conditions.target:
        pass
    # positive
    elif (conditions.anatomy and not conditions.negated and not conditions.indication) \
            or (conditions.anatomy and conditions.definitive):
        markup_class = "Fluid collection-positive"
    # negated
    elif conditions.negated and not conditions.definitive:
        markup_class = "Fluid collection-negated"
    # indication
    elif conditions.indication and not (conditions.negated or conditions.definitive
                                        or conditions.historical or conditions.probable):
        markup_class = "fluid collection-indication"
    # check for pseudoanatomy
    if conditions.pseudoanatomy and not conditions.anatomy:
        markup_class = None
    return markup_class
In [11]:
# The markup has a target with the category 'Fluid Collection' and is modified by 'anatomy'.
# According to our function, that means it should be classified as 'Fluid collection-positive'.
markup.markupClass = markup_classifier(markup)
print(markup.markupClass)
In [12]:
# Note that if the markup were not modified by anatomy, it would not be assigned a markupClass
markup2 = markup  # note: this binds a second name to the same object, not a copy
markup2.conditions = markup_conditions(markup2)
markup2.conditions.anatomy = False
markup2.markupClass = markup_classifier(markup2)
print(markup2.markupClass)
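Because markup2 is just another name for markup, the assignments above also overwrote the original object's conditions. If you want an independent copy to experiment on, a deep copy should work; this is plain Python rather than fcFinder functionality, and it assumes the ConTextMarkup object is deep-copyable:

import copy

markup3 = copy.deepcopy(markup)  # independent copy; mutating it leaves markup untouched
markup3.conditions.anatomy = False
markup3.markupClass = markup_classifier(markup3)
print(markup3.markupClass)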
In [13]:
# We've now classified our markup. The pipeline above applies this process
# to an entire report and returns a list of markups.
markups = fc.fc_pipeline(report, splitter=helpers.my_sentence_splitter, spans=True)
for m in markups[:5]:
    print(m.getRawText())
    print(m.markupClass)
    print()
In [14]:
def classify_document(markups):
    # A document is positive if any of its markups are positive
    if any(m.markupClass == 'Fluid collection-positive' for m in markups):
        doc_class = 'Fluid collection-present'
    else:
        doc_class = 'Fluid collection-not present'
    return doc_class

classify_document(markups)
Out[14]:
There are several options for output, which are defined in ./input_output.py. For the use case, we converted the markups into knowtator XML files that could be loaded into eHOST. These could then be used to adjudicate fcFinder's findings against a reference standard of manually annotated reports.
Other possibilities include .csv files, JSON files, or arrays with counts of the different classes of markup.
In [15]:
import input_output as io
import pandas as pd
annotations = [io.createAnnotation(m, file) for m in markups]
XMLstring = io.write_knowtator(annotations, file)
print(XMLstring[:1000])
In [16]:
with open('sample_reference_standard.knowtator.xml') as f:  # manually annotated results
    XMLreference = f.read()
print(XMLreference[:1000])
In [17]:
columns = ['file_name', 'original_text', 'reference_standard', 'fcFinder_results']
data = [os.path.splitext(file)[0], report, XMLreference, XMLstring]
df = pd.DataFrame([data], columns=columns)
df
Out[17]:
In [18]:
# Similar functionality for JSON and .csv will be added at a future date
In [19]:
# Array with markupClass counts
io.fc_vectorizer(annotations, ['Fluid collection-positive', 'fluid collection-indication', 'Fluid collection-negated'])
Out[19]:
In [20]:
import numpy as np

def markups_vectorizer(markups, classes=['Fluid collection-positive',
                                         'fluid collection-indication',
                                         'Fluid collection-negated']):
    # One count per class, in the order the classes are given
    arr = np.zeros(len(classes))
    for i, cls in enumerate(classes):
        for m in markups:
            if m.markupClass == cls:
                arr[i] += 1
    return arr

print('positive, indication, negated')
print(markups_vectorizer(markups))
In [21]:
# You can use pandas to export to .xlsx, .csv, SQLite, or .json
import pandas as pd

def markups_to_pandas(markups, file_name):
    columns = ['file_name', 'text', 'span', 'class']
    rows = [[file_name, m.getRawText(), m.docSpan, m.markupClass] for m in markups]
    return pd.DataFrame(rows, columns=columns)

df = markups_to_pandas(markups, file)
df
Out[21]:
In [22]:
#df.to_csv('example.csv')
#df.to_json('example.json')
#df.to_excel('example.xlsx')
The use case and the example above represent a very narrow application of pyConText. Future work should expand this functionality to other problems, such as pneumonia and UTIs. Below is an example of how the tools defined in fcFinder can be adapted to a broader domain of problems.
Here's what you'll need to make your own implementation:
- New targets and modifiers files
- Calls to the markup_conditions.add_target() method to register the new targets
- A new classifying function
In [125]:
import pyConTextNLP.itemData as itemData
import pyConTextNLP.pyConTextGraph as pyConText
new_modifiers = itemData.instantiateFromCSVtoitemData(os.path.join(os.path.abspath('.'), 'sample_modifiers.tsv'))
new_targets = itemData.instantiateFromCSVtoitemData(os.path.join(os.path.abspath('.'), 'sample_targets.tsv'))
In [126]:
print(new_modifiers)
print(new_targets)
In [158]:
new_report = """Pneumonia can be ruled out. I cannot see any fluid collection in the abdomen.
The patient is at risk of developing a urinary tract infection."""
In [161]:
def new_markup_classifier(m):
    if not m.conditions.target:
        return None
    markup_target = m.conditions.target.getCategory()[0]
    if m.conditions.negated:
        lex = 'negated existence'
    elif m.conditions.future:
        lex = 'future existence'
    else:
        lex = 'definite existence'
    return '{target}-{lex}'.format(target=markup_target, lex=lex)
def new_pipeline(report):
    report = report.lower()
    sentences = helpers.my_sentence_splitter(report)
    markups = fc.create_list_of_markups(sentences, modifiers=new_modifiers, targets=new_targets, spans=True)
    for m in markups:
        m.conditions = fc.markup_conditions(markup=m)
        for t in new_targets:  # add the new targets
            m.conditions.add_target(t.getCategory())
        m.target = m.conditions.target
        # add the new 'future existence' condition
        if m.isModifiedByCategory(m.conditions.target, 'future_existence'):
            m.conditions.future = True
        else:
            m.conditions.future = False
        m.markupClass = new_markup_classifier(m)
    return [m for m in markups if m.markupClass]
In [162]:
markups = new_pipeline(new_report)
for m in markups:
    print(m.getRawText())
    print(m.markupClass)
    print()
In [163]:
annotations = [io.createAnnotation(m, 'new_report') for m in markups]
XMLstring = io.write_knowtator(annotations, 'new_report')
print(XMLstring)