In this notebook we demonstrate a basic document-level classification of reports with respect to a single finding (pulmonary embolism). We leverage the convenience of Pandas to read our data from a MySQL database and then use Pandas to add our classification as a new column in the dataframe.
Many of the common pyConTextNLP tasks have been wrapped into functions contained in the radnlp
package. We import multiple modules that will allow us to write concise code.
In [1]:
import pyConTextNLP.pyConTextGraph as pyConText
import pyConTextNLP.itemData as itemData
import pymysql
import numpy as np
import os
import radnlp.io as rio
import radnlp.view as rview
import radnlp.rules as rules
import radnlp.schema as schema
import radnlp.utils as utils
import radnlp.split as split
import radnlp.classifier as classifier
import pandas as pd
from IPython.display import clear_output, display, HTML, Image
# The widget helpers live in ipywidgets; `IPython.html.widgets` is the
# pre-IPython-4.0 location, deprecated since then and missing from current
# releases, so importing from it breaks a fresh-kernel run.
from ipywidgets import interact, interactive, fixed
from IPython.display import clear_output
import ipywidgets as widgets
from radnlp.data import classrslts
import networkx as nx
# Open the MySQL connection that pd.read_sql uses further down.
# NOTE(review): the next cell also opens a connection and rebinds `conn`;
# one of the two calls is redundant.
conn = pymysql.connect(host="mysql",
port=3306,user="jovyan",
passwd='jovyan',db='mimic2')
In [2]:
# Connect to the `mimic2` database on the `mysql` host; `conn` is the handle
# handed to pd.read_sql when the reports are loaded.
conn = pymysql.connect(
    host="mysql",
    port=3306,
    user="jovyan",
    passwd='jovyan',
    db='mimic2',
)
In [63]:
# Colour name per category; handed to rview.markup_to_html as `color_map`
# when the marked-up reports are rendered.
colors = {
    "pulmonary_embolism": "blue",
    "pneumonia": "blue",
    "pneumothorax": "blue",
    "diverticulitis": "blue",
    "definite_negated_existence": "red",
    "probable_negated_existence": "indianred",
    "ambivalent_existence": "orange",
    "probable_existence": "forestgreen",
    "definite_existence": "green",
    "historical": "goldenrod",
    "indication": "Pink",
    # "golden" is not a recognised colour name (the valid names are "gold"
    # and "goldenrod"), so it would be ignored when rendering.
    "acute": "gold",
}
In [4]:
# Show which radnlp release is installed, so the outputs below can be tied to
# a specific version of the package.
import radnlp
radnlp.__version__
Out[4]:
In [51]:
def getOptions():
    """
    Build the settings dictionary that names the knowledge-base resources.

    Returns a new dict on every call with four entries:
    'lexical_kb' and 'domain_kb' are lists of URLs, while "schema" and
    "rules" are single URLs. All of them point into the pyConTextNLP
    repository's KB directory.
    """
    return {
        'lexical_kb': ["https://raw.githubusercontent.com/chapmanbe/pyConTextNLP/master/KB/lexical_kb_05042016.tsv"],
        'domain_kb': ["https://raw.githubusercontent.com/chapmanbe/pyConTextNLP/master/KB/critical_findings.tsv"],
        "schema": "https://raw.githubusercontent.com/chapmanbe/pyConTextNLP/master/KB/schema2.csv",
        "rules": "https://raw.githubusercontent.com/chapmanbe/pyConTextNLP/master/KB/classificationRules3.csv",
    }
In [52]:
def get_kb_rules_schema(options):
    """
    Load everything the classifier needs from the locations in ``options``.

    options: dict with the keys "rules", "schema", 'lexical_kb' and
        'domain_kb' (the latter two are iterables of sources)

    Returns a dict with the keys "rules", "schema", "modifiers" and
    "targets". The modifiers are accumulated from every 'lexical_kb' source
    and the targets from every 'domain_kb' source.
    """
    loaded = {
        "rules": rules.read_rules(options["rules"]),
        "schema": schema.read_schema(options["schema"]),
        "modifiers": itemData.itemData(),
        "targets": itemData.itemData(),
    }
    # Each result entry is filled from its own list of knowledge-base sources.
    for result_key, option_key in (("modifiers", 'lexical_kb'),
                                   ("targets", 'domain_kb')):
        for source in options[option_key]:
            loaded[result_key].extend(
                itemData.instantiateFromCSVtoitemData(source))
    return loaded
In [53]:
def analyze_report(report, modifiers, targets, rules, schema):
    """
    Mark up one radiology report and classify the targets found in it.

    report: a text string containing the report (or report section)
    modifiers: modifier items passed to utils.mark_report
    targets: target items passed to utils.mark_report
    rules: indexable collection; its first three entries are forwarded,
        in order, to classifier.classify_document_targets
    schema: schema forwarded to classifier.classify_document_targets

    Returns a classrslts record holding the context markup, the fixed exam
    type "ctpa", the original report text and the classification result.
    """
    sentences = split.get_sentences(report)
    context_markup = utils.mark_report(sentences, modifiers, targets)
    target_classes = classifier.classify_document_targets(
        context_markup, rules[0], rules[1], rules[2], schema
    )
    return classrslts(
        context_document=context_markup,
        exam_type="ctpa",
        report_text=report,
        classification_result=target_classes,
    )
In [55]:
# Resolve the knowledge-base locations, then load the rules, schema,
# modifiers and targets once so every report can reuse them.
options = getOptions()
kb = get_kb_rules_schema(options)
In [56]:
# Radiology reports for every subject that has an ICD-9 code starting with
# 415.1. The join is on subject_id only, so identical rows are dropped after
# loading.
data = pd.read_sql(
    """SELECT noteevents.subject_id,
noteevents.text,
icd9.code
FROM noteevents INNER JOIN icd9 ON
noteevents.subject_id = icd9.subject_id
WHERE ( icd9.code LIKE '415.1%'
)
AND noteevents.category = 'RADIOLOGY_REPORT'""",
    conn,
).drop_duplicates()
data.head(10)
Out[56]:
In [57]:
# Section headings that introduce the summary part of a report, in the order
# find_impression tries them.
doc_split = [
    "IMPRESSION:",
    "INTERPRETATION:",
    "CONCLUSION:",
]
def find_impression(text, split):
    """
    Return the report text that follows the first matching section heading.

    text: the full report text; None or NaN (e.g. a NULL database value) is
        treated as a report without any heading
    split: section headings to look for, tried in order; the first heading
        that occurs in ``text`` is used. (Inside this function the name
        shadows the ``split`` module alias imported at the top.)

    Returns the text after the first occurrence of the chosen heading, up to
    the next occurrence of that same heading or the end of the text. Returns
    ``np.nan`` when no heading is found, so the row can be removed with
    ``dropna``.
    """
    # A missing report used to fail with a TypeError on the `in` test below.
    if text is None or (isinstance(text, float) and np.isnan(text)):
        return np.nan
    for term in split:
        if term in text:
            return text.split(term)[1]
    # `np.NaN` was removed in NumPy 2.0; `np.nan` works on every version.
    return np.nan
# Extract the impression section of every report (NaN when none is found).
data["impression"] = data.apply(
    lambda record: find_impression(record["text"], doc_split),
    axis=1,
)
# Drop every row that has a missing value in any column, which includes the
# reports without an impression section.
data = data.dropna(axis=0)
# Flag reports whose text mentions both "chest" and "ct", ignoring case.
data["chest"] = data.apply(
    lambda record: all(word in record["text"].lower()
                       for word in ('chest', 'ct')),
    axis=1,
)
# Keep only the flagged rows and renumber them from zero.
data = data.loc[data["chest"] == True].reset_index()
print(data.shape)
data.head()
Out[57]:
We now need to apply our schema to the reports. Since our data is in a Pandas data frame, the easiest way to process our reports is with the DataFrame `apply`
method.
We use `lambda`
to create an anonymous function which basically just applies `analyze_report`
to the "impression"
column with the modifiers, targets, etc. that we have read in separately. `analyze_report`
returns a `classrslts` record whose classification result is a dictionary with keys
as any identified targets defined in the "targets"
file and values as a tuple.
In [58]:
# Run the markup and classification on each impression; the classrslts
# record returned for every row is stored in the "pe rslt" column.
data["pe rslt"] = data.apply(
    lambda record: analyze_report(
        record["impression"],
        kb["modifiers"],
        kb["targets"],
        kb["rules"],
        kb["schema"],
    ),
    axis=1,
)
data.head()
Out[58]:
In [59]:
def view_markup(reports, colors):
    """
    Show a slider-driven viewer for the marked-up reports.

    reports: DataFrame whose "pe rslt" column holds the objects to display
    colors: mapping passed to rview.markup_to_html as ``color_map``

    Moving the slider displays the image "tmp.png" followed by the HTML
    rendering of the selected report. Nothing is returned.
    """
    # widgets.interact is the ipywidgets decorator; the slider runs over row
    # positions 0 .. len(reports) - 1.
    @widgets.interact(i=widgets.IntSlider(min=0, max=len(reports)-1))
    def _view_markup(i):
        # The slider value is a position, so select by position: a
        # label-based lookup fails when the index is not exactly 0..n-1
        # (for example after filtering rows).
        markup = reports["pe rslt"].iloc[i]
        # The return value is unused; presumably this call writes the
        # "tmp.png" shown on the next line - TODO confirm.
        rview.markup_to_pydot(markup)
        display(Image("tmp.png"))
        mt = rview.markup_to_html(markup, color_map=colors)
        display(HTML(mt))
In [64]:
# Launch the interactive viewer over the classified reports, using the
# colour map defined earlier in the notebook.
view_markup(data, colors)
In [ ]: