In [ ]:
import rdflib
import sqlite3 as sq
import functools
import numpy as np
import ipyparallel as ipp
import os
import re
import pprint
from textblob import TextBlob
import functools
import itertools
import pickle
import collections
import gzip
In [ ]:
# Pipeline stage toggles: True = recompute the stage from its source,
# False = load the cached pickle from disk (see the if/else blocks below).
GRABRADLEX = False
GRABREPORTS = False
GRABMARKUPS = False
In [ ]:
# Locate the project data directories under the user's home directory.
# The bare os.path.exists(...) calls echo True/False in the notebook so a
# missing directory is noticed immediately.
RADDIR = os.path.join(os.path.expanduser('~'),'Bdrive','Radiology')
os.path.exists(RADDIR)
NLPDIR = os.path.join(os.path.expanduser('~'),'Dropbox','NLP')
# BUG FIX: the original re-checked RADDIR here; check the directory that
# was just defined.
os.path.exists(NLPDIR)
In [ ]:
# Connect to the running ipyparallel cluster and build a DirectView over
# all engines; rc.ids echoes the engine ids in the notebook.
rc = ipp.Client()
dview = rc[:]
rc.ids
In [ ]:
# Import the NLP packages on every ipyparallel engine so the mapped
# functions can use them remotely.
with dview.sync_imports():
    from textblob import TextBlob
    import numpy
    #import tagObjects
    import pyConTextNLP

# Load the local RadLex OWL file and list the distinct RDF types it uses.
# (The notebook export had fused the two statements below onto one line.)
# NOTE(review): Graph.load is deprecated in newer rdflib in favor of
# Graph.parse -- kept as-is to match the rdflib version in use.
g = rdflib.Graph()
g.load(os.path.join(RADDIR,"Radlex_3.12.owl"))
qres = g.query("""SELECT DISTINCT ?type WHERE {?s a ?type.}""")
In creating the regular expressions from the RadLex terms, I have learned from sad (and slow) experience that word boundaries are needed around the phrases.
In [ ]:
def create_regex(r1, r2):
    """Build a word-boundary-delimited regex for term ``r1``.

    When a truthy synonym ``r2`` is supplied, the pattern alternates
    between the two phrases; otherwise only ``r1`` is matched.
    """
    if not r2:
        return r"\b%s\b" % r1
    return r"\b%s\b|\b%s\b" % (r1, r2)
In [ ]:
if GRABRADLEX:
    # Pull the full RadLex ontology from BioPortal and query out the three
    # term metaclasses used downstream.  (The notebook export had stripped
    # the indentation from this cell; reconstructed here.)
    # NOTE(security): the BioPortal API key is hard-coded in this URL;
    # consider moving it to an environment variable.
    g = rdflib.Graph()
    result = g.parse("http://data.bioontology.org/ontologies/RADLEX/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb&download_format=rdf",
                     format="xml")
    query_results = {}
    types = ["anatomy_metaclass","pathophysiology_metaclass","imaging_observation_metaclass"]
    # SPARQL template: %s is filled with one metaclass name and matched
    # against the type URI with a regex FILTER.  Result columns are
    # (?s, ?preferred_name, ?synonym, ?type); ?synonym may be unbound.
    q1 = """
PREFIX Ontology1447432460: <http://www.owl-ontologies.com/Ontology1447432460.owl#>
SELECT DISTINCT ?s ?preferred_name ?synonym ?type
WHERE {
?s Ontology1447432460:Preferred_name ?preferred_name.
?s rdf:type ?type
OPTIONAL {?s Ontology1447432460:Synonym ?synonym.}
FILTER regex(str(?type), "%s")}"""
    #print(q1)
    for t in types:
        q = q1%t
        print(q)
        query_results[t] = g.query(q)
    for t,r in query_results.items():
        print(t,len(r))
        print("*"*42)
    # NOTE(review): re-runs only the LAST query and the result is unused;
    # looks vestigial -- kept for fidelity.
    rslts = g.query(q)
    # Build ConTextItems per metaclass:
    # [literal, category (URI fragment after '#'), regex, ""]
    itemData = {}
    for t in types:
        itemData[t] = [CI.create_ConTextItem([r[1],
                                              r[3].split("#")[1],
                                              create_regex(r[1],r[2]),
                                              ""]) \
                       for r in query_results[t]]
    with open(os.path.join(RADDIR,"NLP","DBs","radlex.pickle"),"wb") as f0:
        pickle.dump(itemData,f0)
else:
    # Cached path: load the previously pickled items.
    # NOTE(review): query_results is NOT defined on this branch, but later
    # cells reference it -- they only work after a GRABRADLEX=True run.
    with open(os.path.join(RADDIR,"NLP","DBs","radlex.pickle"),"rb") as f0:
        itemData = pickle.load(f0)
count = 0 for r in query_results["pathophysiology_metaclass"]: print(r[1],r[2]) count += 1 if count >=50: break
In [ ]:
# Show the literal and the generated regex for the first 50
# pathophysiology ConTextItems.
first_fifty = itemData['pathophysiology_metaclass'][:50]
for item in first_fifty:
    print("LITERAL: %s; RE: %s"%(item.literal, item.re))
tmp = [r for r in query_results["pathophysiology_metaclass"] \ if "short bowel" in r[1]] tmp[0][2].toPython()
In [ ]:
import pyConTextNLP.functional.itemData as ID
import pyConTextNLP.functional.conTextItem as CI
import pyConTextNLP.functional.ConTextMarkup as CM
In [ ]:
def create_sentence_conTextMarkup(s, targets ):
    """Build a ConTextMarkup for one sentence string.

    Sets the raw text, cleans it, marks the target items, prunes
    overlapping marks, and drops any 'Exclusion' marks.
    """
    steps = [
        lambda m: CM.setRawText(m, s),
        CM.cleanText,
        lambda m: CM.mark_items_in_text(m, targets, mode="target"),
        CM.pruneMarks,
        lambda m: CM.dropMarks(m, category='Exclusion'),
    ]
    markup = CM.create_ConTextMarkup()
    for step in steps:
        markup = step(markup)
    # apply modifiers to any targets within the modifiers scope
    return markup
In [ ]:
def create_sentence_conTextMarkup_parallel(sentences, targets, mode):
    """Run the per-sentence markup pipeline across the ipyparallel engines.

    sentences: list of sentence strings
    targets:   items to mark in each sentence
    mode:      mode passed to CM.mark_items_in_text (e.g. "target")
    Returns the list of ConTextMarkup objects, pruned and with
    'Exclusion' marks dropped.
    """
    mark_items_in_text = functools.partial(CM.mark_items_in_text,
                                           items=targets,
                                           mode=mode)
    dropMarks = functools.partial(CM.dropMarks, category='Exclusion')
    # One fresh markup per sentence; each map_async stage feeds the next.
    sm = [CM.create_ConTextMarkup() for i in range(len(sentences))]
    sm = dview.map_async(CM.setRawText, sm, sentences).get()
    sm = dview.map_async(CM.cleanText,sm).get()
    sm = dview.map_async(mark_items_in_text,sm).get()
    sm = dview.map_async(CM.pruneMarks,sm).get()
    sm = dview.map_async(dropMarks,sm).get()
    # BUG FIX: removed the unreachable trailing "return markup" that
    # followed this return ("markup" was never defined in this function).
    return sm
In [ ]:
# Flatten every category's items into one combined target list.
all_items = list(itertools.chain.from_iterable(itemData.values()))
In [ ]:
len(all_items)
In [ ]:
if GRABREPORTS:
    # Read report impressions from sqlite, lowercase them, and strip dates
    # and times so they do not pollute the term matching.  (The notebook
    # export had stripped the indentation from this cell; reconstructed.)
    conn = sq.connect(os.path.join(RADDIR,"NLP","DBs",
                                   "criticalFindingsAll.sqlite"))
    cursor = conn.cursor()
    r_date = re.compile(r"""([0-9]{1,2}(/[0-9]{1,2})?/[0-9]{2,4})""")
    r_time = re.compile(r"""([0-9]{1,2}:\d\d(pm|am)?)""")
    cursor.execute("""SELECT rowid,impression from reports""")
    data = [(d[0],d[1].lower()) for d in cursor.fetchall()]
    data =[(d[0],r_date.sub("",r_time.sub("",d[1]))) for d in data]
    reports = [d[1] for d in data]

# Defined at module level (not only under GRABREPORTS) because later cells
# (e.g. analyzeReport) need it regardless of the flag.
def report2sentences(report):
    """Split a report string into raw sentence strings via TextBlob."""
    return TextBlob(report).raw_sentences

if GRABREPORTS:
    # NOTE(review): 'sentences' only exists after a GRABREPORTS=True run;
    # later cells that use it depend on that.
    sentences = list(itertools.chain.from_iterable(dview.map_async(report2sentences,reports).get()))
In [ ]:
if GRABMARKUPS:
    # Mark every sentence against the combined RadLex targets on the
    # cluster, keep only sentences where something was detected, and cache
    # the result as a gzipped pickle.
    allMarkups = create_sentence_conTextMarkup_parallel(sentences,
                                                        all_items,
                                                        mode="target")
    detected_all = [p for p in allMarkups if p.nodes()]
    print(len(detected_all))
    # (Removed a redundant "import gzip" here; gzip is imported at the top
    # of the file.)
    with gzip.open(os.path.join(RADDIR,"NLP","DBs","radlex_found_sentences.pickle"),"wb") as f0:
        pickle.dump(detected_all,f0)
else:
    with gzip.open(os.path.join(RADDIR,"NLP","DBs","radlex_found_sentences.pickle"),"rb") as f0:
        detected_all = pickle.load(f0)

# Per-category passes.  (The notebook export had fused the statements
# below onto single lines; reconstructed here.)
# NOTE(review): 'sentences' only exists after a GRABREPORTS=True run.
pathoMarkups = create_sentence_conTextMarkup_parallel(sentences, itemData['pathophysiology_metaclass'], mode="target")
detected_patho = [p for p in pathoMarkups if p.nodes()]
print(len(detected_patho))

anatMarkups = create_sentence_conTextMarkup_parallel(sentences, itemData['anatomy_metaclass'], mode="target")
detected_anat = [p for p in anatMarkups if p.nodes()]
print(len(detected_anat))
In [ ]:
# Peek at the first 20 detected markups: their graph nodes and the raw
# sentence text they came from.
for idx in range(20):
    markup = detected_all[idx]
    print(markup.nodes())
    print(markup.graph["__text"])
In [ ]:
def analyzeReport(report, targets ):
    """
    Create a pyConText markup for each sentence of a radiology report.

    report:  a text string containing the radiology report
    targets: target items to mark in every sentence
    Returns a list of ConTextMarkup objects, one per sentence.
    """
    sentences = report2sentences(report)
    # BUG FIX: the original called
    #   map(create_sentence_conTextMarkup, sentences, report, targets, [])
    # which zips the four iterables together (stopping at the shortest and
    # passing one character/item of each per call) instead of applying the
    # same targets to every sentence.
    return [create_sentence_conTextMarkup(s, targets) for s in sentences]
In [ ]:
# Open the expanded RadLex term database for subsequent queries.
c2 = sq.connect(os.path.join(RADDIR,"radlex_terms_expanded.sqlite"))
cu2 = c2.cursor()