In [ ]:
import rdflib
import sqlite3 as sq
import functools
import numpy as np
import ipyparallel as ipp
import os
import re
import pprint
from textblob import TextBlob
import functools
import itertools
import pickle
import collections
import gzip
In [ ]:
# Pipeline stage toggles.  When a flag is True the corresponding expensive
# step is recomputed from source data; when False, previously pickled
# results are loaded instead (where a cached branch exists below).
GRABRADLEX = True    # re-parse the RadLex OWL ontology and rebuild itemData
GRABREPORTS = True   # re-read report impressions from the sqlite database
GRABMARKUPS = False  # re-run pyConTextNLP markup over all sentences
In [ ]:
# Directory holding the RadLex ontology files and derived databases.
RADDIR = os.path.join(os.path.expanduser('~'), 'Box Sync', 'GradSchoolStuff',
                      'MastersProject', 'radlex')
# The original evaluated os.path.exists(RADDIR) and discarded the result
# (only a notebook display); report it explicitly so a missing directory
# is caught here rather than as a confusing open() failure later.
print("RADDIR: ", RADDIR, "exists:", os.path.exists(RADDIR))
# this appears to not be used at all
#NLPDIR = os.path.join(os.path.expanduser('~'),'Dropbox','NLP')
#os.path.exists(NLPDIR)
#print("NLPDIR: ", NLPDIR)
In [ ]:
# Before this block can be run, execute "ipcluster start -n 4" via commandline.
# See more info at http://ipyparallel.readthedocs.org/en/stable/process.html
rc = ipp.Client()   # connect to the running ipyparallel cluster
dview = rc[:]       # direct view over all engines; used by map_async calls below
rc.ids              # notebook display of engine ids (sanity check)
In [ ]:
# Import the NLP dependencies on every parallel engine as well as locally.
with dview.sync_imports():
    from textblob import TextBlob
    import numpy
    #import tagObjects
    import pyConTextNLP

# BUG FIX: the original had `g=rdflib.Graph() g.load(...)` fused onto one
# line, which is a syntax error; split into two statements.
g = rdflib.Graph()
g.load(os.path.join(RADDIR, "Radlex_3.12.owl"))
# List the distinct RDF types present in the ontology (notebook sanity check).
qres = g.query("""SELECT DISTINCT ?type WHERE {?s a ?type.}""")
In creating the regular expressions from the RadLex terms, I have learned from sad (and slow) experience that word boundaries must be placed around the phrases; without them, short terms match inside unrelated words.
In [ ]:
def create_regex(r1,r2):
if r2:
return r"\b%s\b|\b%s\b"%(r1,r2)
else:
return r"\b%s\b"%r1
In [ ]:
print("Grabradlex: ", GRABRADLEX)
if GRABRADLEX:
    # Parse the RadLex OWL ontology from disk into an RDF graph.
    # NOTE(review): this cell loads version 3.13.1 while an earlier cell
    # loaded Radlex_3.12.owl, and the parse is repeated in the next cell —
    # confirm which version/graph is actually intended downstream.
    g = rdflib.Graph()
    # Alternative: download directly from BioPortal (kept for reference):
    #result = g.parse("http://data.bioontology.org/ontologies/RADLEX/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb&download_format=rdf",
    #                 format="xml")
    result = g.parse(os.path.join(RADDIR, 'Radlex3.13.1.owl'), format="xml")
    print("result: ", result)
In [ ]:
import pyConTextNLP.functional.itemData as ID
import pyConTextNLP.functional.conTextItem as CI
import pyConTextNLP.functional.ConTextMarkup as CM
In [ ]:
if GRABRADLEX:
    # Re-parse the RadLex ontology (NOTE(review): duplicates the parse in
    # the previous cell — presumably kept so this cell is self-contained).
    g = rdflib.Graph()
    result = g.parse(os.path.join(RADDIR, 'Radlex3.13.1.owl'), format="xml")

    # Run one SPARQL query per RadLex metaclass of interest; %s is
    # interpolated into the FILTER to restrict by type.
    query_results = {}
    types = ["anatomy_metaclass", "pathophysiology_metaclass",
             "imaging_observation_metaclass"]
    q1 = """
PREFIX Ontology1447432460: <http://www.owl-ontologies.com/Ontology1447432460.owl#>
SELECT DISTINCT ?s ?preferred_name ?synonym ?type
WHERE {
    ?s Ontology1447432460:Preferred_name ?preferred_name.
    ?s rdf:type ?type
    OPTIONAL {?s Ontology1447432460:Synonym ?synonym.}
    FILTER regex(str(?type), "%s")}"""
    for t in types:
        q = q1 % t
        print(q)
        query_results[t] = g.query(q)
    for t, r in query_results.items():
        print(t, len(r))
        print("*" * 42)
    # (Removed dead `rslts = g.query(q)` — its result was never used.)

    # Build pyConTextNLP ConTextItems.  Each result row r is
    # (subject, preferred_name, synonym, type); the type URI fragment after
    # '#' becomes the item category.
    itemData = {}
    for t in types:
        itemData[t] = [CI.create_ConTextItem([r[1],
                                              r[3].split("#")[1],
                                              create_regex(r[1], r[2]),
                                              ""])
                       for r in query_results[t]]
    with open(os.path.join(RADDIR, "DBs", "radlex.pickle"), "wb") as f0:
        pickle.dump(itemData, f0)
else:
    # SECURITY NOTE: pickle.load executes arbitrary code on load; only load
    # caches produced by this script itself.
    with open(os.path.join(RADDIR, "DBs", "radlex.pickle"), "rb") as f0:
        itemData = pickle.load(f0)

# Peek at the first 50 pathophysiology rows (preferred name, synonym).
# BUG FIX: the original fused five statements onto one line (syntax error)
# and ran unconditionally, crashing with a NameError when GRABRADLEX was
# False (query_results only exists on the True branch).
if GRABRADLEX:
    for r in itertools.islice(query_results["pathophysiology_metaclass"], 50):
        print(r[1], r[2])
In [ ]:
# Show the first 50 pathophysiology ConTextItems.
for i in itemData['pathophysiology_metaclass'][0:50]:
    print(i)
    #print("LITERAL: %s; RE: %s"%(i.literal, i.re))

# Find the raw query rows whose preferred name mentions "short bowel" and
# display the first match's synonym as a plain Python value.
# BUG FIX: the original fused the comprehension and the trailing expression
# onto one line (syntax error).
# NOTE(review): query_results only exists when GRABRADLEX is True — confirm
# this cell is only run in that mode.
tmp = [r for r in query_results["pathophysiology_metaclass"]
       if "short bowel" in r[1]]
tmp[0][2].toPython()
In [ ]:
def create_sentence_conTextMarkup(s, targets):
    """Mark the targets found in a single sentence.

    s       : raw sentence text.
    targets : ConTextItem targets to search for.
    Returns the resulting ConTextMarkup after pruning overlapping marks and
    dropping Exclusion-category marks.  (Modifier application is left to
    the caller.)
    """
    pipeline = [
        lambda m: CM.setRawText(m, s),
        CM.cleanText,
        lambda m: CM.mark_items_in_text(m, targets, mode="target"),
        CM.pruneMarks,
        lambda m: CM.dropMarks(m, category='Exclusion'),
    ]
    return functools.reduce(lambda m, step: step(m), pipeline,
                            CM.create_ConTextMarkup())
In [ ]:
def create_sentence_conTextMarkup_parallel(sentences, targets, mode):
    """Run the sentence-markup pipeline in parallel on the ipyparallel engines.

    Each pipeline stage is dispatched to the engines via the module-level
    ``dview`` and gathered (.get()) before the next stage starts.

    sentences : list of raw sentence strings.
    targets   : ConTextItem targets to mark.
    mode      : marking mode passed to CM.mark_items_in_text (e.g. "target").
    Returns the list of ConTextMarkup objects, one per sentence.
    """
    # Bind the per-call constants so the engines receive single-argument
    # callables suitable for map_async.
    mark_items_in_text = functools.partial(CM.mark_items_in_text,
                                           items=targets,
                                           mode=mode)
    dropMarks = functools.partial(CM.dropMarks, category='Exclusion')
    sm = [CM.create_ConTextMarkup() for _ in range(len(sentences))]
    sm = dview.map_async(CM.setRawText, sm, sentences).get()
    sm = dview.map_async(CM.cleanText, sm).get()
    sm = dview.map_async(mark_items_in_text, sm).get()
    sm = dview.map_async(CM.pruneMarks, sm).get()
    sm = dview.map_async(dropMarks, sm).get()
    return sm
    # BUG FIX: removed the unreachable `return markup` that followed
    # `return sm` — it was dead code referencing an undefined name.
In [ ]:
# Flatten the per-metaclass target lists into one combined list
# (insertion order of itemData is preserved, matching the original loop).
all_items = list(itertools.chain.from_iterable(itemData.values()))
In [ ]:
len(all_items)
In [ ]:
if GRABREPORTS:
    # Open the critical-findings report database.
    # NOTE(review): conn is never closed — acceptable in an interactive
    # notebook session, but worth a `with`/close() in a script.
    conn = sq.connect(os.path.join(RADDIR,"DBs",
                                   "criticalFindingsAll.sqlite"))
    cursor = conn.cursor()
    # Scrub dates (e.g. 12/31/2015) and times (e.g. 3:45pm) from the text;
    # their punctuation otherwise confuses sentence splitting / matching.
    r_date = re.compile(r"""([0-9]{1,2}(/[0-9]{1,2})?/[0-9]{2,4})""")
    r_time = re.compile(r"""([0-9]{1,2}:\d\d(pm|am)?)""")
    cursor.execute("""SELECT rowid,impression from reports""")
    # Lower-case each impression, then strip times first and dates second.
    data = [(d[0],d[1].lower()) for d in cursor.fetchall()]
    data =[(d[0],r_date.sub("",r_time.sub("",d[1]))) for d in data]
    reports = [d[1] for d in data]
    def report2sentences(report):
        """Split one report into sentences using TextBlob."""
        return TextBlob(report).raw_sentences
    # Sentence-split every report in parallel on the engines, then flatten.
    # NOTE(review): `sentences` is undefined when GRABREPORTS is False but
    # is used unconditionally in later cells — confirm intended.
    sentences = list(itertools.chain.from_iterable(dview.map_async(report2sentences,reports).get()))
In [ ]:
if GRABMARKUPS:
    # Mark every sentence against the full RadLex target set.
    allMarkups = create_sentence_conTextMarkup_parallel(sentences,
                                                        all_items,
                                                        mode="target")
    # Keep only sentences in which at least one target was found.
    detected_all = [p for p in allMarkups if p.nodes()]
    print(len(detected_all))
    # (Removed redundant in-cell `import gzip` — gzip is already imported
    # at the top of the file.)
    with gzip.open(os.path.join(RADDIR, "NLP", "DBs",
                                "radlex_found_sentences.pickle"), "wb") as f0:
        pickle.dump(detected_all, f0)
else:
    # SECURITY NOTE: pickle.load executes arbitrary code on load; only load
    # caches produced by this script itself.
    with gzip.open(os.path.join(RADDIR, "NLP", "DBs",
                                "radlex_found_sentences.pickle"), "rb") as f0:
        detected_all = pickle.load(f0)

# Per-metaclass markups for pathophysiology and anatomy targets only.
# BUG FIX: the original fused several statements per line below
# (syntax errors); reconstructed as separate statements.
pathoMarkups = create_sentence_conTextMarkup_parallel(
    sentences, itemData['pathophysiology_metaclass'], mode="target")
detected_patho = [p for p in pathoMarkups if p.nodes()]
print(len(detected_patho))

anatMarkups = create_sentence_conTextMarkup_parallel(
    sentences, itemData['anatomy_metaclass'], mode="target")
detected_anat = [p for p in anatMarkups if p.nodes()]
print(len(detected_anat))
In [ ]:
# Preview the first 20 detections: matched nodes plus the cleaned text.
# BUG FIX: the original indexed detected_all[i] for i in range(20), which
# raises IndexError when fewer than 20 sentences had matches; a slice
# handles the short case gracefully.
for markup in detected_all[:20]:
    print(markup.nodes())
    print(markup.graph["__text"])
In [ ]:
def analyzeReport(report, targets):
    """Create pyConText markups for a single radiology report.

    report  : text string containing one radiology report.
    targets : ConTextItem targets to mark in each sentence.
    Returns a list of ConTextMarkup objects, one per sentence.
    """
    sentences = report2sentences(report)
    # BUG FIX: the original called
    #   map(create_sentence_conTextMarkup, sentences, report, targets, [])
    # which treats `report`, `targets`, and `[]` as extra argument
    # iterables to zip with `sentences`; since `[]` is empty, map stopped
    # immediately and the function always returned [].  Each sentence must
    # instead be paired with the full target list.
    return [create_sentence_conTextMarkup(s, targets) for s in sentences]
In [ ]:
# Open (or create) the expanded-terms sqlite database for use by later cells.
# NOTE(review): connection and cursor are left open for the notebook session.
c2 = sq.connect(os.path.join(RADDIR,"radlex_terms_expanded.sqlite"))
cu2 = c2.cursor()