In [ ]:
import rdflib
import sqlite3 as sq
import functools
import numpy as np
import ipyparallel as ipp
import os
import re
import pprint
from textblob import TextBlob
import itertools
import pickle
import collections
import gzip

In [ ]:
GRABRADLEX = True
GRABREPORTS = True
GRABMARKUPS = False

In [ ]:
RADDIR = os.path.join(os.path.expanduser('~'),'Box Sync', 'GradSchoolStuff', 'MastersProject', 'radlex')
os.path.exists(RADDIR)
print("RADDIR: ", RADDIR)
# this appears to not be used at all
#NLPDIR = os.path.join(os.path.expanduser('~'),'Dropbox','NLP')
#os.path.exists(NLPDIR)
#print("NLPDIR: ", NLPDIR)

In [ ]:
# before this block can be run, execute "ipcluster start -n 4" from the command line
# see more info at http://ipyparallel.readthedocs.org/en/stable/process.html
rc = ipp.Client()

dview = rc[:]
rc.ids

In [ ]:
with dview.sync_imports():
    from textblob import TextBlob
    import numpy
    #import tagObjects
    import pyConTextNLP

In [ ]:
g = rdflib.Graph()
g.load(os.path.join(RADDIR, "Radlex_3.12.owl"))

qres = g.query("""SELECT DISTINCT ?type WHERE {?s a ?type.}""")
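
The result can be iterated directly to inspect which rdf:type values occur in the ontology; a minimal sketch:

In [ ]:
# each row of an rdflib query result binds the SELECT variables in order
for row in qres:
    print(row[0])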

In creating the regular expressions from the RadLex terms, I've learned from sad (and slow) experience that the phrases need word boundaries around them.


In [ ]:
def create_regex(r1, r2):
    """Build a word-boundary-delimited pattern from a preferred name (r1)
    and an optional synonym (r2)."""
    if r2:
        return r"\b%s\b|\b%s\b" % (r1, r2)
    else:
        return r"\b%s\b" % r1
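
A quick check of the generated pattern with a made-up term shows why the \b anchors matter; without them a short term like "rib" would also match inside words such as "described":

In [ ]:
# made-up term, just to illustrate the effect of the word boundaries
pattern = create_regex("rib", "")
print(re.findall(pattern, "rib fracture; no change as described"))  # ['rib']
print(re.findall(r"rib", "rib fracture; no change as described"))   # ['rib', 'rib']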

In [ ]:
print("Grabradlex: ", GRABRADLEX)
if GRABRADLEX:
    g = rdflib.Graph()
    #result = g.parse("http://data.bioontology.org/ontologies/RADLEX/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb&download_format=rdf",
    #                 format="xml")
    
    result = g.parse(os.path.join(RADDIR, 'Radlex3.13.1.owl'), format="xml")
    print("result: ", result)

In [ ]:
import pyConTextNLP.functional.itemData as ID
import pyConTextNLP.functional.conTextItem as CI
import pyConTextNLP.functional.ConTextMarkup as CM

In [ ]:
if GRABRADLEX:
    g = rdflib.Graph()
    #result = g.parse("http://data.bioontology.org/ontologies/RADLEX/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb&download_format=rdf",
    #                 format="xml")
    
    result = g.parse(os.path.join(RADDIR, 'Radlex3.13.1.owl'), format="xml")

    query_results = {}
    types = ["anatomy_metaclass","pathophysiology_metaclass","imaging_observation_metaclass"]
    q1 = """ 

    PREFIX Ontology1447432460: <http://www.owl-ontologies.com/Ontology1447432460.owl#>

    SELECT DISTINCT ?s ?preferred_name ?synonym ?type

    WHERE {

        ?s Ontology1447432460:Preferred_name ?preferred_name. 
        ?s rdf:type ?type.
        OPTIONAL {?s Ontology1447432460:Synonym ?synonym.}

        FILTER  regex(str(?type), "%s")}"""
    #print(q1)

    for t in types:
        q = q1%t
        print(q)
        query_results[t] = g.query(q)

    for t,r in query_results.items():
        print(t,len(r))
        print("*"*42)


    itemData = {}
    for t in types:
        itemData[t] = [CI.create_ConTextItem([r[1],
                                              r[3].split("#")[1],
                                              create_regex(r[1],r[2]),
                                              ""]) \
                    for r in query_results[t]]
    with open(os.path.join(RADDIR,"DBs","radlex.pickle"),"wb") as f0:
        pickle.dump(itemData,f0)
else:
    with open(os.path.join(RADDIR,"DBs","radlex.pickle"),"rb") as f0:
        itemData = pickle.load(f0)

In [ ]:
count = 0
for r in query_results["pathophysiology_metaclass"]:
    print(r[1], r[2])
    count += 1
    if count >= 50:
        break

Do some printing to verify things look as expected.


In [ ]:
for i in itemData['pathophysiology_metaclass'][0:50]:
    print(i)
    #print("LITERAL: %s; RE: %s"%(i.literal, i.re))

In [ ]:
tmp = [r for r in query_results["pathophysiology_metaclass"]
       if "short bowel" in r[1]]
tmp[0][2].toPython()

Define the markup functions using the functional form of pyConTextNLP


In [ ]:
def create_sentence_conTextMarkup(s, targets ):
    markup = CM.create_ConTextMarkup()
    markup = CM.setRawText(markup, s)
    markup = CM.cleanText(markup)
    markup = CM.mark_items_in_text(markup, targets, mode="target")
    markup = CM.pruneMarks(markup)
    markup = CM.dropMarks(markup, category='Exclusion')
    # apply modifiers to any targets within the modifiers scope
    return markup
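
As a sanity check, the pipeline can be run on a single made-up sentence (the sentence and the expected hit here are only illustrative):

In [ ]:
# minimal check on one made-up sentence
m = create_sentence_conTextMarkup("no evidence of pneumothorax.",
                                  itemData['pathophysiology_metaclass'])
print(m.nodes())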

Use IPython parallel to speed up the processing.

I've had difficulty getting functools.partial to build up a robust set of functions with the arguments frozen.
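
For reference, functools.partial freezes arguments onto a function so the map calls below only have to supply the markup objects; a minimal sketch with a toy function:

In [ ]:
# toy sketch of functools.partial freezing keyword arguments
def mark(markup, items, mode):
    return (markup, items, mode)

mark_targets = functools.partial(mark, items=["a", "b"], mode="target")
print(mark_targets("some markup"))  # items and mode are already bound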


In [ ]:
def create_sentence_conTextMarkup_parallel(sentences, targets, mode):
    mark_items_in_text = functools.partial(CM.mark_items_in_text,
                                           items=targets, 
                                           mode=mode)
    dropMarks = functools.partial(CM.dropMarks, category='Exclusion')
    sm = [CM.create_ConTextMarkup() for i in range(len(sentences))]
    sm = dview.map_async(CM.setRawText, sm, sentences).get()
    sm = dview.map_async(CM.cleanText,sm).get()
    sm = dview.map_async(mark_items_in_text,sm).get()
    sm = dview.map_async(CM.pruneMarks,sm).get()
    sm = dview.map_async(dropMarks,sm).get()
    return sm

In [ ]:
all_items = []
for key,items in itemData.items():
    all_items.extend(items)

In [ ]:
len(all_items)

Get Data

Pull all the reports from the database and convert to individual sentences.


In [ ]:
if GRABREPORTS:
    conn = sq.connect(os.path.join(RADDIR,"DBs",
                                   "criticalFindingsAll.sqlite"))
    cursor = conn.cursor()

    # simple regexes to scrub dates and times from the impressions
    r_date = re.compile(r"""([0-9]{1,2}(/[0-9]{1,2})?/[0-9]{2,4})""")
    r_time = re.compile(r"""([0-9]{1,2}:\d\d(pm|am)?)""")
    cursor.execute("""SELECT rowid,impression from reports""")
    data = [(d[0], d[1].lower()) for d in cursor.fetchall()]
    data = [(d[0], r_date.sub("", r_time.sub("", d[1]))) for d in data]

    reports = [d[1] for d in data]

    def report2sentences(report):
        return TextBlob(report).raw_sentences

    sentences = list(itertools.chain.from_iterable(dview.map_async(report2sentences,reports).get()))
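
A quick sanity check of the date/time scrubbing on a made-up impression string:

In [ ]:
# made-up example text, just to exercise the scrubbing regexes
example = "comparison with ct of 12/03/2015 at 10:45am shows no change."
print(r_date.sub("", r_time.sub("", example)))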

Markup sentences with pyConTextNLP items generated from RadLex

Keep only the non-empty markups (those with at least one detected node).


In [ ]:
if GRABMARKUPS:
    allMarkups = create_sentence_conTextMarkup_parallel(sentences, 
                                           all_items,
                                           mode="target")

    detected_all = [p for p in allMarkups if p.nodes()]
    print(len(detected_all))


    with gzip.open(os.path.join(RADDIR,"NLP","DBs","radlex_found_sentences.pickle"),"wb") as f0:
        pickle.dump(detected_all,f0)
else:
    with gzip.open(os.path.join(RADDIR,"NLP","DBs","radlex_found_sentences.pickle"),"rb") as f0:
        detected_all = pickle.load(f0)

In [ ]:
pathoMarkups = create_sentence_conTextMarkup_parallel(sentences,
                                                      itemData['pathophysiology_metaclass'],
                                                      mode="target")

In [ ]:
detected_patho = [p for p in pathoMarkups if p.nodes()]
print(len(detected_patho))

In [ ]:
anatMarkups = create_sentence_conTextMarkup_parallel(sentences,
                                                     itemData['anatomy_metaclass'],
                                                     mode="target")
detected_anat = [p for p in anatMarkups if p.nodes()]
print(len(detected_anat))


In [ ]:
for i in range(20):
    print(detected_all[i].nodes())
    print(detected_all[i].graph["__text"])

In [ ]:
def analyzeReport(report, targets):
    """
    Given an individual radiology report, create the pyConTextNLP
    markup for each of its sentences.
    report: a text string containing the radiology report
    """
    sentences = report2sentences(report)
    markups = [create_sentence_conTextMarkup(s, targets) for s in sentences]
    return markups
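
A sketch of applying analyzeReport to a single report, assuming the reports list from the GRABREPORTS cell above is still in memory:

In [ ]:
# sketch: markup every sentence of the first report with all the RadLex items
report_markups = analyzeReport(reports[0], all_items)
for m in report_markups:
    print(m.nodes())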

In [ ]:
c2 = sq.connect(os.path.join(RADDIR,"radlex_terms_expanded.sqlite"))
cu2 = c2.cursor()