In [1]:
import os

import processors
from processors import *
from processors.visualization import JupyterVisualizer as viz

print(processors.__version__)


3.2.1

Initialize the NLP API


In [2]:
# Connect to the processors server on port 8886; every later annotate/extract call goes through this object.
# NOTE(review): keep_alive=True presumably leaves the server running after this kernel exits — confirm against py-processors docs.
API = ProcessorsAPI(port=8886, keep_alive=True)


INFO - Connection with server established!
INFO - Server version meets recommendations (v3.1.0)

In [3]:
# The demo root is the parent of the directory the notebook runs from;
# the raw documents and the grammar files live directly beneath it.
demo_dir = os.path.dirname(os.getcwd())
docs_dir = os.path.join(demo_dir, "docs")
grammar_dir = os.path.join(demo_dir, "grammar")

# Echo the resolved locations so a wrong working directory is obvious early.
print("\n".join([
    "DEMO DIRECTORY:\t{}".format(demo_dir),
    "DOCS DIRECTORY:\t{}".format(docs_dir),
    "GRAMMAR DIRECTORY:\t{}".format(grammar_dir),
]))


DEMO DIRECTORY:	/Users/gus/repos/odin-tutorial/domains/molecular-biology
DOCS DIRECTORY:	/Users/gus/repos/odin-tutorial/domains/molecular-biology/docs
GRAMMAR DIRECTORY:	/Users/gus/repos/odin-tutorial/domains/molecular-biology/grammar

In [4]:
# Collect every .txt file in the docs directory.
# os.listdir returns entries in arbitrary order, so sort the paths: this keeps
# positional lookups such as documents[0] pointing at the same file on every run.
raw_text_files = sorted(os.path.join(docs_dir, f) for f in os.listdir(docs_dir) if f.endswith(".txt"))

In [5]:
# Annotate each raw text file with the BioNLP pipeline.
# Each file is read inside a context manager so its handle is closed right away
# instead of being left for the garbage collector.
documents = []
for text_path in raw_text_files:
    with open(text_path) as infile:
        documents.append(API.bionlp.annotate(infile.read()))

In [6]:
# TODO: Add to py-processors
def display_mentions(mentions):
    """Print a document identifier for each mention, then render the mention.

    The document's ``id`` attribute is printed when it is truthy; otherwise the
    ``id()`` of the document object is printed as a stand-in.

    :param mentions: iterable of mentions, each exposing a ``document`` attribute
    """
    for mention in mentions:
        document = mention.document
        print(document.id or id(document))
        viz.display_mention(mention)
        
def deserialize_document(doc, serialized_docs_dir=None):
    """Load one serialized Document from a ``.json`` file.

    :param doc: name of (or path to) the JSON file holding the serialized Document
    :param serialized_docs_dir: optional directory that ``doc`` is relative to;
        when omitted, ``doc`` is opened as given
    :returns: the deserialized Document with its ``id`` set to the file's base
        name (extension removed), or ``None`` if ``doc`` does not end in ``.json``
    """
    # Imported locally because the notebook's import cell does not import json.
    import json

    if not doc.endswith(".json"):
        return None
    fpath = os.path.join(serialized_docs_dir, doc) if serialized_docs_dir else doc
    with open(fpath) as infile:
        doc_dict = json.load(infile)
    document = Document.load_from_JSON(doc_dict)
    # FIXED, but not released (ADD TEST)
    # py-processors v3.0.3 wasn't retrieving the doc id
    document.id = os.path.splitext(os.path.basename(doc))[0]
    return document

def deserialize_documents(serialized_docs_dir):
    """Yield a Document for every ``.json`` file found in ``serialized_docs_dir``.

    :param serialized_docs_dir: directory containing the serialized Documents
    """
    for json_file in os.listdir(serialized_docs_dir):
        if json_file.endswith(".json"):
            # Pass the directory along so the file is opened from the right place.
            yield deserialize_document(json_file, serialized_docs_dir)

# TODO: add to py-processors
def extract_from_documents(documents, rules, API):
    """Apply ``rules`` to each document and yield the resulting mentions.

    Mentions are produced one document at a time; within a document they are
    ordered ascending by (document id, sentence, start).

    :param documents: iterable of annotated documents
    :param rules: the rules passed to ``API.odin.extract_from_document``
    :param API: object whose ``odin.extract_from_document(doc, rules)`` returns mentions
    """
    def _reading_order(mention):
        return (mention.document.id, mention.sentence, mention.start)

    for document in documents:
        found = list(API.odin.extract_from_document(document, rules))
        found.sort(key=_reading_order)
        for mention in found:
            yield mention

Linguistic analysis

Take a look at the linguistic annotations automatically produced by the NLP pipeline.


In [7]:
# Render the annotations for the first sentence of the first document.
viz.display_graph(documents[0].sentences[0], css=viz.parse_css)


  • The top row is the list of tokenized words
  • The next row is the list of part of speech (PoS) tags
  • The next row is the list of lemmas (canonicalized forms of each word)
  • The bottom row is the list of named entity labels generated by the named entity recognizer, a statistical model that predicts an entity label for each word (O means no entity label)

Load Odin rules

Let's take a look at our rules...

Master file

The master file contains our taxonomy, variables, and rule set imports. In this example, each set of rules is instantiated with variables. The use of variables allows us to recycle structures and build concise grammars.


In [8]:
# Root of the local odin-tutorial checkout. demo_dir is <root>/domains/<domain>,
# so the root is two levels up; deriving it avoids a machine-specific absolute path.
my_prefix = os.path.dirname(os.path.dirname(demo_dir))
# master.yml has the original author's absolute path baked into its `prefix` variable;
# swap it for the local root so the rule imports resolve on this machine.
with open(os.path.join(grammar_dir, "master.yml"), "r") as infile:
    master_file = infile.read().replace("/Users/gus/repos/odin-tutorial", my_prefix)
# NOTE(review): in the grammar printed below, the stems inhibit|reduc|downreg are labelled
# "PositiveRegulation" and promot|increas|potentiat "NegativeRegulation" — these look swapped; confirm in master.yml.
print(master_file)


taxonomy:
  - Entity:
    - Protein:
      - Mutant
  - Event:
    # Events capture post-translational modifications (PTMs)
    - PTMEvent:
      - Phosphorylation
      - Ubiquitination
    # events that take other events as arguments
    - ComplexEvent:
      - Regulation:
        - PositiveRegulation
        - NegativeRegulation
      - Activation:
        - PositiveActivation
        - NegativeActivation

vars:
  # priorities for entities and events
  simpleEventPriority: 2
  complexEventPriority: 3+
  prefix: file:/Users/gus/repos/odin-tutorial

rules:

  # Entities
  - import: ${prefix}/domains/molecular-biology/grammar/entities.yml
    vars:
      priority: 1

  # Phosphorylation
  - import: ${prefix}/domains/molecular-biology/grammar/ptm_events.yml
    vars:
      priority: ${simpleEventPriority}
      triggerStem: "/(?i)^phosphorylat/"
      eventLabel: "Phosphorylation"

  # Ubiquitination
  - import: ${prefix}/domains/molecular-biology/grammar/ptm_events.yml
    vars:
      priority: ${simpleEventPriority}
      triggerStem: "/(?i)^ubiquitinat/"
      eventLabel: "Ubiquitination"

  # Positive Regulations
  - import: ${prefix}/domains/molecular-biology/grammar/complex_events.yml
    vars:
      priority: ${complexEventPriority}
      triggerStem: "/(?i)^(inhibit|reduc|downreg)/"
      eventLabel: "PositiveRegulation"

  # Negative Regulations
  - import: ${prefix}/domains/molecular-biology/grammar/complex_events.yml
    vars:
      priority: ${complexEventPriority}
      triggerStem: "/(?i)^(promot|increas|potentiat)/"
      eventLabel: "NegativeRegulation"

Apply rules to a set of analyzed Documents


In [9]:
# Run the grammar over every annotated document. extract_from_documents is a generator,
# so materialize it into a list: the cells below filter these mentions several times.
mentions = list(extract_from_documents(documents, master_file, API))

In [10]:
# Show the first sentence's annotations again as a reference for the rule walkthrough that follows.
viz.display_graph(documents[0].sentences[0], css=viz.parse_css)


Step 1: Find entities


In [11]:
# Print the entity rules; the context manager closes the file handle once it has been read.
with open(os.path.join(grammar_dir, "entities.yml")) as rules_file:
    print(rules_file.read())


- name: "protein-1"
  priority: ${priority}
  label: Protein
  type: token
  pattern: |
    [entity="B-Gene_or_gene_product"] [entity="I-Gene_or_gene_product"]* [lemma=mutant]?


In [12]:
# Display only the mentions for which m.matches("Entity") is true.
display_mentions(m for m in mentions if m.matches("Entity"))


4519445280
The ProteinCYLD mutant inhibits the ubiquitination of both TRAF2 and TRAF6 .
4519445280
The CYLD mutant inhibits the ubiquitination of both ProteinTRAF2 and TRAF6 .
4519445280
The CYLD mutant inhibits the ubiquitination of both TRAF2 and ProteinTRAF6 .

Step 2: Find simple events


In [13]:
# Print the PTM event rules; the context manager closes the file handle once it has been read.
with open(os.path.join(grammar_dir, "ptm_events.yml")) as rules_file:
    print(rules_file.read())


# Event rules for post-translational modifications

- name: "${eventLabel}-nominal-1"
  priority: ${priority}
  label: ${eventLabel}
  pattern: |
    trigger = [lemma=${triggerStem} & tag=/^N/]
    patient:Entity = prep_of

- name: "${eventLabel}-verbal-1"
  priority: ${priority}
  label: ${eventLabel}
  pattern: |
    trigger = [lemma=${triggerStem} & tag=/^V/]
    patient:Entity = >nsubj


In [14]:
# Display only the mentions for which m.matches("PTMEvent") is true.
display_mentions(m for m in mentions if m.matches("PTMEvent"))


4519445280
The CYLD mutant inhibits the UbiquitinationubiquitinationTRIGGER of both TRAF2 and ProteinTRAF6patient .
4519445280
The CYLD mutant inhibits the UbiquitinationubiquitinationTRIGGER of both ProteinTRAF2patient and TRAF6 .

Step 3: Find nested events


In [15]:
# Print the complex event rules; the context manager closes the file handle once it has been read.
with open(os.path.join(grammar_dir, "complex_events.yml")) as rules_file:
    print(rules_file.read())


# ComplexEvent rules

- name: "${eventLabel}-nominal-1"
  priority: ${priority}
  label: ${eventLabel}
  pattern: |
    trigger = [lemma=${triggerStem} & tag=/^N/]
    patient:PTMEvent = >prep_of
    agent:Entity = >nn

- name: "${eventLabel}-verbal-1"
  priority: ${priority}
  label: ${eventLabel}
  pattern: |
    trigger = [lemma=${triggerStem} & tag=/^V/]
    patient:PTMEvent = dobj
    agent:Entity = >nsubj


In [16]:
# Display only the mentions for which m.matches("ComplexEvent") is true.
display_mentions(m for m in mentions if m.matches("ComplexEvent"))


4519445280
The PositiveRegulationProteinCYLD mutantagent inhibitsTRIGGER the Ubiquitinationubiquitination of both TRAF2patient and TRAF6 .
4519445280
The PositiveRegulationProteinCYLD mutantagent inhibitsTRIGGER the Ubiquitinationubiquitination of both TRAF2 and TRAF6patient .