In [1]:
import json
import os

import processors
from processors import *
from processors.visualization import JupyterVisualizer as viz

# Print the installed py-processors version so the notebook records what it was run against.
print(processors.__version__)


3.2.1

In [2]:
# Connect to a processors server on port 8886.
# NOTE(review): keep_alive=True presumably leaves the server running once this
# object is discarded -- confirm against the py-processors documentation.
API = ProcessorsAPI(port=8886, keep_alive=True)


INFO - Connection with server established!
INFO - Server version meets recommendations (v3.1.0)

In [3]:
# The demo root is the parent of the directory the notebook is run from;
# the raw documents and the grammar files live in sibling folders beneath it.
demo_dir = os.path.dirname(os.getcwd())
docs_dir = os.path.join(demo_dir, "docs")
grammar_dir = os.path.join(demo_dir, "grammar")

# Echo the resolved locations so a wrong working directory is easy to spot.
print("DEMO DIRECTORY:\t{}".format(demo_dir))
print("DOCS DIRECTORY:\t{}".format(docs_dir))
print("GRAMMAR DIRECTORY:\t{}".format(grammar_dir))


DEMO DIRECTORY:	/Users/gus/repos/odin-tutorial/domains/agro
DOCS DIRECTORY:	/Users/gus/repos/odin-tutorial/domains/agro/docs
GRAMMAR DIRECTORY:	/Users/gus/repos/odin-tutorial/domains/agro/grammar

In [4]:
# TODO: Add to py-processors
def display_mentions(mentions):
    """Print a document identifier for each mention, then render the mention.

    :param mentions: iterable of mention objects exposing a ``document`` attribute.

    The identifier is the document's ``id`` when it is truthy; otherwise the
    Python object id of the document is printed instead.
    """
    for mention in mentions:
        source_doc = mention.document
        # Fall back to object identity for documents that carry no id.
        print(source_doc.id or id(source_doc))
        viz.display_mention(mention)
        
def deserialize_document(doc, serialized_docs_dir=None):
    """Load a single serialized ``Document`` from a ``.json`` file.

    :param doc: name (or path) of the JSON file to load.
    :param serialized_docs_dir: optional directory that ``doc`` is relative to;
        when omitted, ``doc`` is opened as given.
    :returns: the loaded ``Document`` with its ``id`` set to the file name
        without extension, or ``None`` when ``doc`` does not end in ``.json``.
    """
    if not doc.endswith(".json"):
        return None
    fpath = os.path.join(serialized_docs_dir, doc) if serialized_docs_dir else doc
    with open(fpath) as infile:
        doc_dict = json.load(infile)
    document = Document.load_from_JSON(doc_dict)
    # py-processors v3.0.3 wasn't retrieving the doc id on load (fixed upstream
    # but not released; a test is still to be added), so derive it from the
    # file name here.
    document.id = os.path.splitext(os.path.basename(doc))[0]
    return document

def deserialize_documents(serialized_docs_dir):
    """Yield a ``Document`` for every ``.json`` file in ``serialized_docs_dir``.

    :param serialized_docs_dir: directory containing serialized documents.

    Files are visited in sorted name order because ``os.listdir`` makes no
    ordering guarantee; entries without a ``.json`` extension are skipped.
    """
    for json_file in sorted(os.listdir(serialized_docs_dir)):
        if json_file.endswith(".json"):
            # Pass the directory explicitly: the loader has no other way to
            # know where the file lives.
            yield deserialize_document(json_file, serialized_docs_dir)

# TODO: add to py-processors
def extract_from_documents(documents, rules, API):
    """Run Odin extraction over each document and yield mentions in order.

    :param documents: iterable of documents to extract from.
    :param rules: the grammar passed through to ``API.odin.extract_from_document``.
    :param API: object exposing ``odin.extract_from_document(doc, rules)``.

    Mentions are yielded document by document; within a document they are
    ordered by (document id, sentence, start).
    """
    def reading_order(mention):
        return (mention.document.id, mention.sentence, mention.start)

    for document in documents:
        found = list(API.odin.extract_from_document(document, rules))
        found.sort(key=reading_order)
        for mention in found:
            yield mention

In [5]:
# os.listdir returns entries in arbitrary order; sort so that the order of
# raw_text_files (and of anything indexed from it later) is the same on every run.
raw_text_files = sorted(
    os.path.join(docs_dir, fname)
    for fname in os.listdir(docs_dir)
    if fname.endswith(".txt")
)

In [6]:
# Annotate each raw text file. The files are opened in a `with` block so the
# handles are closed promptly instead of being left to the garbage collector.
documents = []
for txt_path in raw_text_files:
    with open(txt_path) as infile:
        documents.append(API.fastnlp.annotate(infile.read()))

In [7]:
# Absolute prefix that grammar/master.yml was written with (see the `prefix`
# entry under `vars:` in the file).
author_prefix = "/Users/gus/repos/odin-tutorial"
# demo_dir is <repo>/domains/agro, so the repository root is two levels up.
# Deriving the prefix lets the grammar imports resolve on any checkout; assign
# my_prefix by hand instead if the files live elsewhere.
my_prefix = os.path.dirname(os.path.dirname(demo_dir))
# grammar_dir was already defined together with demo_dir and docs_dir.
with open(os.path.join(grammar_dir, "master.yml"), "r") as infile:
    master_file = infile.read().replace(author_prefix, my_prefix)
print(master_file)


taxonomy:
  - Entity:
    - Quantity
    - Percentage
    - SoilComponent
    - Crop
    - Treatment:
      - Nitrogen
  - Event:
    - TreatmentApplication
    - SoilComposition

vars:
    soilType: "sand|silt|clay|peat|loam"
    prefix: file:/Users/gus/repos/odin-tutorial

rules:
  - import: ${prefix}/domains/agro/grammar/entities.yml

  - import: ${prefix}/domains/agro/grammar/events.yml


In [8]:
# Show the event rules imported by master.yml; `with` closes the file handle.
with open(os.path.join(grammar_dir, "events.yml"), "r") as infile:
    print(infile.read())


#
# treatment was applied
#
- name: treatment-applied-passive
  priority: 10
  label: TreatmentApplication
  pattern: |
    trigger = [lemma=/apply|perform/ & tag=/^V/]
    treatment:Treatment = nsubjpass
    quantity:Quantity = prep_at|prep_followed_by

#
# soil composition
#
- name: soil-composition-surface
  priority: 10
  label: SoilComposition
  type: token
  pattern: |
    (?<soil> [tag = "NNP"]* @SoilComponent)
    "-LRB-"
    (","? (?<component> @Percentage @SoilComponent))+
    "-RRB-"


In [9]:
# Apply the master grammar to every annotated document and materialize the
# resulting generator so the mentions can be filtered and reused below.
mentions = list(extract_from_documents(documents, master_file, API))

In [10]:
# Display only the mentions that match the "Event" label.
# NOTE(review): in the taxonomy printed from master.yml, Event is the parent of
# TreatmentApplication and SoilComposition; matches() presumably honors that
# hierarchy -- confirm.
display_mentions([m for m in mentions if m.matches("Event")])


4372175896
Blocks were arranged to accommodate soil and topographic conditions as follows : block one -LRB- b1 -RRB- on a south facing slope with a SoilCompositionSoilCompositionTuscola loamsoil -LRB- SoilComposition28 % sandcomponent , SoilComposition60 % siltcomponent , SoilComposition12 % claycomponent -RRB- ; block two -LRB- b2 -RRB- on a slight north facing slope with Bookton sand -LRB- 70 % sand , 24 % silt , 6 % clay -RRB- ; and block three -LRB- b3 -RRB- with a slight rise in the centre on a Fox sand -LRB- 64 % sand , 29 % silt , 7 % clay -RRB- .
4372175896
Blocks were arranged to accommodate soil and topographic conditions as follows : block one -LRB- b1 -RRB- on a south facing slope with a Tuscola loam -LRB- 28 % sand , 60 % silt , 12 % clay -RRB- ; block two -LRB- b2 -RRB- on a slight north facing slope with SoilCompositionSoilCompositionBookton sandsoil -LRB- SoilComposition70 % sandcomponent , SoilComposition24 % siltcomponent , SoilComposition6 % claycomponent -RRB- ; and block three -LRB- b3 -RRB- with a slight rise in the centre on a Fox sand -LRB- 64 % sand , 29 % silt , 7 % clay -RRB- .
4372175896
Blocks were arranged to accommodate soil and topographic conditions as follows : block one -LRB- b1 -RRB- on a south facing slope with a Tuscola loam -LRB- 28 % sand , 60 % silt , 12 % clay -RRB- ; block two -LRB- b2 -RRB- on a slight north facing slope with Bookton sand -LRB- 70 % sand , 24 % silt , 6 % clay -RRB- ; and block three -LRB- b3 -RRB- with a slight rise in the centre on a SoilCompositionSoilCompositionFox sandsoil -LRB- SoilComposition64 % sandcomponent , SoilComposition29 % siltcomponent , SoilComposition7 % claycomponent -RRB- .
4372175896
TreatmentApplicationNitrogenNitrogentreatment was appliedTRIGGER at Quantity45 kg haquantity followed by 45 kg ha as a sidedress when tomato plants were established .
4372175896
TreatmentApplicationNitrogenNitrogentreatment was appliedTRIGGER at 45 kg ha followed by Quantity45 kg haquantity as a sidedress when tomato plants were established .

In [11]:
# Render the graph view of the third sentence of the first annotated document.
# NOTE(review): which file documents[0] corresponds to depends on the order of
# raw_text_files -- confirm this is the intended sentence.
viz.display_graph(documents[0].sentences[2])



In [12]:
# An ad-hoc grammar passed directly as a string: three token-pattern rules,
# each producing mentions labelled Location.
example = """
    
# capturing locations

- name: "location-1"
  label: Location
  type: token
  pattern: |
    [lemma=/^(north|south|east|west)/]+ # starts with a direction word
    []{,2} # 0-2 intervening tokens
    [lemma=axis]
    
- name: "location-2"
  label: Location
  type: token
  pattern: |
    [tag=/^NNP/ & incoming=prep_near] # proper noun w/ incoming "near" prep
    ("," [tag=NNP])*

- name: "location-3"
  label: Location
  type: token
  pattern: |
    [tag=/^NNP/]* 
    [tag=/^NNP/ & incoming=prep_at] 
    [tag=/^NNP/]*
"""


# Run the example rules over the already-annotated documents and show every
# mention they produce.
mns = list(extract_from_documents(documents, example, API))
display_mentions(mns)


4522568280
Experiments were located in the same site each year at the LocationHorticultural Experiment Station near Simcoe , Ontario , Canada .
4522568280
Experiments were located in the same site each year at the Horticultural Experiment Station near LocationSimcoe , Ontario , Canada .
4522568280
The experimental site was oriented on a Locationnorth -- south axis .