ETK 101

Outline:

  • important concepts: documents, extractors, segments, and provenance
  • each concept explained via a worked example

In [ ]:
import json
from etk.etk import ETK
from etk.extractors.glossary_extractor import GlossaryExtractor
from etk.etk_module import ETKModule

In [23]:
# Create the ETK instance that the later cells use to build documents,
# load glossaries, and obtain the default tokenizer.
etk = ETK()

In [24]:
# Toy input document: a list of projects, each with a name and a free-text
# description that mentions the people who worked on it.
sample_input = {
    "projects": [
        {
            "name": "etk",
            "description": "version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others.",
        },
        {
            "name": "rltk",
            "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.",
        },
    ],
}

In [25]:
# Wrap the raw dict in an ETK document so that segments can be selected from
# it and extraction results stored back into it.
# NOTE(review): no schema is supplied here, which is presumably why the
# "Schema not found." UserWarning appears in the output — confirm.
doc = etk.create_document(sample_input)


/Users/pszekely/github/etk-pedro/etk/document.py:42: UserWarning: Schema not found.
  warnings.warn("Schema not found.")

In [26]:
# Glossary-driven extractor for person names. Its label "name_extractor" is
# what shows up as the "method" of each extraction provenance record.
# NOTE(review): the glossary path is relative, so this presumably needs the
# notebook to run from the repository root — confirm.
# NOTE(review): ngrams=1 presumably restricts matches to single tokens — confirm.
name_extractor = GlossaryExtractor(
    etk.load_glossary("./examples/hello_world/names.txt"),
    "name_extractor",
    etk.default_tokenizer,
    case_sensitive=False,
    ngrams=1,
)

In [27]:
# Select the parts of the document to work on, using path expressions:
# - descriptions: the text of each project's description (extraction input)
# - projects: each project object (where the extracted values get stored)
descriptions = doc.select_segments("projects[*].description")
projects = doc.select_segments("projects[*]")

In [28]:
# Pair each description segment with its project segment, run the name
# extractor over the description, and store the results on the project
# under the "members" key.
# NOTE(review): zip() assumes both selections have the same length and order,
# i.e. every project has a description — confirm before using on real data.
for d, p in zip(descriptions, projects):
    names = doc.extract(name_extractor, d)
    p.store(names, "members")

In [29]:
# Pretty-print the whole document: the original projects with their new
# "members" lists, plus the "provenances" records describing where each value
# was extracted from and where it was stored.
print(json.dumps(doc.value, indent=2))


{
  "projects": [
    {
      "name": "etk",
      "description": "version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others.",
      "members": [
        "Runqi",
        "Dongyu",
        "Sylvia",
        "Amandeep"
      ]
    },
    {
      "name": "rltk",
      "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.",
      "members": [
        "Pedro",
        "Mayank",
        "Yixiang"
      ]
    }
  ],
  "provenances": [
    {
      "@type": "extraction_provenance_record",
      "@id": 0,
      "method": "name_extractor",
      "confidence": 1.0,
      "origin_record": [
        {
          "path": "projects.[0].description",
          "start_char": 33,
          "end_char": 38
        }
      ]
    },
    {
      "@type": "extraction_provenance_record",
      "@id": 1,
      "method": "name_extractor",
      "confidence": 1.0,
      "origin_record": [
        {
          "path": "projects.[0].description",
          "start_char": 40,
          "end_char": 46
        }
      ]
    },
    {
      "@type": "extraction_provenance_record",
      "@id": 2,
      "method": "name_extractor",
      "confidence": 1.0,
      "origin_record": [
        {
          "path": "projects.[0].description",
          "start_char": 48,
          "end_char": 54
        }
      ]
    },
    {
      "@type": "extraction_provenance_record",
      "@id": 3,
      "method": "name_extractor",
      "confidence": 1.0,
      "origin_record": [
        {
          "path": "projects.[0].description",
          "start_char": 56,
          "end_char": 64
        }
      ]
    },
    {
      "@type": "storage_provenance_record",
      "doc_id": null,
      "field": null,
      "destination": "projects.[0].members",
      "extraction_provenance_record_id": [
        0,
        1,
        2,
        3
      ]
    },
    {
      "@type": "extraction_provenance_record",
      "@id": 4,
      "method": "name_extractor",
      "confidence": 1.0,
      "origin_record": [
        {
          "path": "projects.[1].description",
          "start_char": 39,
          "end_char": 44
        }
      ]
    },
    {
      "@type": "extraction_provenance_record",
      "@id": 5,
      "method": "name_extractor",
      "confidence": 1.0,
      "origin_record": [
        {
          "path": "projects.[1].description",
          "start_char": 46,
          "end_char": 52
        }
      ]
    },
    {
      "@type": "extraction_provenance_record",
      "@id": 6,
      "method": "name_extractor",
      "confidence": 1.0,
      "origin_record": [
        {
          "path": "projects.[1].description",
          "start_char": 54,
          "end_char": 61
        }
      ]
    },
    {
      "@type": "storage_provenance_record",
      "doc_id": null,
      "field": null,
      "destination": "projects.[1].members",
      "extraction_provenance_record_id": [
        4,
        5,
        6
      ]
    }
  ]
}

In [30]: