In [1]:
from chemdataextractor import Document
from chemdataextractor.model import Compound
from chemdataextractor.doc import Paragraph, Heading
Let's create a simple example document with a single heading followed by a single paragraph:
In [2]:
# Build the example document from its two elements: a heading, then a paragraph.
heading = Heading(u'Synthesis of 2,4,6-trinitrotoluene (3a)')
paragraph = Paragraph(u'The procedure was followed to yield a pale yellow solid (b.p. 240 °C)')
d = Document(heading, paragraph)
Let's see what this looks like:
In [3]:
# A bare expression on the last line of a cell is shown as the cell's output.
d
Out[3]:
In [4]:
# Serialize the document's records; the return value is shown as the cell's output.
d.records.serialize()
Out[4]:
In [5]:
from chemdataextractor.model import BaseModel, StringType, ListType, ModelType
class BoilingPoint(BaseModel):
    """Model for a single boiling point, stored as two string fields."""

    # Numeric part of the measurement, kept as a string.
    value = StringType()
    # Unit part of the measurement, kept as a string.
    units = StringType()


# Attach a list-of-BoilingPoint field to the existing Compound model class.
Compound.boiling_points = ListType(ModelType(BoilingPoint))
In [6]:
import re
from chemdataextractor.parse import R, I, W, Optional, merge
# Grammar for phrases such as "b.p. 240 °C" or "boiling point 240 °C".
#
# The regex patterns are raw strings: "\." and "\d" are not valid Python string
# escapes, and writing them in a normal literal triggers an invalid-escape
# warning on modern Python. The resulting pattern text is unchanged.

# Label that introduces the measurement: a single token like "bp" / "b.p."
# (case-insensitive via re.I), or the two tokens "boiling" "point".
# .hide() is applied to the whole alternative.
prefix = (R(r'^b\.?p\.?$', re.I) | (I(u'boiling') + I(u'point'))).hide()

# Degree sign, optionally followed by C, F or K with an optional trailing dot;
# tagged "units", with the merge action applied to the match.
units = (W(u'°') + Optional(R(r'^[CFK]\.?$')))(u'units').add_action(merge)

# Integer or decimal number token, tagged "value".
value = R(r'^\d+(\.\d+)?$')(u'value')

# Complete phrase: prefix, then value, then units, tagged "bp".
bp = (prefix + value + units)(u'bp')
In [7]:
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first
class BpParser(BaseParser):
    """Parser whose root grammar is ``bp`` and which emits Compound records."""

    root = bp

    def interpret(self, result, start, end):
        """Yield one Compound carrying the boiling point read from ``result``.

        The ``value`` and ``units`` texts are looked up in ``result`` with
        XPath, and the first hit of each is used. ``start`` and ``end`` are
        accepted but not used in this method.
        """
        value_text = first(result.xpath('./value/text()'))
        units_text = first(result.xpath('./units/text()'))
        boiling_point = BoilingPoint(value=value_text, units=units_text)
        yield Compound(boiling_points=[boiling_point])
In [8]:
# Set on the Paragraph class itself (not on an instance): the parser list becomes just one BpParser.
Paragraph.parsers = [BpParser()]
In [9]:
# Build the document again from the same heading and paragraph text, then
# serialize its records; the final expression is shown as the cell's output.
heading = Heading(u'Synthesis of 2,4,6-trinitrotoluene (3a)')
paragraph = Paragraph(u'The procedure was followed to yield a pale yellow solid (b.p. 240 °C)')
d = Document(heading, paragraph)
d.records.serialize()
Out[9]: