In [2]:
# add ieml library to kernel path
import sys
sys.path.insert(0, '..')
The ieml library exposes a parser (ieml.usl.parser) that parses ieml strings (including older versions) and returns a normalised ieml.usl.USL object or a ieml.dictionary.Script (a morpheme). For the moment, only ieml.usl.Word instances are valid ieml.usl.USL objects.
An ieml.usl.USL can be checked for coherency with ieml.usl.USL.check(). This check should eventually be exposed as a strict mode of the parser.
In [3]:
from ieml.usl.usl import usl

# Parse the same word written with its polymorpheme content in two different
# orders: the parser normalises both spellings to the same USL object.
WORD = "[E:.b.E:B:.- E:S:. (E:.-wa.-t.o.-' E:.-'wu.-S:.-'t.o.-',)(a.T:.-) > ! E:.l.- (E:.wo.- E:S:.-d.u.-')]"
WORD_REORDERED = "[E:.b.E:B:.- E:S:. (E:.-'wu.-S:.-'t.o.-', E:.-wa.-t.o.-' )(a.T:.-) > ! E:.l.- (E:.wo.- E:S:.-d.u.-')]"

u = usl(WORD)
u.check()  # verify internal coherency of the parsed word
print(u)

u1 = usl(WORD_REORDERED)
u1.check()
print(u1)

# Both spellings must normalise to the same word.
assert u1 == u
The IEML lexicons are stored on GitHub; they have to be downloaded first.
In [4]:
from ieml.ieml_database import GitInterface, IEMLDatabase
# Clone/refresh the IEML lexicon repository from its remote.
gitdb = GitInterface()
gitdb.pull()  # download the database into the ~/.cache/ieml/ folder
print(gitdb)
# instantiate an ieml.ieml_database.IEMLDatabase from the downloaded git repository
db = IEMLDatabase(folder=gitdb.folder)
print(db)
In [8]:
morphs = db.list(type='morpheme')[:100]
The ieml.ieml_database.IEMLDatabase is responsible for reading and writing the associations between ieml.usl.USL / ieml.dictionary.Script objects and their translations on disk. The values are stored as rows, in one file per USL. The format of the rows is space-separated values (ssv, like csv or tsv).
In [12]:
# Descriptors associate a USL with its translations/comments/tags per language
# (a dict keyed by (ieml, descriptor_type, lang) — see display_usls below).
desc = db.get_descriptors()
desc.get_values_partial(morphs[0])
Out[12]:
In [5]:
w = db.list(type='word', parse=True)[1]
In [6]:
list(w.iter_structure())
Out[6]:
In [4]:
# where the exclamation mark (the marked role) is located
str(u.role)
Out[4]:
In [5]:
print('\n'.join(str(r) + ' ' + str(a.actor) for r, a in u.syntagmatic_fun.actors.items()))
In [6]:
from ieml.usl.constants import ADDRESS_SCRIPTS,NAMES_TO_ADDRESS
# list of the syntagmatic roles, with their address names
print('\n'.join(str(r) + ' ' + NAMES_TO_ADDRESS[r] for r in ADDRESS_SCRIPTS))
In [7]:
from requests import get
from ieml.usl.word import Word
from ieml.usl.usl import usl
def get_word_structure(w: Word, repository: str = "IEMLdev") -> dict:
    """Fetch the JSON structure of a word from the dev.intlekt.io API.

    :param w: the word to look up (its str() form is interpolated in the URL)
    :param repository: database repository to query (default: "IEMLdev",
        the value previously hard-coded)
    :return: the decoded JSON payload (see the schema documented below)
    """
    response = get("https://dev.intlekt.io/api/words/{}/?repository={}".format(str(w), repository))
    # Fail loudly on HTTP errors instead of blindly JSON-decoding an error page.
    response.raise_for_status()
    return response.json()
"""
The structure for any Ieml is :
IemlEntry = {
'ieml': string,
'cardinality': 'singular_sequence' | 'paradigm' | 'root_paradigm',
'class': 'Noun'|'Verb'|'Auxiliary',
'type': 'word' | 'morpheme' | 'polymorpheme' | 'lexeme',
'comments': {'en': [], 'fr': []},
'tags': {'en': [], 'fr': []},
'translations': {'en': [], 'fr': []},
'created': True|False, # if it exists in the db, i.e. at least one of 'comments', 'tags' or 'translations' has a value
'editable': True|False, # if exists in db and not in the main db or doesn't exists in db
'domains': [],
'index': string, # string value to order the usls from each other
'main_table': None, # main table for morpheme
'paradigm': True|False, # is a paradigm ?
'singular_sequences': None|IemlEntry[], # if not a paradigm, None, otherwise the list of the singular sequences (the cells of the table)
}
For Words, we add the following entries :
WordsEntry = IemlEntry + {
'role': string[], # the tree address where to put the '!'
'syntagmatic_function': SyntagmaticFunctionEntry # the tree
}
LexemeEntry = IemlEntry + {
'pm_content': PolyMorphemeEntry,
'pm_flexion': PolyMorphemeEntry
}
PolyMorphemeEntry = IemlEntry + {
'constant': MorphemeEntry[], # the constant of the polymorphemes
'groups': (MorphemeEntry, 0|1|2)[], # the variables with their multiplicities.
}
MorphemeEntry = IemlEntry
The tree structure : a tree of subtype of SyntagmaticFunctionEntry. The nodes are accessed with the actor property.
SyntagmaticFunctionEntry = {
'actor': LexemeEntry, # the lexeme at this node in the tree
'role_full': string[], # the address of this node in the tree
'role': string, # the last value of the address (role_full[-1])
'type': 'ProcessSyntagmaticFunction'| 'DependantQualitySyntagmaticFunction'|'IndependantQualitySyntagmaticFunction',
# There is 3 types of syntagmatic functions, process for verbal frames, dependant for actants (nouns) and independant for adjectives.
}
ProcessSyntagmaticFunctionEntry = SyntagmaticFunctionEntry + {
'valence': 1|2|3,
'initiator': DependantQualitySyntagmaticFunctionEntry,
'recipient': DependantQualitySyntagmaticFunctionEntry, # always None if valence < 2
'interactant': DependantQualitySyntagmaticFunctionEntry, # always None if valence < 3
'cause': DependantQualitySyntagmaticFunctionEntry,
'intention': DependantQualitySyntagmaticFunctionEntry,
'manner': DependantQualitySyntagmaticFunctionEntry,
'time': DependantQualitySyntagmaticFunctionEntry,
'location': DependantQualitySyntagmaticFunctionEntry,
}
DependantQualitySyntagmaticFunctionEntry = SyntagmaticFunctionEntry + {
'independant': IndependantQualitySyntagmaticFunction,
'dependant': DependantQualitySyntagmaticFunctionEntry
}
IndependantQualitySyntagmaticFunction = SyntagmaticFunctionEntry
"""
get_word_structure(usl("[! E:A:. (wa.)]"))
Out[7]:
In [19]:
from itertools import chain
def list_polymorpheme_of_word(w):
    """Return the content and flexion polymorphemes of every node of a word.

    w.syntagmatic_fun is the syntagmatic tree of the word w;
    w.syntagmatic_fun.actors is a dict mapping each role to the syntagmatic
    function present at that point of the tree, so iterating over .values()
    visits every node.  For each node sfun, sfun.actor is its lexeme, which
    carries a content polymorpheme (pm_content) and a flexion polymorpheme
    (pm_flexion).
    """
    word = usl(w)
    assert isinstance(word, Word)
    polymorphemes = []
    for sfun in word.syntagmatic_fun.actors.values():
        polymorphemes.append(sfun.actor.pm_content)
        polymorphemes.append(sfun.actor.pm_flexion)
    return polymorphemes
# List and print every polymorpheme of this example word.
pl = list_polymorpheme_of_word("[! E:A:. (E:.wo.- E:S:.-d.u.-')(b.-S:.A:.-'S:.-'S:.-',) > E:A:. E:A:. (E:.wo.- E:S:.-d.u.-')(k.a.-k.a.-')]")
for pm in pl:
    print(pm)
In [8]:
# parse all words
# List all the words of the database.
# NOTE(review): parse=False returns unparsed entries, so the original
# "parse all words" comment looked stale — confirm what db.list yields here.
usls = db.list(parse=False, type='word')
usls
Out[8]:
In [3]:
from ieml.usl import USL
from ieml.dictionary.script import Script
# the database contains the morphemes and the usls made from morphemes;
# every entry should be either a USL (word) or a Script (morpheme).
# NOTE(review): usls was listed with parse=False above — confirm the entries
# are parsed objects rather than raw strings before relying on this check.
all(isinstance(u, (USL, Script)) for u in usls)
Out[3]:
In [35]:
descriptorsDB = db.get_descriptors()

def display_usls(u):
    """Build a one-line-per-field textual summary of a USL or Script.

    get_values_partial returns a dict {(ieml, descriptor_type, lang): string[]}
    with descriptor_type in ieml.constants.DESCRIPTORS_CLASS
    (['translations', 'comments', 'tags']) and lang in
    ieml.constants.LANGUAGES (['fr', 'en']).
    """
    descriptor = descriptorsDB.get_values_partial(u)
    fields = {}
    for (_, descriptor_type, lang), values in descriptor.items():
        fields['{}_{}'.format(descriptor_type, lang)] = ', '.join(values)
    fields['type'] = u.__class__.__name__
    fields['is_paradigm'] = not u.is_singular
    return str(u) + "".join("\n\t{}: {}".format(k, str(v)) for k, v in fields.items())
In [36]:
from ieml.usl.usl import usl
# usl() parses the string and returns a USL, or a Script if it is a morpheme.
oo_script = usl('O:O:.')
assert isinstance(oo_script, Script)
# Reuse the already-parsed object instead of parsing 'O:O:.' a second time.
print(display_usls(oo_script))
In [ ]: