In [1]:
import os, json
from pprint import pprint
In [111]:
# Load the unparsed dictionary entries. The file holds Cyrillic text, so the
# encoding is pinned instead of relying on the platform's locale default.
with open('verbs_unparsed.json', encoding='utf-8') as f:
    chunks = json.load(f)
In [112]:
# Normalise every entry: turn the '|'-separated stem field into a list and trim
# surrounding punctuation/whitespace from each stem and from the lemma.
# Not re-runnable as is: after one pass 'stem' is a list and has no .split().
for ch in chunks:
    raw_stems = ch['stem'].split('|')
    ch['stem'] = [part.strip('.,;*()[]:&?! ') for part in raw_stems]
    ch['lex'] = ch['lex'].strip('.,;*()[]:&?! ')
In [113]:
# Sort the indices of verb entries (gramm == 'V') into four buckets, by how
# many stems an entry lists and whether any stem still contains a '/'.
verbs = []       # fewer than two stems, no '/'
undefined = []   # fewer than two stems, '/' present
twos = []        # two or more stems, no '/'
undef_twos = []  # two or more stems, '/' present
for i, entry in enumerate(chunks):
    if entry['gramm'] != 'V':
        continue
    has_slash = '/' in ''.join(entry['stem'])
    if len(entry['stem']) < 2:
        bucket = undefined if has_slash else verbs
    else:
        bucket = undef_twos if has_slash else twos
    bucket.append(i)
len(verbs)
Out[113]:
In [114]:
# Sizes of the remaining buckets, one per line: twos, undef_twos, undefined.
for bucket in (twos, undef_twos, undefined):
    print(len(bucket))
In [115]:
# Single-stem entries: the only stem holds alternatives separated by '//'.
for i in undefined:
    variants = chunks[i]['stem'][0].split('//')
    chunks[i]['stem'] = [v.strip('.') for v in variants]

# Multi-stem entries: keep the first stem and split the second one on './/.'.
# NOTE(review): stems after the second are not carried over — confirm intended.
for i in undef_twos:
    first, second = chunks[i]['stem'][0], chunks[i]['stem'][1]
    chunks[i]['stem'] = [first] + second.split('.//.')
    # Report entries in which a '//' is still present after the split.
    if '//' in ''.join(chunks[i]['stem']):
        print(i)
        pprint(chunks[i])
In [116]:
# Multi-stem entries whose *first* stem still contains '//': split that stem
# into its alternatives (trimming '.') and keep the remaining stems unchanged.
# The affected entries are found from the data instead of a hard-coded index
# list, so the cell keeps working when the input file changes.
# NOTE(review): assumes the former literal indices were exactly the entries
# with '//' in their first stem — confirm against the current data.
for i in undef_twos:
    stems = chunks[i]['stem']
    if '//' in stems[0]:
        chunks[i]['stem'] = [s.strip('.') for s in stems[0].split('//')] + stems[1:]
In [117]:
# Character-for-character orthography mapping applied to lemmas and stems.
# None of the replacement characters is itself a key, so a single translate()
# pass gives the same result as chaining the four replace() calls.
orthography = str.maketrans({'ң': 'ӈ', 'қ': 'ӄ', 'л': 'ԓ', "'": 'ʼ'})
for chunk in chunks:
    chunk['lex'] = chunk['lex'].translate(orthography)
    stems = [s.translate(orthography) for s in chunk['stem']]
    # Drop one trailing 'ы'. endswith() replaces the s[-1] test so that an
    # empty stem is left alone instead of raising IndexError.
    chunk['stem'] = [s[:-1] if s.endswith('ы') else s for s in stems]
In [118]:
recessive = set('уюи')
# Each rule is (trigger, letter_a, letter_b). A rule fires when the first stem
# contains the trigger and at least one character from `recessive`. Then, in
# every stem of the entry, letter_a becomes '%{Æ%}' if that stem has a
# `recessive` character, otherwise letter_b does.
# The rules run in this order, and the second one sees the first one's output.
rules = (('э', 'э', 'а'), ('ԓе', 'е', 'я'))
for chunk in chunks:
    for trigger, letter_a, letter_b in rules:
        head = chunk['stem'][0]
        if trigger in head and recessive.intersection(head):
            chunk['stem'] = [
                s.replace(letter_a, '%{Æ%}') if recessive.intersection(s)
                else s.replace(letter_b, '%{Æ%}')
                for s in chunk['stem']
            ]
In [119]:
# Output location for the generated lexc files, relative to the notebook.
folder = 'lexicons'
dict_file = 'verbs.lexc'
# exist_ok=True replaces the separate exists()/mkdir() pair, which could fail
# if the directory appeared between the check and the creation.
os.makedirs(os.path.join('..', folder), exist_ok=True)
# Start the lexicon file afresh ('w' truncates) with its LEXICON header.
# UTF-8 is pinned because the entries appended later contain non-ASCII letters.
with open(os.path.join('..', folder, dict_file), 'w', encoding='utf-8') as f:
    f.write('\nLEXICON Verbs\n')
In [120]:
# One lexc line per entry in `verbs`: "lemma:stem V ; ! translation",
# using the entry's first stem.
to_write = [
    '{}:{} V ; ! {}'.format(chunks[i]['lex'], chunks[i]['stem'][0], chunks[i]['trans_ru'])
    for i in verbs
]
# Extra newline element so that the joined text ends with an empty line.
to_write.append('\n')
In [121]:
# Append the collected entries below the header of the lexicon file.
# UTF-8 is pinned: the entries contain letters that many locale default
# encodings cannot represent.
# NOTE: append mode means re-running this cell alone duplicates the entries.
with open(os.path.join('..', folder, dict_file), 'a', encoding='utf-8') as f:
    f.write('\n'.join(to_write))
In [122]:
# Try to collapse the stems of every multi-stem / ambiguous entry into one
# form by rewriting known alternations. Entries whose stems become identical
# are queued for writing; the rest are collected in `still_not`.
# Characters are read with slices (s[:1], s[1:2]) rather than indices, and the
# second stem is only read when it exists, so short or single stems no longer
# raise IndexError. Results are unchanged for inputs that worked before.
to_write = []
still_not = []
nr = set('нр')
tn = set('тн')
for i in twos+undefined+undef_twos:
    x = chunks[i]
    stems = x['stem']
    # First stem is one character longer than the second and has 'ы' in second
    # position: put '%{ы%}' after the first character of every stem, replacing
    # the 'ы' where a stem has one there.
    if (len(stems) > 1
            and len(stems[0]) - len(stems[1]) == 1
            and stems[0][1:2] == 'ы'):
        x['stem'] = [
            s[:1] + '%{ы%}' + (s[2:] if s[1:2] == 'ы' else s[1:])
            for s in stems
        ]
    # Second characters across the stems are exactly {'ы', 'ъ'}: replace them.
    if {s[1:2] for s in x['stem']} == set('ыъ'):
        x['stem'] = [s[:1] + '%{ы%}' + s[2:] for s in x['stem']]
    # Initial characters across the stems are exactly {'н', 'р'}.
    if {s[:1] for s in x['stem']} == nr:
        x['stem'] = ['%{R%}' + s[1:] for s in x['stem']]
    # Initial characters across the stems are exactly {'т', 'н'}.
    if {s[:1] for s in x['stem']} == tn:
        x['stem'] = ['т' + s[1:] for s in x['stem']]
    if len(set(x['stem'])) == 1:
        to_write.append('{}:{} V ; ! {}'.format(x['lex'], x['stem'][0], x['trans_ru']))
    else:
        still_not.append(x)
print(len(to_write))
len(still_not)
Out[122]:
In [123]:
# If any stem of an unresolved entry ends in 'ык', cut that suffix off.
# The suffix is removed explicitly: str.strip('ык') treats its argument as a
# set of characters and strips them from BOTH ends, which also ate leading or
# extra trailing 'к'/'ы' letters that belong to the stem.
for x in still_not:
    if any(s.endswith('ык') for s in x['stem']):
        x['stem'] = [s[:-2] if s.endswith('ык') else s for s in x['stem']]
In [124]:
# Endings by which the second stem may extend the first one.
endings = 'ԓьэт тку ԓьат тко ыԓьэт ытку ыԓьат ытко ръу ръо ыръу ыръо мго ымго'.split()
for x in still_not:
    for end in endings:
        first, second = x['stem'][0], x['stem'][1]
        if second != first + end:
            continue
        # The second stem is the first plus this ending: cut the ending from
        # every stem of the entry that carries it.
        cut = len(end)
        x['stem'] = [s[:-cut] if s.endswith(end) else s for s in x['stem']]
In [ ]:
# Unfinished scratch cell: it held only this loop header, which is a
# SyntaxError when the notebook is run top to bottom. Kept as an explicit
# no-op until the intended processing is written (or the cell is deleted).
for x in still_not:
    pass
In [125]:
# Queue the previously unresolved entries whose stems have now collapsed
# into a single form.
to_write.extend(
    '{}:{} V ; ! {}'.format(entry['lex'], entry['stem'][0], entry['trans_ru'])
    for entry in still_not
    if len(set(entry['stem'])) == 1
)
In [126]:
# Manual inspection: show a slice of the unresolved entries together with the
# number of distinct stems each one still has.
for entry in still_not[50:70]:
    pprint(entry)
    distinct_stems = set(entry['stem'])
    print(len(distinct_stems))
In [16]:
# One lexc line per stem for every multi-stem / ambiguous entry.
# Stems are iterated directly: the previous inner loop reused the outer loop
# variable `i` as its own index, which worked but hid which `i` was meant.
to_write = []
for i in twos+undefined+undef_twos:
    x = chunks[i]
    for stem in x['stem']:
        to_write.append('{}:{} V ; ! {}'.format(x['lex'], stem, x['trans_ru']))
In [128]:
# Append the collected lines to verbt.lexc. UTF-8 is pinned because the
# entries contain letters that many locale default encodings cannot represent.
# NOTE: the file is opened in append mode and the text has no trailing
# newline, so re-running this cell adds a second copy glued to the last line.
with open(os.path.join('..', folder, 'verbt.lexc'), 'a', encoding='utf-8') as f:
    f.write('\n'.join(to_write))
In [129]:
# Write every distinct stem of the entries that are still unresolved (more
# than one distinct stem, or none) to verbu.lexc, one lexc line per stem.
to_write = []
for x in still_not:
    # sorted() fixes the order: iterating a bare set of strings varies between
    # interpreter runs, which made the output file differ from run to run.
    distinct_stems = sorted(set(x['stem']))
    if len(distinct_stems) != 1:
        for stem in distinct_stems:
            to_write.append('{}:{} V ; ! {}'.format(x['lex'], stem, x['trans_ru']))
# UTF-8 is pinned because the entries contain non-ASCII letters.
with open(os.path.join('..', folder, 'verbu.lexc'), 'w', encoding='utf-8') as f:
    f.write('\n'.join(to_write))
In [ ]: