In [1]:
import os, json
from pprint import pprint
In [2]:
# Load the raw, unparsed dictionary dump; each element is one entry chunk
# with at least 'stem', 'lex', 'gramm' and 'trans_ru' fields.
with open('dictionary_all_unparsed.json') as src:
    chunks = json.load(src)
In [3]:
len(chunks)
Out[3]:
In [4]:
# Normalize every entry in place: 'stem' becomes a list (multi-stem fields
# are '|'-separated) and both stems and the lemma are trimmed of stray
# punctuation left over from the source dictionary.
_PUNCT = '.,;*()[]:&?! '
for entry in chunks:
    entry['stem'] = [piece.strip(_PUNCT) for piece in entry['stem'].split('|')]
    entry['lex'] = entry['lex'].strip(_PUNCT)
In [24]:
poss = {}
In [35]:
# Collect indices of adverb entries that have a single stem and no '/'
# spelling variants inside it.
advs = []
for idx, entry in enumerate(chunks):
    if entry['gramm'] != 'ADV':
        continue
    if len(entry['stem']) < 2 and '/' not in ''.join(entry['stem']):
        advs.append(idx)
poss['adv'] = advs
len(advs)
Out[35]:
In [36]:
# Indices of single-stem interjection entries without '/' variants.
intjs = [
    idx
    for idx, entry in enumerate(chunks)
    if entry['gramm'] == 'INTJ'
    and len(entry['stem']) < 2
    and '/' not in ''.join(entry['stem'])
]
poss['intj'] = intjs
len(intjs)
Out[36]:
In [28]:
# Indices of all participle entries.
# NOTE(review): unlike the other POS cells, no single-stem / no-'/' filter
# is applied here — confirm this is intentional.
ptcp = [idx for idx, entry in enumerate(chunks) if entry['gramm'] == 'PTCP']
poss['ptcp'] = ptcp
len(ptcp)
Out[28]:
In [37]:
# Indices of single-stem conjunction entries without '/' variants.
conj = []
for idx in range(len(chunks)):
    entry = chunks[idx]
    if (entry['gramm'] == 'CONJ'
            and len(entry['stem']) < 2
            and '/' not in ''.join(entry['stem'])):
        conj.append(idx)
poss['conj'] = conj
len(conj)
Out[37]:
In [38]:
# Indices of single-stem particle entries without '/' variants.
part = []
for idx, entry in enumerate(chunks):
    is_particle = entry['gramm'] == 'PART'
    single_plain_stem = (len(entry['stem']) < 2
                         and '/' not in ''.join(entry['stem']))
    if is_particle and single_plain_stem:
        part.append(idx)
poss['part'] = part
len(part)
Out[38]:
In [39]:
# Persist the collected POS index lists so later sessions/notebooks can
# reuse them without recomputing.
with open('misc.json', 'w') as f:
    json.dump(poss, f)
In [7]:
# Replace legacy Cyrillic letter forms in the lemma with the project's
# standard ones ('ң' -> 'ӈ', 'қ' -> 'ӄ').
# NOTE(review): only 'lex' is normalized, not the stems — confirm that is
# intentional.
for entry in chunks:
    entry['lex'] = entry['lex'].replace('ң', 'ӈ').replace('қ', 'ӄ')
In [41]:
# Create ../lexicons (if needed) and start the miscellaneous-POS lexc file
# with its first LEXICON header.
folder = 'lexicons'
# makedirs(..., exist_ok=True) avoids the check-then-create race of the
# previous os.path.exists + os.mkdir pair.
os.makedirs(os.path.join('..', folder), exist_ok=True)
dict_file = 'misc.lexc'
with open(os.path.join('..', folder, dict_file), 'w') as f:
    f.write("""
LEXICON Adverbs
""")
In [42]:
# Build the lexc entry lines for each collected POS section.
# Entry format: "lemma:lemma TAG ; ! Russian gloss".
# NOTE(review): indentation was lost in this notebook export; each section's
# trailing to_write.append('\n') is assumed to sit OUTSIDE its loop (one
# blank separator per section, matching the newline-wrapped headers) —
# confirm against the original notebook.
to_write = []
for i in poss['adv']:
    x = chunks[i]
    to_write.append('{0}:{0} ADV ; ! {1}'.format(x['lex'], x['trans_ru']))
to_write.append('\n')
to_write.append("""
LEXICON Interjections
""")
for i in poss['intj']:
    x = chunks[i]
    to_write.append('{0}:{0} INTJ ; ! {1}'.format(x['lex'], x['trans_ru']))
to_write.append('\n')
to_write.append("""
LEXICON Participles
""")
for i in poss['ptcp']:
    x = chunks[i]
    to_write.append('{0}:{0} PTCP ; ! {1}'.format(x['lex'], x['trans_ru']))
to_write.append('\n')
to_write.append("""
LEXICON Conjunctions
""")
for i in poss['conj']:
    x = chunks[i]
    to_write.append('{0}:{0} CONJ ; ! {1}'.format(x['lex'], x['trans_ru']))
to_write.append('\n')
to_write.append("""
LEXICON Particles
""")
for i in poss['part']:
    x = chunks[i]
    to_write.append('{0}:{0} PART ; ! {1}'.format(x['lex'], x['trans_ru']))
to_write.append('\n')
In [43]:
# Append the formatted POS sections to the lexc file whose header was
# written above.
with open(os.path.join('..', folder, dict_file), 'a') as f:
    f.write('\n'.join(to_write))
In [6]:
# Later addition: indices of single-stem adjective entries (gramm tag 'A')
# without '/' variants.
adjs = [
    idx
    for idx, entry in enumerate(chunks)
    if entry['gramm'] == 'A'
    and len(entry['stem']) < 2
    and '/' not in ''.join(entry['stem'])
]
len(adjs)
Out[6]:
In [8]:
# Build the Adjectives lexc section.
# NOTE(review): indentation was lost in this export; the trailing
# to_write.append('\n') is assumed to sit OUTSIDE the loop, matching the
# section pattern used for the other POS lists — confirm.
to_write = []
to_write.append("""
LEXICON Adjectives
""")
for i in adjs:
    x = chunks[i]
    to_write.append('{0}:{0} ADJ ; ! {1}'.format(x['lex'], x['trans_ru']))
to_write.append('\n')
In [9]:
# Append the adjective section to the shared misc.lexc lexicon file.
with open('../lexicons/misc.lexc', 'a') as f:
    f.write('\n')
    f.write('\n'.join(to_write))
In [ ]: