not_only_nouns-checkpoint



In [1]:
import os, json
from pprint import pprint

In [2]:
# Load the raw, unparsed dictionary entries produced by the scraping step.
with open('dictionary_all_unparsed.json') as src:
    chunks = json.load(src)

In [3]:
# Sanity check: total number of dictionary entries loaded (6278 per Out[3]).
len(chunks)


Out[3]:
6278

In [4]:
# Normalise every entry in place: split the raw stem field on '|' into a
# list of stems, and trim stray punctuation/whitespace from each stem and
# from the lexeme.
for entry in chunks:
    entry['stem'] = [s.strip('.,;*()[]:&?! ') for s in entry['stem'].split('|')]
    entry['lex'] = entry['lex'].strip('.,;*()[]:&?! ')

In [24]:
# Accumulator: POS tag name -> list of indices into `chunks`,
# filled by the per-POS cells below and dumped to misc.json.
poss = dict()

In [35]:
# Collect indices of adverb entries that have a single stem containing
# no '/' alternation, i.e. the unambiguous ones.
advs = [
    i for i, entry in enumerate(chunks)
    if entry['gramm'] == 'ADV'
    and len(entry['stem']) < 2
    and '/' not in ''.join(entry['stem'])
]

poss['adv'] = advs
len(advs)


Out[35]:
548

In [36]:
# Collect indices of interjection entries with a single, '/'-free stem.
intjs = [
    i for i, entry in enumerate(chunks)
    if entry['gramm'] == 'INTJ'
    and len(entry['stem']) < 2
    and '/' not in ''.join(entry['stem'])
]

poss['intj'] = intjs
len(intjs)


Out[36]:
30

In [28]:
# Collect indices of all participle entries.
# NOTE(review): unlike the other POS cells, this one applies no
# single-stem / '/'-free filtering — presumably fine since only 2
# participles exist (Out[28]); confirm before tightening.
ptcp = [i for i, entry in enumerate(chunks) if entry['gramm'] == 'PTCP']

poss['ptcp'] = ptcp
len(ptcp)


Out[28]:
2

In [37]:
# Collect indices of conjunction entries with a single, '/'-free stem.
conj = [
    i for i, entry in enumerate(chunks)
    if entry['gramm'] == 'CONJ'
    and len(entry['stem']) < 2
    and '/' not in ''.join(entry['stem'])
]

poss['conj'] = conj
len(conj)


Out[37]:
19

In [38]:
# Collect indices of particle entries with a single, '/'-free stem.
part = [
    i for i, entry in enumerate(chunks)
    if entry['gramm'] == 'PART'
    and len(entry['stem']) < 2
    and '/' not in ''.join(entry['stem'])
]

poss['part'] = part
len(part)


Out[38]:
35

In [39]:
# Persist the collected POS index lists so later sessions can reuse them.
with open('misc.json', 'w') as out:
    json.dump(poss, out)

In [7]:
# Normalise lexeme orthography in place: replace the Kazakh-style letters
# ң and қ with their Uralic-alphabet counterparts ӈ and ӄ.
for entry in chunks:
    entry['lex'] = entry['lex'].replace('ң', 'ӈ').replace('қ', 'ӄ')

In [41]:
# Create the shared lexicon output folder (one level up) and start the
# misc.lexc file, truncating any previous version, with the Adverbs
# section header; entry lines are appended by the next cells.
folder = 'lexicons'
# exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
os.makedirs(os.path.join('..', folder), exist_ok=True)

dict_file = 'misc.lexc'
with open(os.path.join('..', folder, dict_file), 'w') as f:
    f.write("""
LEXICON Adverbs 

""")

In [42]:
# Assemble lexc entry lines for each closed POS class, in the same order
# the sections appear in the file.  The Adverbs banner was already written
# to misc.lexc by the previous cell, so its section carries no title here;
# every other section is preceded by a "LEXICON <Title>" banner.  Each
# entry line is "<lex>:<lex> <TAG> ;" with the Russian translation as an
# inline lexc comment, and each section ends with a blank-line separator.
to_write = []
for key, tag, title in [
    ('adv', 'ADV', None),
    ('intj', 'INTJ', 'Interjections'),
    ('ptcp', 'PTCP', 'Participles'),
    ('conj', 'CONJ', 'Conjunctions'),
    ('part', 'PART', 'Particles'),
]:
    if title is not None:
        to_write.append('\nLEXICON {0} \n\n'.format(title))
    for idx in poss[key]:
        entry = chunks[idx]
        to_write.append('{0}:{0} {1} ;    ! {2}'.format(entry['lex'], tag, entry['trans_ru']))
    to_write.append('\n')

In [43]:
# Append the assembled entry lines to the lexicon file started above.
with open(os.path.join('..', folder, dict_file), 'a') as out:
    out.write('\n'.join(to_write))

In [6]:
# Collect indices of adjective entries ('A') with a single, '/'-free stem,
# mirroring the per-POS cells above.
adjs = [
    i for i, entry in enumerate(chunks)
    if entry['gramm'] == 'A'
    and len(entry['stem']) < 2
    and '/' not in ''.join(entry['stem'])
]

len(adjs)


Out[6]:
602

In [8]:
# Build the Adjectives section: the "LEXICON Adjectives" banner followed by
# one lexc line per adjective entry, then a blank-line separator.
to_write = ['\nLEXICON Adjectives \n\n']
for idx in adjs:
    entry = chunks[idx]
    to_write.append('{0}:{0} ADJ ;    ! {1}'.format(entry['lex'], entry['trans_ru']))
to_write.append('\n')

In [9]:
# Append the Adjectives section to the existing lexicon file, preceded by
# a newline to separate it from the previously written sections.
with open('../lexicons/misc.lexc', 'a') as out:
    out.write('\n')
    out.write('\n'.join(to_write))

In [ ]: