not_only_nouns-checkpoint



In [1]:
import os, json
from pprint import pprint

In [2]:
# Load the raw, unparsed dictionary entries produced by the scraping step.
with open('dictionary_all_unparsed.json') as src:
    chunks = json.load(src)

In [3]:
# Sanity check: total number of dictionary entries loaded (6278 per Out[3]).
len(chunks)


Out[3]:
6278

In [4]:
# Normalise every entry in place: split the raw stem field on '|' into a
# list of stems, and trim stray punctuation/whitespace from each stem and
# from the lexeme.
for entry in chunks:
    entry['stem'] = [s.strip('.,;*()[]:&?! ') for s in entry['stem'].split('|')]
    entry['lex'] = entry['lex'].strip('.,;*()[]:&?! ')

In [24]:
# Accumulator: POS tag name -> list of indices into `chunks`,
# filled by the per-POS cells below and dumped to misc.json.
poss = dict()

In [35]:
# Collect indices of adverb entries that have a single stem containing
# no '/' alternation, i.e. the unambiguous ones.
advs = [
    i for i, entry in enumerate(chunks)
    if entry['gramm'] == 'ADV'
    and len(entry['stem']) < 2
    and '/' not in ''.join(entry['stem'])
]

poss['adv'] = advs
len(advs)


Out[35]:
548

In [36]:
# Collect indices of interjection entries with a single, '/'-free stem.
intjs = [
    i for i, entry in enumerate(chunks)
    if entry['gramm'] == 'INTJ'
    and len(entry['stem']) < 2
    and '/' not in ''.join(entry['stem'])
]

poss['intj'] = intjs
len(intjs)


Out[36]:
30

In [28]:
# Collect indices of all participle entries.
# NOTE(review): unlike the other POS cells, this one applies no
# single-stem / '/'-free filtering — presumably fine since only 2
# participles exist (Out[28]); confirm before tightening.
ptcp = [i for i, entry in enumerate(chunks) if entry['gramm'] == 'PTCP']

poss['ptcp'] = ptcp
len(ptcp)


Out[28]:
2

In [37]:
# Collect indices of conjunction entries with a single, '/'-free stem.
conj = [
    i for i, entry in enumerate(chunks)
    if entry['gramm'] == 'CONJ'
    and len(entry['stem']) < 2
    and '/' not in ''.join(entry['stem'])
]

poss['conj'] = conj
len(conj)


Out[37]:
19

In [38]:
# Collect indices of particle entries with a single, '/'-free stem.
part = [
    i for i, entry in enumerate(chunks)
    if entry['gramm'] == 'PART'
    and len(entry['stem']) < 2
    and '/' not in ''.join(entry['stem'])
]

poss['part'] = part
len(part)


Out[38]:
35

In [39]:
# Persist the collected POS index lists so later sessions can reuse them.
with open('misc.json', 'w') as out:
    json.dump(poss, out)

In [7]:
# Normalise lexeme orthography in place: replace the Kazakh-style letters
# ң and қ with their Uralic-alphabet counterparts ӈ and ӄ.
for entry in chunks:
    entry['lex'] = entry['lex'].replace('ң', 'ӈ').replace('қ', 'ӄ')

In [41]:
# Create the shared lexicon output folder (one level up) and start the
# misc.lexc file, truncating any previous version, with the Adverbs
# section header; entry lines are appended by the next cells.
folder = 'lexicons'
# exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
os.makedirs(os.path.join('..', folder), exist_ok=True)

dict_file = 'misc.lexc'
with open(os.path.join('..', folder, dict_file), 'w') as f:
    f.write("""
LEXICON Adverbs 

""")

In [42]:
# Assemble lexc entry lines for each closed POS class, in the same order
# the sections appear in the file.  The Adverbs banner was already written
# to misc.lexc by the previous cell, so its section carries no title here;
# every other section is preceded by a "LEXICON <Title>" banner.  Each
# entry line is "<lex>:<lex> <TAG> ;" with the Russian translation as an
# inline lexc comment, and each section ends with a blank-line separator.
to_write = []
for key, tag, title in [
    ('adv', 'ADV', None),
    ('intj', 'INTJ', 'Interjections'),
    ('ptcp', 'PTCP', 'Participles'),
    ('conj', 'CONJ', 'Conjunctions'),
    ('part', 'PART', 'Particles'),
]:
    if title is not None:
        to_write.append('\nLEXICON {0} \n\n'.format(title))
    for idx in poss[key]:
        entry = chunks[idx]
        to_write.append('{0}:{0} {1} ;    ! {2}'.format(entry['lex'], tag, entry['trans_ru']))
    to_write.append('\n')

In [43]:
# Append the assembled entry lines to the lexicon file started above.
with open(os.path.join('..', folder, dict_file), 'a') as out:
    out.write('\n'.join(to_write))

In [6]:
# Collect indices of adjective entries ('A') with a single, '/'-free stem,
# mirroring the per-POS cells above.
adjs = [
    i for i, entry in enumerate(chunks)
    if entry['gramm'] == 'A'
    and len(entry['stem']) < 2
    and '/' not in ''.join(entry['stem'])
]

len(adjs)


Out[6]:
602

In [8]:
# Build the Adjectives section: the "LEXICON Adjectives" banner followed by
# one lexc line per adjective entry, then a blank-line separator.
to_write = ['\nLEXICON Adjectives \n\n']
for idx in adjs:
    entry = chunks[idx]
    to_write.append('{0}:{0} ADJ ;    ! {1}'.format(entry['lex'], entry['trans_ru']))
to_write.append('\n')

In [9]:
# Append the Adjectives section to the existing lexicon file, preceded by
# a newline to separate it from the previously written sections.
with open('../lexicons/misc.lexc', 'a') as out:
    out.write('\n')
    out.write('\n'.join(to_write))

In [ ]: