In [3]:
from konlpy.tag import Twitter
# Smoke test: tag one sample sentence and print each token next to its tag.
result = Twitter().pos("나무위키 말뭉치를 만들어보자")
for word, tag in result:
    # Each entry is a (word, tag) pair; show it as "word tag".
    print(word + ' ' + tag)
In [9]:
def flat(content, pos_tagger=None):
    """Tag ``content`` and return its tokens as ``"word/tag"`` strings.

    Args:
        content: Text handed to the tagger's ``pos`` method.
        pos_tagger: Optional object exposing ``pos(text)`` that yields
            ``(word, tag)`` pairs. When omitted, the module-level ``tagger``
            is used, so that name must be defined before the first such call.

    Returns:
        A list with one ``"word/tag"`` string per pair produced by ``pos``,
        in the same order.
    """
    # The global is looked up at call time, so existing one-argument calls
    # behave exactly as before.
    active_tagger = tagger if pos_tagger is None else pos_tagger
    return ["{}/{}".format(word, tag) for word, tag in active_tagger.pos(content)]
In [13]:
# Run flat() on the sample sentence (the input keeps its leading space) and
# keep the resulting list of "word/tag" strings for the next cell.
tagged = flat(" 나무위키 말뭉치를 만들어보자")
In [14]:
# Preview the corpus line format: tagged tokens joined by single spaces.
' '.join(tagged)
Out[14]:
In [4]:
# Input corpus and tagged-output paths. Both live in the same directory and
# share a file stem; only the trailing suffix differs.
input_filename, output_filename = [
    '/Users/swkim/Data/namuwiki180326/_namuwiki_20180326_mini' + suffix
    for suffix in ('.txt', '_pos_tagged_corpus.txt')
]
In [5]:
# Shared tagger instance; flat() reads this module-level name when it is called,
# so this cell has to run before any cell that calls flat().
# NOTE(review): newer KoNLPy releases reportedly rename Twitter to Okt — confirm
# against the installed version before upgrading.
tagger = Twitter()
In [ ]:
# Tag the input corpus and write one tagged sentence per output line.
# Both files are managed by the same `with`, so the input handle is closed as
# well, even if tagging raises part-way through. The input is opened first so
# that a missing input file does not truncate an existing output file.
with open(input_filename, 'r', encoding='utf-8') as input_file, \
        open(output_filename, 'w', encoding='utf-8') as output_file:
    for line in input_file:
        # Simple sentence segmentation: split each line on '.'.
        for sentence in line.split('.'):
            tagged = flat(sentence)
            # Skip fragments that produce fewer than two tagged tokens.
            if len(tagged) > 1:
                a_line = ' '.join(tagged)
                output_file.write(a_line + '\n')
In [ ]: