In [2]:
# Raw corpus that is read line by line to build the training data.
RAW_DATA_PATH = 'raw.dat'
# Per-character "token<TAB>tag" file written by the preprocessing cell and
# passed to crf_learn as its training input.
TRAIN_DATA_PATH = 'train.dat'
# Model file passed to crf_learn as its output and loaded by CRFPP.Tagger.
MODEL_PATH = 'model.dat'
In [3]:
# CRF++ feature template (unigram features only).  In CRF++ template syntax
# %x[row,col] selects column `col` of the token `row` positions away from the
# current one; column 0 holds the character itself in the training data.
# NOTE: the variable name is misspelled ("tempate"); it is kept because the
# cell that writes the template file refers to it by this name.
tempate = (
    '# Unigram\n'
    'U00:%x[-2,0]\n'           # character two positions back
    'U01:%x[-1,0]\n'           # previous character
    'U02:%x[0,0]\n'            # current character
    'U03:%x[1,0]\n'            # next character
    'U04:%x[2,0]\n'            # character two positions ahead
    'U05:%x[-1,0]/%x[0,0]\n'   # previous + current character
    'U06:%x[0,0]/%x[1,0]\n'    # current + next character
)
In [4]:
# Persist the feature template under the file name handed to crf_learn.
with open('template', 'w') as template_file:
    template_file.write(tempate)
In [17]:
def _tag_characters(text):
    """Label every non-whitespace character of *text* for CRF training.

    Tags, in order of precedence:

    * ``'B-S'`` -- the character is directly followed by whitespace
      (``segment`` inserts a space after characters tagged this way).
    * ``'I-W'`` -- the previously tagged character carries a ``*-W`` tag.
    * ``'B-S'`` -- the character is the very last one and is preceded by
      whitespace.
    * ``'B-W'`` -- anything else.

    Args:
        text: one decoded (unicode) line of the raw corpus.

    Returns:
        A list of ``(character, tag)`` tuples; whitespace is omitted.
    """
    # Whitespace characters collapse to '' so they can be tested for truth.
    chars = [char.strip() for char in text]
    last = len(chars) - 1
    tagged = []
    for pos, char in enumerate(chars):
        if not char:
            continue
        if pos < last and not chars[pos + 1]:
            tag = 'B-S'
        # Checking `tagged` first fixes an IndexError: when a line starts with
        # whitespace, the first real character has pos > 0 but no predecessor
        # has been tagged yet.
        elif tagged and 'W' in tagged[-1][1]:
            tag = 'I-W'
        elif pos == last and pos > 0 and not chars[pos - 1]:
            tag = 'B-S'
        else:
            tag = 'B-W'
        tagged.append((char, tag))
    return tagged


# Convert the raw corpus into CRF++ training data: one "char<TAB>tag" line
# per character, with an empty line closing each sentence.
with open(RAW_DATA_PATH, 'r') as raw_fp:
    with open(TRAIN_DATA_PATH, 'w') as train_fp:
        for line in raw_fp:
            # NOTE: the length is measured on the undecoded byte string, so
            # this skips blank lines and lines holding a single 1-byte
            # character (same threshold as before).
            if len(line.strip()) < 2:
                continue
            for token, tag in _tag_characters(line.decode('utf-8')):
                train_fp.write('%s\t%s\n' % (token.encode('utf-8'), tag))
            train_fp.write('\n')
In [18]:
# Preview the first five lines of the generated training data.
import sys
from itertools import islice

with open(TRAIN_DATA_PATH, 'r') as fp:
    # islice simply stops at end-of-file, so a training file shorter than
    # five lines no longer raises StopIteration.
    for preview_line in islice(fp, 5):
        # Lines keep their own trailing newline; write them verbatim.
        sys.stdout.write(preview_line)
In [7]:
import os

# Train the model with CRF++: `crf_learn -c 1 <template> <train> <model>`.
cmd = ' '.join(
    ['crf_learn', '-c', '1', 'template', TRAIN_DATA_PATH, MODEL_PATH]
)
print(cmd)
# Kept as the last expression so the notebook shows os.system's return value.
os.system(cmd)
Out[7]:
In [19]:
import CRFPP
def segment(sentence):
    """Insert word spacing into *sentence* using the trained CRF model.

    Args:
        sentence: unicode text to segment.  Whitespace characters are
            dropped before tagging, mirroring how the training data was
            built (whitespace was never emitted as a token).

    Returns:
        The remaining characters joined into one string, with a space
        appended after every character the model tags ``'B-S'``.  The
        result ends with a space when the final character gets that tag.
        An input with no non-whitespace characters yields ``''``.
    """
    # CRF++ splits every added line into columns on spaces/tabs, so a
    # whitespace-only token cannot be added; previously that left fewer
    # tagged tokens than characters and the index loop below ran past them.
    chars = [char for char in sentence if char.strip()]
    if not chars:
        return ''

    tagger = CRFPP.Tagger('-m %s -v 3 -n2' % MODEL_PATH)
    for char in chars:
        tagger.add(char.encode('utf-8'))
    tagger.parse()

    pieces = []
    for position, char in enumerate(chars):
        pieces.append(char)
        if tagger.y2(position) == 'B-S':
            pieces.append(' ')
    return ''.join(pieces)
In [9]:
# Smoke-test the model on an unspaced Korean phrase.
print(segment(u'유럽스타일양문형냉장고'))
In [ ]: