In [2]:
# Working files used by the cells below, all relative to the notebook's directory.
RAW_DATA_PATH, TRAIN_DATA_PATH, MODEL_PATH = (
    'raw.dat',    # text read by the conversion cell
    'train.dat',  # per-character "token<TAB>tag" rows handed to crf_learn
    'model.dat',  # model file passed to crf_learn and to CRFPP.Tagger via -m
)

In [3]:
# Feature template text. The next cell writes it verbatim to the file
# `template`, which is later named on the `crf_learn` command line.
# NOTE(review): the entries look like CRF++ unigram macros (%x[row,col]) over
# a window of two characters on either side, plus two adjacent-pair features —
# confirm against the CRF++ template documentation.
# NOTE(review): the variable name is a typo of "template"; the cell that writes
# the file refers to it under this spelling, so rename both together.
tempate = '''# Unigram
U00:%x[-2,0]
U01:%x[-1,0]
U02:%x[0,0]
U03:%x[1,0]
U04:%x[2,0]
U05:%x[-1,0]/%x[0,0]
U06:%x[0,0]/%x[1,0]
'''

In [4]:
# Save the feature template under the file name used on the crf_learn command line.
with open('template', 'w') as template_fp:
    template_fp.write(tempate)

In [17]:
def tag_characters(text):
    """Label every non-whitespace character of ``text`` for CRF training.

    ``text`` (a unicode string) is split on whitespace into words. Within each
    word the last character is tagged ``'B-S'`` (a space follows it), the
    first character of a longer word ``'B-W'`` and the characters in between
    ``'I-W'``; a one-character word is therefore just ``'B-S'``.

    Leading whitespace and a missing trailing newline are both handled: the
    start and the end of ``text`` count as word boundaries.

    Returns a list of ``(character, tag)`` tuples in reading order. The list
    is empty when ``text`` holds no non-whitespace characters.
    """
    tagged = []
    for word in text.split():
        final_pos = len(word) - 1
        for pos, char in enumerate(word):
            if pos == final_pos:
                tag = 'B-S'
            elif pos == 0:
                tag = 'B-W'
            else:
                tag = 'I-W'
            tagged.append((char, tag))
    return tagged


# Convert the spaced raw text into one "character<TAB>tag" row per character,
# with a blank line closing the sequence built from each input line.
with open(RAW_DATA_PATH, 'r') as raw_fp, open(TRAIN_DATA_PATH, 'w') as train_fp:
    # Iterate the file object directly; xreadlines() is deprecated.
    for line in raw_fp:
        # Skip blank lines and lines whose stripped byte length is below two.
        if len(line.strip()) < 2:
            continue

        # Decode before tagging so a multi-byte character is one symbol,
        # then encode each character again for the byte-oriented output file.
        for char, tag in tag_characters(line.decode('utf-8')):
            train_fp.write('%s\t%s\n' % (char.encode('utf-8'), tag))
        train_fp.write('\n')

In [18]:
with open(TRAIN_DATA_PATH, 'r') as fp:
    lines = fp.xreadlines()
    iteration = 1
    while iteration < 6:
        print lines.next(),
        iteration += 1


1	B-W
%	B-S
시	B-W
스	I-W
템	B-S

In [7]:
import os

cmd = 'crf_learn -c 1 template %s %s'% (TRAIN_DATA_PATH, MODEL_PATH)
print cmd

os.system(cmd)


crf_learn -c 1 template train.dat crf.model
Out[7]:
0

In [19]:
import CRFPP

def segment(sentence):
    """Insert word spacing into ``sentence`` using the trained CRF model.

    Every non-whitespace character of ``sentence`` (a unicode string) is sent
    to a ``CRFPP.Tagger`` loaded from ``MODEL_PATH`` as one UTF-8 encoded
    token. Whitespace already present in the input is discarded first, since
    the training-data cell never emits whitespace tokens.

    Returns the characters joined back together, with a single space appended
    after each character the tagger labels ``'B-S'``. A ``'B-S'`` on the final
    character therefore leaves a trailing space. Input without any
    non-whitespace character yields ``''`` without loading the model.
    """
    # Keep only real characters; `strip()` is the same whitespace test the
    # training-data cell applies to each token.
    chars = [char for char in sentence if char.strip()]
    if not chars:
        return ''

    tagger = CRFPP.Tagger('-m  %s -v 3 -n2' % MODEL_PATH)
    for char in chars:
        tagger.add(char.encode('utf-8'))

    tagger.parse()

    pieces = []
    for row in range(len(chars)):
        # Column 0 of each row is the token that was added for it.
        pieces.append(tagger.x(row, 0).decode('utf-8'))
        if tagger.y2(row) == 'B-S':
            pieces.append(' ')

    return ''.join(pieces)

In [9]:
print segment(u'유럽스타일양문형냉장고')


유럽 스타일 양문 형 냉장고 

In [ ]: