In [3]:
import pickle
import CaboCha
# Create the CaboCha parser instance that the parsing cell below calls via cabocha.parse().
cabocha = CaboCha.Parser()
In [13]:
# Parse the full text of neko.txt with CaboCha and save the result in lattice
# format to neko_lattice.txt.cabocha.
#
# The input is read inside its own `with` block so the handle is closed; the
# original opened it inside a list comprehension and never closed it.
with open('neko.txt', 'r') as source_file:
    neko = source_file.read()

tree = cabocha.parse(neko)

# The output file is opened only after parsing has succeeded, so a failure
# while reading or parsing no longer leaves a truncated, empty output file.
with open('neko_lattice.txt.cabocha', 'w') as f:
    f.write(tree.toString(CaboCha.FORMAT_LATTICE))
In [14]:
class Morph(object):
    """Plain record holding the four fields of one morpheme.

    Attributes:
        surface: value given as ``_surface`` (the token text in this notebook).
        base: value given as ``_base``.
        pos: value given as ``_pos``.
        pos1: value given as ``_pos1``.
    """

    def __init__(self, _surface, _base, _pos, _pos1):
        # Store the constructor arguments unchanged under their public names.
        self.surface, self.base = _surface, _base
        self.pos, self.pos1 = _pos, _pos1
# Build `sentence_list`: one list of Morph objects per sentence, read from the
# lattice-format file written earlier in this notebook.
sentence_list = []
object_list = []

# Read through a `with` block so the file handle is closed.
with open('neko_lattice.txt.cabocha') as lattice_file:
    lattice_lines = [raw.strip('\n') for raw in lattice_file]

# Keep only morpheme lines of the form "surface<TAB>comma-separated features".
# Skipped: blank lines, lines starting with '*' or ' ', and the EOS marker.
# The EOS check is an exact comparison on purpose: `x not in ('EOS')` is a
# substring test against the string 'EOS' (the parentheses do not make a
# tuple), which silently dropped every line starting with 'E', 'O' or 'S'.
sentences = [
    line.split('\t')
    for line in lattice_lines
    if line and line[0] not in ('*', ' ') and line != 'EOS'
]

surface = [row[0] for row in sentences]
attribute = [row[1].split(',') for row in sentences]

for token, features in zip(surface, attribute):
    # Morph takes (surface, base, pos, pos1): features[6] is passed as base,
    # features[0] as pos and features[1] as pos1.
    object_list.append(Morph(token, features[6], features[0], features[1]))
    # A sentence ends at the full stop. Compare with `==`: `token in ('。')`
    # is a substring test and would also be true for an empty surface.
    if token == '。':
        sentence_list.append(object_list)
        object_list = []

# NOTE(review): morphemes after the last '。' remain in `object_list` and are
# not appended to `sentence_list` (unchanged from the original behaviour).
str.strip() -> 引数を与えないと、文字列の前後にある空白文字(\n を含む)を取り除く
In [1]:
import xml.etree.ElementTree as ET
class Chunk(object):
    """One chunk of a parsed sentence.

    Attributes:
        morphs (list): Morph objects.
        dst (int): index number (presumably of the chunk this one depends on
            -- confirm against the code that builds Chunk objects).
        srcs (list): index numbers (presumably of the chunks that depend on
            this one -- confirm against the code that builds Chunk objects).
    """

    def __init__(self, _morphs, _dst, _srcs):
        # Store the constructor arguments unchanged under their public names.
        self.morphs, self.dst, self.srcs = _morphs, _dst, _srcs
# Load the XML document and keep its root element; the preview loop below iterates over `root`.
# Note: this rebinds `tree`, which an earlier cell used for the CaboCha parse result.
# NOTE(review): no cell visible in this part of the notebook writes 'neko_new.xml' -- confirm where it is generated.
tree = ET.parse('neko_new.xml')
root = tree.getroot()
In [21]:
# Preview: print the attribute (name, value) pairs of the first five <chunk>
# elements found under `root`.
count = 0  # stays 0 when the document contains no <chunk> element
for count, chunck in enumerate(root.iter('chunk'), start=1):
    print(chunck.items())
    if count == 5:
        break
In [39]: