In [12]:
import logging
import re
In [51]:
# The opening and closing corner brackets, kept together as one unicode string.
seq = u'「」'

# Sample text: one line with a newline before and after it.  It is a plain
# (non-unicode) literal; the cell that splits it calls .decode("utf8") on it.
mtxt = (
    "\n"
    "如:「一則新聞」、「一則廣告」。"
    "\n"
)
In [52]:
try:
    # Reuse an UnbalanceBrances that another cell or module already provides.
    UnbalanceBrances
except NameError:
    class UnbalanceBrances(Exception):
        """Raised by split_sentence when a bracket is opened but never closed."""


def split_sentence(s):
    u'''Split a text into sentences, keeping bracketed quotations intact.

    The text is scanned one character at a time.  While a 「 or 『 is still
    waiting for its matching 」 or 』 nothing is split.  Outside brackets a
    sentence ends when the text collected so far ends with 。, or ends with
    a 」 that closes a quotation opened by :「 earlier on the same line.
    The split is suppressed when the next character is 、 or 。, or when the
    next two characters are 句下.

    A closing bracket that does not match the innermost open bracket is
    ignored.  Joining the returned pieces gives back the input unchanged.

    :param s: the text to split (a unicode string).
    :returns: list of sentences in order; empty for empty input.
    :raises UnbalanceBrances: if a bracket is still open at the end of s
        (a warning is logged first).

    >>> [len(x) for x in split_sentence(u"架子,放置器物的木器。木架上分不同形狀的許多層小格,格內可放入各種器皿、用具。儒林外史˙第二十三回:「又走進去,大殿上{[8e50]}子倒的七橫八豎。」紅樓夢˙第八十五回:「麝月便走去在裡間屋裡{[8e50]}子上頭拿了來。」亦作「格子」、「{[8e50]}子」。")]
    [11, 28, 37, 37, 19]
    >>> [len(x) for x in split_sentence(u"栝樓的別名。見「栝樓」條。")]
    [6, 7]
    >>> [len(x) for x in split_sentence(u"如:「一則新聞」、「一則廣告」。")]
    [16]
    '''
    pairs = {
        u'「': u'」',
        u'『': u'』',
    }
    sentences = []
    snt = ''
    wait = []  # closing brackets still owed, innermost last
    for i, c in enumerate(s):
        snt += c
        if c in pairs:
            wait.append(pairs[c])
        elif wait and wait[-1] == c:
            wait.pop()
        if wait:
            # Inside a quotation: never split, whatever the punctuation.
            continue
        at_boundary = (
            re.search(u'。$', snt)
            or re.search(u':「.*」$', snt)
        )
        # At the very end of the input `following` is the empty string, and
        # '' is "in" every string, so the last sentence is not emitted here;
        # the `if snt` after the loop flushes it instead.
        following = s[i + 1:i + 2]
        if (at_boundary
                and following not in u'、。'
                and s[i + 1:i + 3] != u'句下'):
            sentences.append(snt)
            snt = ''
    if wait:
        logging.warning('unbalance brances: %s', s)
        raise UnbalanceBrances
    if snt:
        sentences.append(snt)
    return sentences
In [53]:
# Show the sample one sentence per line.  split_sentence works on unicode
# text, so the sample is decoded only when it is still a byte string; text
# that is already unicode is passed through unchanged.
print("\n".join(split_sentence(
    mtxt.decode("utf8") if isinstance(mtxt, bytes) else mtxt
)))
In [69]:
# Collect every phrase quoted with corner brackets.  The lazy `.+?` stops at
# the nearest closing bracket, so each quotation is captured on its own
# instead of one match running from the first 「 to the last 」.  `.` with
# re.DOTALL accepts any character, so the pattern works the same way on byte
# strings and on unicode text, and a quotation may span a line break.
m = re.findall(r"「(.+?)」", mtxt, re.DOTALL)
print(m[0])
In [73]:
# Regex-free extraction of the quoted phrases: cut the text at every opening
# bracket and print what precedes the first closing bracket in each piece.
# The piece before the first 「 is skipped because it was never inside a
# quotation, so a stray 」 in it must not produce output.
for ele in mtxt.split('「')[1:]:
    quoted, closing = ele.partition('」')[:2]
    if closing:
        print(quoted)
In [ ]: