notebook.community

Edit and run



In [12]:

    
import logging
import re



In [51]:

    
# The opening and closing corner brackets as one unicode string
# (not referenced by any of the visible cells).
seq = u"「」"

# Sample text used by the cells below: a plain (non-unicode) literal that
# starts and ends with a newline.
mtxt = "\n如：「一則新聞」、「一則廣告」。\n"



In [52]:

    
class UnbalanceBrances(ValueError):
    u'''Raised by split_sentence when an opening bracket is never closed.

    No definition of this name is visible elsewhere in the notebook, so it is
    defined here to keep the cell runnable on a fresh kernel.
    '''


def split_sentence(s):
    u'''Split a unicode string into sentences, keeping bracketed quotes whole.

    A sentence ends after a full stop (。) or after a quotation introduced by
    a colon (：「…」), but only while no 「」/『』 bracket is still open, and only
    when the following text does not start with 、, 。 or 句下. Whatever
    remains after the last split is returned as the final sentence.

    Args:
        s: the unicode text to split.

    Returns:
        A list of sentence strings; concatenated they reproduce ``s``.

    Raises:
        UnbalanceBrances: if a bracket is still open at the end of ``s``.

    >>> [len(x) for x in split_sentence(u"架子，放置器物的木器。木架上分不同形狀的許多層小格，格內可放入各種器皿、用具。儒林外史˙第二十三回：「又走進去，大殿上{[8e50]}子倒的七橫八豎。」紅樓夢˙第八十五回：「麝月便走去在裡間屋裡{[8e50]}子上頭拿了來。」亦作「格子」、「{[8e50]}子」。")]
    [11, 28, 37, 37, 19]
    >>> [len(x) for x in split_sentence(u"栝樓的別名。見「栝樓」條。")]
    [6, 7]
    >>> [len(x) for x in split_sentence(u"如：「一則新聞」、「一則廣告」。")]
    [16]
    '''
    sentences = []
    snt = ''
    # Stack of closing brackets that are still expected.
    wait = []
    pairs = {
        u'「': u'」',
        u'『': u'』',
    }
    for i, c in enumerate(s):
        snt += c
        if c in pairs:
            wait.append(pairs[c])
        elif wait and wait[-1] == c:
            wait.pop()

        # Never split while a bracket is still open.
        if wait:
            continue

        at_boundary = (re.search(u'。$', snt)
                       or re.search(u'：「.*」$', snt))
        # At the very end of s the one-character slice is empty, and the empty
        # string is "in" every string, so no split happens here; the trailing
        # text is collected after the loop instead.
        followed_by_joiner = s[i + 1:i + 2] in u'、。'
        if at_boundary and not followed_by_joiner and s[i + 1:i + 3] != u'句下':
            sentences.append(snt)
            snt = ''

    if wait:
        logging.warning('unbalance brances: %s', s)
        raise UnbalanceBrances()
    if snt:
        sentences.append(snt)

    return sentences



In [53]:

    
print "\n".join(split_sentence(mtxt.decode("utf8")))









    



如：「一則新聞」、「一則廣告」。



In [69]:

    
m = re.findall(r"「(\W+)」", mtxt)
print m[0]









    



一則新聞」、「一則廣告



In [73]:

    
for ele in mtxt.split('「'):
    if re.search('」', ele):
        print ele.split('」')[0]









    



一則新聞
一則廣告



In [ ]: