Chinking is not that different from chunking: instead of describing what to include in a chunk, a chink rule describes what to remove from one.


In [1]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize
from pprint import pprint
%matplotlib notebook

In [2]:
# The same 2005 State of the Union file supplies both the tokenizer's
# training text and the text that is tokenized afterwards.
speech_fileid = '2005-GWBush.txt'
train_text = state_union.raw(speech_fileid)
sample_text = state_union.raw(speech_fileid)

In [3]:
# Build a Punkt sentence tokenizer from train_text, then use it to split
# sample_text into sentences (the next cell slices this result and
# word-tokenizes each entry).
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

In [4]:
# Word-tokenize the first three sentences. `words` ends up as a list with one
# entry per sentence, each entry being that sentence's list of token strings.
words = [word_tokenize(sentence) for sentence in tokenized[:3]]
print(words)


[['PRESIDENT', 'GEORGE', 'W.', 'BUSH', "'S", 'ADDRESS', 'BEFORE', 'A', 'JOINT', 'SESSION', 'OF', 'THE', 'CONGRESS', 'ON', 'THE', 'STATE', 'OF', 'THE', 'UNION', 'February', '2', ',', '2005', '9:10', 'P.M.', 'EST', 'THE', 'PRESIDENT', ':', 'Mr.', 'Speaker', ',', 'Vice', 'President', 'Cheney', ',', 'members', 'of', 'Congress', ',', 'fellow', 'citizens', ':', 'As', 'a', 'new', 'Congress', 'gathers', ',', 'all', 'of', 'us', 'in', 'the', 'elected', 'branches', 'of', 'government', 'share', 'a', 'great', 'privilege', ':', 'We', "'ve", 'been', 'placed', 'in', 'office', 'by', 'the', 'votes', 'of', 'the', 'people', 'we', 'serve', '.'], ['And', 'tonight', 'that', 'is', 'a', 'privilege', 'we', 'share', 'with', 'newly-elected', 'leaders', 'of', 'Afghanistan', ',', 'the', 'Palestinian', 'Territories', ',', 'Ukraine', ',', 'and', 'a', 'free', 'and', 'sovereign', 'Iraq', '.'], ['(', 'Applause', '.', ')']]

Chunking, then chinking


In [5]:
# Chinking grammar for a single "Chunk" rule:
#   {<.*>+}             first chunk every run of tagged tokens, whatever the tag;
#   }<VB.?|IN|DT|TO>+{  then chink (remove) runs of tokens tagged VB*, IN, DT or
#                       TO from those chunks, splitting a chunk where needed.
chunkGram = r"""Chunk: {<.*>+}
                            }<VB.?|IN|DT|TO>+{
                            """
# The grammar is the same for every sentence, so the parser is built once
# here instead of being rebuilt on each loop iteration.
chunkParser = nltk.RegexpParser(chunkGram)

for sentence_tokens in words[:3]:
    # Each entry of `words` is the token list of one sentence, which is the
    # input shape nltk.pos_tag expects.
    tagged = nltk.pos_tag(sentence_tokens)
    chunked = chunkParser.parse(tagged)

    # Show the resulting parse tree graphically, once per sentence.
    chunked.draw()