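Chinking
Chinking is not that different from chunking: instead of selecting the tokens to keep in a chunk, you specify the tokens to remove from it.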
In [1]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize
from pprint import pprint
%matplotlib notebook
In [2]:
# Both the tokenizer's training text and the text to be split come from the
# same State of the Union file, so its file id is named once.
SPEECH_FILEID = '2005-GWBush.txt'
train_text = state_union.raw(SPEECH_FILEID)
sample_text = state_union.raw(SPEECH_FILEID)
In [3]:
# Train a Punkt sentence tokenizer on train_text, then use it to split
# sample_text into sentences (sliced and word-tokenized by the next cell).
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)
In [4]:
# Split each of the first three sentences into word tokens, giving one
# token list per sentence, then show the nested result.
words = [word_tokenize(sent) for sent in tokenized[:3]]
print(words)
In [5]:
# Chinking grammar: first chunk every tagged token ({<.*>+}), then carve out
# ("chink") runs of verbs, prepositions, determiners and "to" (}<...>+{).
# The grammar and the parser do not depend on the sentence being processed,
# so they are built once here instead of on every loop iteration.
chunkGram = r"""Chunk: {<.*>+}
}<VB.?|IN|DT|TO>+{
"""
## making a custom parser for chinking
chunkParser = nltk.RegexpParser(chunkGram)

# Each item of `words` is the token list of one sentence, not a single word.
for sentence_tokens in words[:3]:
    tagged = nltk.pos_tag(sentence_tokens)
    chunked = chunkParser.parse(tagged)
    # NOTE(review): Tree.draw() is expected to open a separate GUI window per
    # sentence rather than render inline — confirm a display is available.
    chunked.draw()