Named entity recognition


In [1]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize
from pprint import pprint
%matplotlib notebook

In [2]:
# The same State of the Union address serves both as the training text for
# the Punkt sentence tokenizer and as the text that gets analysed.
SPEECH_FILE = '2005-GWBush.txt'
train_text = state_union.raw(SPEECH_FILE)
sample_text = state_union.raw(SPEECH_FILE)

In [3]:
# Build a Punkt sentence tokenizer trained on train_text ...
custom_sent_tokenizer = PunktSentenceTokenizer(train_text=train_text)

# ... and use it to split sample_text into a list of sentence strings.
tokenized = custom_sent_tokenizer.tokenize(sample_text)

In [4]:
# Word-tokenize the first three sentences. Despite its name, `words` is a
# list of token lists (one inner list per sentence), not a flat list of words.
words = [word_tokenize(sentence) for sentence in tokenized[:3]]
print(words)


[['PRESIDENT', 'GEORGE', 'W.', 'BUSH', "'S", 'ADDRESS', 'BEFORE', 'A', 'JOINT', 'SESSION', 'OF', 'THE', 'CONGRESS', 'ON', 'THE', 'STATE', 'OF', 'THE', 'UNION', 'February', '2', ',', '2005', '9:10', 'P.M.', 'EST', 'THE', 'PRESIDENT', ':', 'Mr.', 'Speaker', ',', 'Vice', 'President', 'Cheney', ',', 'members', 'of', 'Congress', ',', 'fellow', 'citizens', ':', 'As', 'a', 'new', 'Congress', 'gathers', ',', 'all', 'of', 'us', 'in', 'the', 'elected', 'branches', 'of', 'government', 'share', 'a', 'great', 'privilege', ':', 'We', "'ve", 'been', 'placed', 'in', 'office', 'by', 'the', 'votes', 'of', 'the', 'people', 'we', 'serve', '.'], ['And', 'tonight', 'that', 'is', 'a', 'privilege', 'we', 'share', 'with', 'newly-elected', 'leaders', 'of', 'Afghanistan', ',', 'the', 'Palestinian', 'Territories', ',', 'Ukraine', ',', 'and', 'a', 'free', 'and', 'sovereign', 'Iraq', '.'], ['(', 'Applause', '.', ')']]

Doing what we do best: part-of-speech tagging each tokenized sentence and chunking its named entities


In [5]:
# Each element of `words` is the token list of one whole sentence, so the loop
# variable is a list of tokens rather than a single word.
for sentence_tokens in words[:3]:
    # Part-of-speech tag the tokens; ne_chunk works on the tagged tokens.
    tagged = nltk.pos_tag(sentence_tokens)

    # Chunk the tagged tokens into named entities. By default each entity is
    # labelled with its type (organization, time, date, money, percent,
    # facility, etc.). To get named entity recognition without specifying the
    # type, use instead:
    #     named_entity = nltk.ne_chunk(tagged, binary=True)
    named_entity = nltk.ne_chunk(tagged)

    # NOTE: draw() opens the tree in a separate GUI window (one per sentence)
    # instead of rendering inline in the notebook, so it needs a display.
    named_entity.draw()