In [1]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize
from pprint import pprint

In [2]:
# Read the raw text of the 2005 State of the Union transcript twice: once as
# the training text for the sentence tokenizer, once as the text to segment.
# NOTE(review): both names load the same file, so the tokenizer is trained on
# the very document it is later applied to — confirm this is intentional.
train_text = state_union.raw(fileids='2005-GWBush.txt')
sample_text = state_union.raw(fileids='2005-GWBush.txt')

In [3]:
# Build a Punkt sentence tokenizer, handing the training text to its constructor.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text=train_text)

In [4]:
# Split the sample text into a list of sentence strings, then preview only
# the first three so the cell output stays short.
tokenized = custom_sent_tokenizer.tokenize(text=sample_text)
print(tokenized[0:3])


["PRESIDENT GEORGE W. BUSH'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION\n \nFebruary 2, 2005\n\n\n9:10 P.M. EST \n\nTHE PRESIDENT: Mr. Speaker, Vice President Cheney, members of Congress, fellow citizens: \n\nAs a new Congress gathers, all of us in the elected branches of government share a great privilege: We've been placed in office by the votes of the people we serve.", 'And tonight that is a privilege we share with newly-elected leaders of Afghanistan, the Palestinian Territories, Ukraine, and a free and sovereign Iraq.', '(Applause.)']

In [5]:
# Word-tokenize each of the first three sentences.
# `words` is a nested list: one inner list of token strings per sentence,
# not a flat list of words.
words = [word_tokenize(sentence) for sentence in tokenized[:3]]
print(words)


[['PRESIDENT', 'GEORGE', 'W.', 'BUSH', "'S", 'ADDRESS', 'BEFORE', 'A', 'JOINT', 'SESSION', 'OF', 'THE', 'CONGRESS', 'ON', 'THE', 'STATE', 'OF', 'THE', 'UNION', 'February', '2', ',', '2005', '9:10', 'P.M.', 'EST', 'THE', 'PRESIDENT', ':', 'Mr.', 'Speaker', ',', 'Vice', 'President', 'Cheney', ',', 'members', 'of', 'Congress', ',', 'fellow', 'citizens', ':', 'As', 'a', 'new', 'Congress', 'gathers', ',', 'all', 'of', 'us', 'in', 'the', 'elected', 'branches', 'of', 'government', 'share', 'a', 'great', 'privilege', ':', 'We', "'ve", 'been', 'placed', 'in', 'office', 'by', 'the', 'votes', 'of', 'the', 'people', 'we', 'serve', '.'], ['And', 'tonight', 'that', 'is', 'a', 'privilege', 'we', 'share', 'with', 'newly-elected', 'leaders', 'of', 'Afghanistan', ',', 'the', 'Palestinian', 'Territories', ',', 'Ukraine', ',', 'and', 'a', 'free', 'and', 'sovereign', 'Iraq', '.'], ['(', 'Applause', '.', ')']]

Part-of-Speech (POS) Tagging


In [6]:
# Tag every tokenized sentence with its part-of-speech labels.
# Each element of `words` is a whole sentence's token list, so the batch API
# `pos_tag_sents` is used: it takes the list of token lists in a single call
# instead of invoking `pos_tag` separately for each sentence.
# Print one tagged sentence (a list of (token, tag) pairs) per line.
for tagged_sentence in nltk.pos_tag_sents(words):
    print(tagged_sentence)


[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('February', 'NNP'), ('2', 'CD'), (',', ','), ('2005', 'CD'), ('9:10', 'CD'), ('P.M.', 'NNP'), ('EST', 'NNP'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('As', 'IN'), ('a', 'DT'), ('new', 'JJ'), ('Congress', 'NNP'), ('gathers', 'NNS'), (',', ','), ('all', 'DT'), ('of', 'IN'), ('us', 'PRP'), ('in', 'IN'), ('the', 'DT'), ('elected', 'JJ'), ('branches', 'NNS'), ('of', 'IN'), ('government', 'NN'), ('share', 'NN'), ('a', 'DT'), ('great', 'JJ'), ('privilege', 'NN'), (':', ':'), ('We', 'PRP'), ("'ve", 'VBP'), ('been', 'VBN'), ('placed', 'VBN'), ('in', 'IN'), ('office', 'NN'), ('by', 'IN'), ('the', 'DT'), ('votes', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('people', 'NNS'), ('we', 'PRP'), ('serve', 'VBP'), ('.', '.')]
[('And', 'CC'), ('tonight', 'NN'), ('that', 'WDT'), ('is', 'VBZ'), ('a', 'DT'), ('privilege', 'NN'), ('we', 'PRP'), ('share', 'NN'), ('with', 'IN'), ('newly-elected', 'JJ'), ('leaders', 'NNS'), ('of', 'IN'), ('Afghanistan', 'NNP'), (',', ','), ('the', 'DT'), ('Palestinian', 'JJ'), ('Territories', 'NNP'), (',', ','), ('Ukraine', 'NNP'), (',', ','), ('and', 'CC'), ('a', 'DT'), ('free', 'JJ'), ('and', 'CC'), ('sovereign', 'JJ'), ('Iraq', 'NNP'), ('.', '.')]
[('(', '('), ('Applause', 'NNP'), ('.', '.'), (')', ')')]