NLTK experiments

based on NLTK with Python 3 for Natural Language Processing by Sentdex


In [1]:
import nltk
from nltk import tokenize

In [2]:
# TODO: we don't relly want to download packages each time when we lauch this script
# so it'll better to check somehow whether we have packages or not - or Download on demand

# nltk.download()

In [3]:
example = 'Hello Mr. Smith, how are you doing today? The weather is great, and Python is awesome. ' \
          'The sky is pinkish-blue. You shouldn\'t eat cardboard.'

In [4]:
tokenize.sent_tokenize(example)


Out[4]:
['Hello Mr. Smith, how are you doing today?',
 'The weather is great, and Python is awesome.',
 'The sky is pinkish-blue.',
 "You shouldn't eat cardboard."]

In [5]:
tokenize.word_tokenize(example)


Out[5]:
['Hello',
 'Mr.',
 'Smith',
 ',',
 'how',
 'are',
 'you',
 'doing',
 'today',
 '?',
 'The',
 'weather',
 'is',
 'great',
 ',',
 'and',
 'Python',
 'is',
 'awesome',
 '.',
 'The',
 'sky',
 'is',
 'pinkish-blue',
 '.',
 'You',
 'should',
 "n't",
 'eat',
 'cardboard',
 '.']

Stop words


In [6]:
from nltk import corpus, tokenize

In [7]:
example_sentence = 'This is a sample sentence, showing off the stop words filtration.'
stop_words = set(corpus.stopwords.words('english'))
words = tokenize.word_tokenize(example_sentence)
filtered_sentence = [w for w in words if w not in stop_words]
print(filtered_sentence)


['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']

Stemming


In [8]:
from nltk import stem, tokenize

In [9]:
ps = stem.PorterStemmer()
example_words = ['python', 'pythoner', 'pythoning', 'pythoned', 'pythonly', 'pythonic', 'pythonista']

In [10]:
['{} --> {}'.format(w, ps.stem(w)) for w in example_words]


Out[10]:
['python --> python',
 'pythoner --> python',
 'pythoning --> python',
 'pythoned --> python',
 'pythonly --> pythonli',
 'pythonic --> python',
 'pythonista --> pythonista']

In [11]:
example_text = 'It is important to by very pythonly while you are pythoning with python. '\
               'All pythoners have pythoned poorly at least once.'

In [12]:
['{} --> {}'.format(w, ps.stem(w)) for w in tokenize.word_tokenize(example_text)]


Out[12]:
['It --> It',
 'is --> is',
 'important --> import',
 'to --> to',
 'by --> by',
 'very --> veri',
 'pythonly --> pythonli',
 'while --> while',
 'you --> you',
 'are --> are',
 'pythoning --> python',
 'with --> with',
 'python --> python',
 '. --> .',
 'All --> all',
 'pythoners --> python',
 'have --> have',
 'pythoned --> python',
 'poorly --> poorli',
 'at --> at',
 'least --> least',
 'once --> onc',
 '. --> .']

Part of Speech Tagging

POS tag list:

CC  coordinating conjunction
CD  cardinal digit
DT  determiner
EX  existential there (like: "there is" ... think of it like "there exists")
FW  foreign word
IN  preposition/subordinating conjunction
JJ  adjective   'big'
JJR adjective, comparative  'bigger'
JJS adjective, superlative  'biggest'
LS  list marker 1)
MD  modal   could, will
NN  noun, singular 'desk'
NNS noun plural 'desks'
NNP proper noun, singular   'Harrison'
NNPS    proper noun, plural 'Americans'
PDT predeterminer   'all tdhe kids'
POS possessive ending   parent's
PRP personal pronoundß  I, he, she
PRP$    possessive pronoun  my, his, hers
RB  adverb  very, silently,
RBR adverb, comparative better
RBS adverb, superlative best
RP  particle    give up
TO  to  go 'to' the store.
UH  interjection    errrrrrrrm
VB  verb, base form take
VBD verb, past tense    took
VBG verb, gerund/present participle taking
VBN verb, past participle   taken
VBP verb, sing. present, non-3d take
VBZ verb, 3rd person sing. present  takes
WDT wh-determiner   which
WP  wh-pronoun  who, what
WP$ possessive wh-pronoun   whose
WRB wh-abverb   where, when

In [13]:
import nltk
from nltk import corpus, tokenize

In [14]:
train_text = corpus.state_union.raw('2005-GWBush.txt')
sample_text = corpus.state_union.raw('2006-GWBush.txt')

In [15]:
# Map tag to description, useful for annotations
tag_to_description = {
    'CC': 'coordinating conjunction',
    'CD': 'cardinal digit',
    'DT': 'determiner',
    'EX': 'existential there (like: "there is" ... think of it like "there exists")',
    'FW': 'foreign word',
    'IN': 'preposition/subordinating conjunction',
    'JJ': 'adjective	"big"',
    'JJR': 'adjective, comparative	"bigger"',
    'JJS': 'adjective, superlative	"biggest"',
    'LS': 'list marker	1)',
    'MD': 'modal	could, will',
    'NN': 'noun, singular "desk"',
    'NNS': 'noun plural	"desks"',
    'NNP': 'proper noun, singular	"Harrison"',
    'NNPS': 'proper noun, plural	"Americans"',
    'PDT': 'predeterminer	"all tdhe kids"',
    'POS': 'possessive ending	parent"s',
    'PRP': 'personal pronoundß	I, he, she',
    'PRP$': 'possessive pronoun	my, his, hers',
    'RB': 'adverb	very, silently,',
    'RBR': 'adverb, comparative	better',
    'RBS': 'adverb, superlative	best',
    'RP': 'particle	give up',
    'TO': 'to	go "to" the store.',
    'UH': 'interjection	errrrrrrrm',
    'VB': 'verb, base form	take',
    'VBD': 'verb, past tense	took',
    'VBG': 'verb, gerund/present participle	taking',
    'VBN': 'verb, past participle	taken',
    'VBP': 'verb, sing. present, non-3d	take',
    'VBZ': 'verb, 3rd person sing. present	takes',
    'WDT': 'wh-determiner	which',
    'WP': 'wh-pronoun	who, what',
    'WP$': 'possessive wh-pronoun	whose',
    'WRB': 'wh-abverb	where, when',
}

In [16]:
from collections import Counter
from operator import itemgetter, attrgetter


custom_sent_tokenizer = tokenize.PunktSentenceTokenizer(train_text)
tokenized_text = custom_sent_tokenizer.tokenize(sample_text)

total_counts = Counter()
for i in tokenized_text[:5]:
    words = nltk.word_tokenize(i)
    tagged = nltk.pos_tag(words)
    print('# Sentence:')
    print(i)
    print('# Words:')
    print(words)
    print('# Tagged:')
    print(tagged)
    counts = Counter(tag for word, tag in tagged)
    total_counts += counts
    print('\n')

total = sum(total_counts.values())
freq = dict((word, float(count) / total) for word, count in sorted(total_counts.items()))
print('# Counts:')
print('\n\n-----\n\n'.join(['{}\n[{}] {}'.format(f, tag, tag_to_description.get(tag, tag)) for tag, f in sorted(freq.items(), key=itemgetter(1), reverse=True)]))


# Sentence:
PRESIDENT GEORGE W. BUSH'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION
 
January 31, 2006

THE PRESIDENT: Thank you all.
# Words:
['PRESIDENT', 'GEORGE', 'W.', 'BUSH', "'S", 'ADDRESS', 'BEFORE', 'A', 'JOINT', 'SESSION', 'OF', 'THE', 'CONGRESS', 'ON', 'THE', 'STATE', 'OF', 'THE', 'UNION', 'January', '31', ',', '2006', 'THE', 'PRESIDENT', ':', 'Thank', 'you', 'all', '.']
# Tagged:
[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]


# Sentence:
Mr. Speaker, Vice President Cheney, members of Congress, members of the Supreme Court and diplomatic corps, distinguished guests, and fellow citizens: Today our nation lost a beloved, graceful, courageous woman who called America to its founding ideals and carried on a noble dream.
# Words:
['Mr.', 'Speaker', ',', 'Vice', 'President', 'Cheney', ',', 'members', 'of', 'Congress', ',', 'members', 'of', 'the', 'Supreme', 'Court', 'and', 'diplomatic', 'corps', ',', 'distinguished', 'guests', ',', 'and', 'fellow', 'citizens', ':', 'Today', 'our', 'nation', 'lost', 'a', 'beloved', ',', 'graceful', ',', 'courageous', 'woman', 'who', 'called', 'America', 'to', 'its', 'founding', 'ideals', 'and', 'carried', 'on', 'a', 'noble', 'dream', '.']
# Tagged:
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nation', 'NN'), ('lost', 'VBD'), ('a', 'DT'), ('beloved', 'VBN'), (',', ','), ('graceful', 'JJ'), (',', ','), ('courageous', 'JJ'), ('woman', 'NN'), ('who', 'WP'), ('called', 'VBD'), ('America', 'NNP'), ('to', 'TO'), ('its', 'PRP$'), ('founding', 'NN'), ('ideals', 'NNS'), ('and', 'CC'), ('carried', 'VBD'), ('on', 'IN'), ('a', 'DT'), ('noble', 'JJ'), ('dream', 'NN'), ('.', '.')]


# Sentence:
Tonight we are comforted by the hope of a glad reunion with the husband who was taken so long ago, and we are grateful for the good life of Coretta Scott King.
# Words:
['Tonight', 'we', 'are', 'comforted', 'by', 'the', 'hope', 'of', 'a', 'glad', 'reunion', 'with', 'the', 'husband', 'who', 'was', 'taken', 'so', 'long', 'ago', ',', 'and', 'we', 'are', 'grateful', 'for', 'the', 'good', 'life', 'of', 'Coretta', 'Scott', 'King', '.']
# Tagged:
[('Tonight', 'NN'), ('we', 'PRP'), ('are', 'VBP'), ('comforted', 'VBN'), ('by', 'IN'), ('the', 'DT'), ('hope', 'NN'), ('of', 'IN'), ('a', 'DT'), ('glad', 'JJ'), ('reunion', 'NN'), ('with', 'IN'), ('the', 'DT'), ('husband', 'NN'), ('who', 'WP'), ('was', 'VBD'), ('taken', 'VBN'), ('so', 'RB'), ('long', 'RB'), ('ago', 'RB'), (',', ','), ('and', 'CC'), ('we', 'PRP'), ('are', 'VBP'), ('grateful', 'JJ'), ('for', 'IN'), ('the', 'DT'), ('good', 'JJ'), ('life', 'NN'), ('of', 'IN'), ('Coretta', 'NNP'), ('Scott', 'NNP'), ('King', 'NNP'), ('.', '.')]


# Sentence:
(Applause.)
# Words:
['(', 'Applause', '.', ')']
# Tagged:
[('(', '('), ('Applause', 'NNP'), ('.', '.'), (')', ')')]


# Sentence:
President George W. Bush reacts to applause during his State of the Union Address at the Capitol, Tuesday, Jan.
# Words:
['President', 'George', 'W.', 'Bush', 'reacts', 'to', 'applause', 'during', 'his', 'State', 'of', 'the', 'Union', 'Address', 'at', 'the', 'Capitol', ',', 'Tuesday', ',', 'Jan', '.']
# Tagged:
[('President', 'NNP'), ('George', 'NNP'), ('W.', 'NNP'), ('Bush', 'NNP'), ('reacts', 'VBZ'), ('to', 'TO'), ('applause', 'VB'), ('during', 'IN'), ('his', 'PRP$'), ('State', 'NNP'), ('of', 'IN'), ('the', 'DT'), ('Union', 'NNP'), ('Address', 'NNP'), ('at', 'IN'), ('the', 'DT'), ('Capitol', 'NNP'), (',', ','), ('Tuesday', 'NNP'), (',', ','), ('Jan', 'NNP'), ('.', '.')]


# Counts:
0.29577464788732394
[NNP] proper noun, singular	"Harrison"

-----

0.09859154929577464
[IN] preposition/subordinating conjunction

-----

0.07746478873239436
[,] ,

-----

0.07042253521126761
[DT] determiner

-----

0.07042253521126761
[NN] noun, singular "desk"

-----

0.06338028169014084
[JJ] adjective	"big"

-----

0.035211267605633804
[.] .

-----

0.035211267605633804
[NNS] noun plural	"desks"

-----

0.028169014084507043
[CC] coordinating conjunction

-----

0.028169014084507043
[VBD] verb, past tense	took

-----

0.02112676056338028
[PRP] personal pronoundß	I, he, she

-----

0.02112676056338028
[PRP$] possessive pronoun	my, his, hers

-----

0.02112676056338028
[RB] adverb	very, silently,

-----

0.02112676056338028
[VBN] verb, past participle	taken

-----

0.014084507042253521
[:] :

-----

0.014084507042253521
[CD] cardinal digit

-----

0.014084507042253521
[TO] to	go "to" the store.

-----

0.014084507042253521
[VB] verb, base form	take

-----

0.014084507042253521
[VBP] verb, sing. present, non-3d	take

-----

0.014084507042253521
[WP] wh-pronoun	who, what

-----

0.007042253521126761
[(] (

-----

0.007042253521126761
[)] )

-----

0.007042253521126761
[POS] possessive ending	parent"s

-----

0.007042253521126761
[VBZ] verb, 3rd person sing. present	takes

Chunking


In [17]:
%matplotlib inline

In [18]:
import matplotlib as mpl
import matplotlib.pyplot as plt


chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
chunkParser = nltk.RegexpParser(chunkGram)

for i in tokenized_text[:5]:
    words = nltk.word_tokenize(i)
    tagged = nltk.pos_tag(words)
    chunked = chunkParser.parse(tagged)
    # TODO: should fix it
    # I'm using jupyter inside of Docker so maybe it is reason why doesn't work :(
    # I've found this one https://stackoverflow.com/questions/31779707/how-do-you-make-nltk-draw-trees-that-are-inline-in-ipython-jupyter
    # but haven't checkit it yet.
    chunked.draw()


---------------------------------------------------------------------------
TclError                                  Traceback (most recent call last)
<ipython-input-18-7f0485b269d0> in <module>()
     11     chunked = chunkParser.parse(tagged)
     12     # TODO: fix it
---> 13     chunked.draw()

/opt/conda/lib/python3.6/site-packages/nltk/tree.py in draw(self)
    688         """
    689         from nltk.draw.tree import draw_trees
--> 690         draw_trees(self)
    691 
    692     def pretty_print(self, sentence=None, highlight=(), stream=None, **kwargs):

/opt/conda/lib/python3.6/site-packages/nltk/draw/tree.py in draw_trees(*trees)
    861     :rtype: None
    862     """
--> 863     TreeView(*trees).mainloop()
    864     return
    865 

/opt/conda/lib/python3.6/site-packages/nltk/draw/tree.py in __init__(self, *trees)
    754         self._trees = trees
    755 
--> 756         self._top = Tk()
    757         self._top.title('NLTK')
    758         self._top.bind('<Control-x>', self.destroy)

/opt/conda/lib/python3.6/tkinter/__init__.py in __init__(self, screenName, baseName, className, useTk, sync, use)
   2015                 baseName = baseName + ext
   2016         interactive = 0
-> 2017         self.tk = _tkinter.create(screenName, baseName, className, interactive, wantobjects, useTk, sync, use)
   2018         if useTk:
   2019             self._loadtk()

TclError: no display name and no $DISPLAY environment variable