In [6]:
# --------------------------------------------------------------------------------------- +
# NLTK = Natural Language Toolkit
# --------------------------------------------------------------------------------------- +
# https://www.nltk.org/index.html
# https://www.nltk.org/book
# Ref. Coursera tutorial: Applied Text Mining in Python, Univ. of Michigan.
# --------------------------------------------------------------------------------------- +
import nltk

In [68]:
# --------------------------------------------------------------------------------------- +
# export NLTK_DATA=
# --------------------------------------------------------------------------------------- +
nltk.download()


showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
Out[68]:
True
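
The interactive downloader opens a chooser GUI; in a headless session the same data can be fetched by collection id instead. A minimal sketch using the standard 'book' collection:

# sketch: non-interactive download of everything nltk.book needs
nltk.download('book')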

In [8]:
from nltk.book import *

In [20]:
text7


Out[20]:
<Text: Wall Street Journal>

In [21]:
sents()


sent1: Call me Ishmael .
sent2: The family of Dashwood had long been settled in Sussex .
sent3: In the beginning God created the heaven and the earth .
sent4: Fellow - Citizens of the Senate and of the House of Representatives :
sent5: I have a problem with people PMing me to lol JOIN
sent6: SCENE 1 : [ wind ] [ clop clop clop ] KING ARTHUR : Whoa there !
sent7: Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 .
sent8: 25 SEXY MALE , seeks attrac older single lady , for discreet encounters .
sent9: THE suburb of Saffron Park lay on the sunset side of London , as red and ragged as a cloud of sunset .

In [22]:
sent1


Out[22]:
['Call', 'me', 'Ishmael', '.']

In [10]:
# --------------------------------------------------------------------------------------- +
# Brown Corpus
# https://en.wikipedia.org/wiki/Brown_Corpus
# https://en.wikipedia.org/wiki/Text_corpus
# --------------------------------------------------------------------------------------- +

from nltk.corpus import brown

In [11]:
# https://www.nltk.org/book/ch02.html

brown.categories()


Out[11]:
['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [12]:
brown.words(categories='fiction')


Out[12]:
['Thirty-three', 'Scotty', 'did', 'not', 'go', 'back', ...]

In [13]:
brown.words(categories=['fiction', 'news'])


Out[13]:
['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]
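
The Brown corpus reader also exposes sentence-level and POS-tagged views alongside words(). A minimal sketch using standard corpus-reader methods:

# sketch: other views of the same corpus
brown.sents(categories='news')[0]           # first news sentence, as a token list
brown.tagged_words(categories='news')[:5]   # (word, tag) pairs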

In [14]:
# --------------------------------------------------------------------------------------- +
# Inaugural Corpus
# --------------------------------------------------------------------------------------- +

from nltk.corpus import inaugural
inaugural.fileids()


Out[14]:
['1789-Washington.txt',
 '1793-Washington.txt',
 '1797-Adams.txt',
 '1801-Jefferson.txt',
 '1805-Jefferson.txt',
 '1809-Madison.txt',
 '1813-Madison.txt',
 '1817-Monroe.txt',
 '1821-Monroe.txt',
 '1825-Adams.txt',
 '1829-Jackson.txt',
 '1833-Jackson.txt',
 '1837-VanBuren.txt',
 '1841-Harrison.txt',
 '1845-Polk.txt',
 '1849-Taylor.txt',
 '1853-Pierce.txt',
 '1857-Buchanan.txt',
 '1861-Lincoln.txt',
 '1865-Lincoln.txt',
 '1869-Grant.txt',
 '1873-Grant.txt',
 '1877-Hayes.txt',
 '1881-Garfield.txt',
 '1885-Cleveland.txt',
 '1889-Harrison.txt',
 '1893-Cleveland.txt',
 '1897-McKinley.txt',
 '1901-McKinley.txt',
 '1905-Roosevelt.txt',
 '1909-Taft.txt',
 '1913-Wilson.txt',
 '1917-Wilson.txt',
 '1921-Harding.txt',
 '1925-Coolidge.txt',
 '1929-Hoover.txt',
 '1933-Roosevelt.txt',
 '1937-Roosevelt.txt',
 '1941-Roosevelt.txt',
 '1945-Roosevelt.txt',
 '1949-Truman.txt',
 '1953-Eisenhower.txt',
 '1957-Eisenhower.txt',
 '1961-Kennedy.txt',
 '1965-Johnson.txt',
 '1969-Nixon.txt',
 '1973-Nixon.txt',
 '1977-Carter.txt',
 '1981-Reagan.txt',
 '1985-Reagan.txt',
 '1989-Bush.txt',
 '1993-Clinton.txt',
 '1997-Clinton.txt',
 '2001-Bush.txt',
 '2005-Bush.txt',
 '2009-Obama.txt']

In [15]:
# --------------------------------------------------------------------------------------- +
## 1) CONDITIONAL FREQUENCY DISTRIBUTION
# --------------------------------------------------------------------------------------- +

cfd = nltk.ConditionalFreqDist(
	(target, fileid[:4])
	for fileid in inaugural.fileids()
	for w in inaugural.words(fileid)
	for target in ['america', 'citizen']
	if w.lower().startswith(target))
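
Before plotting, the distribution can be inspected directly. A minimal sketch querying the cfd just built (conditions are 'america'/'citizen', samples are the 4-digit years):

# sketch: query the ConditionalFreqDist directly
cfd.conditions()                                        # ['america', 'citizen']
cfd['america']['2009']                                  # count in the 2009 address
cfd.tabulate(samples=['1789', '1861', '1961', '2009'])  # compact text table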

In [19]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(30,20))
cfd.plot()


[plot: counts of words beginning with 'america' and 'citizen' in each inaugural address, by year]
In [23]:
# --------------------------------------------------------------------------------------- +
## 2) COUNTING VOCABULARY OF WORDS
# --------------------------------------------------------------------------------------- +

text7


Out[23]:
<Text: Wall Street Journal>

In [24]:
sent7


Out[24]:
['Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov.',
 '29',
 '.']

In [25]:
len(sent7)


Out[25]:
18

In [26]:
len(text7)


Out[26]:
100676

In [27]:
# --------------------------------------------------------------------------------------- +
## 3) UNIQUE WORDS, FREQUENCY OF WORDS
# --------------------------------------------------------------------------------------- +

len(set(text7))


Out[27]:
12408
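
A common derived statistic is lexical diversity, the ratio of unique tokens to total tokens. A minimal sketch:

# sketch: lexical diversity of text7
len(set(text7)) / len(text7)   # 12408 / 100676 ~= 0.12, i.e. each type appears ~8 times on average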

In [28]:
# ten of the unique words (set order is arbitrary, so this sample varies between runs)

list(set(text7))[:10]


Out[28]:
['NIH-appointed',
 '130.6',
 'crookery',
 'comparison',
 'District',
 '5,699',
 'scholars',
 'pins',
 'markup',
 'Alstyne']

In [29]:
# frequency of words
dist = FreqDist(text7)
len(dist)


Out[29]:
12408

In [37]:
# words longer than 5 characters that occur more than 100 times
vocab1 = dist.keys()
freqwords = [w for w in vocab1 if len(w) > 5 and dist[w] > 100]
freqwords


Out[37]:
['billion',
 'company',
 'president',
 'because',
 'market',
 'million',
 'shares',
 'trading',
 'program']
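
FreqDist can also report the most frequent tokens directly via most_common(). A minimal sketch (the top entries are dominated by punctuation and stopwords):

# sketch: raw top-10 tokens by count, as (token, count) pairs
dist.most_common(10)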

In [38]:
# --------------------------------------------------------------------------------------- +
## 4) NORMALIZATION and STEMMING
# --------------------------------------------------------------------------------------- +

# - different forms of the same word
# - normalization
input1 = "List listed lists listing litings"
words1 = input1.lower().split(' ')
words1


Out[38]:
['list', 'listed', 'lists', 'listing', 'litings']

In [41]:
# - stemming: reduce each word to its root form (the misspelled 'litings' stems to the non-word 'lite')

porter = nltk.PorterStemmer()
[porter.stem(t) for t in words1]


Out[41]:
['list', 'list', 'list', 'list', 'lite']
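
Porter is only one of the stemmers shipped with NLTK; the Snowball ("Porter2") stemmer is a common alternative. A minimal sketch:

# sketch: compare with the Snowball stemmer
snowball = nltk.SnowballStemmer('english')
[snowball.stem(t) for t in words1]   # largely agrees with Porter on simple inflections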

In [42]:
# --------------------------------------------------------------------------------------- +
## 5) LEMMATIZATION
# --------------------------------------------------------------------------------------- +

# - like stemming, but the output is a valid dictionary word
# - udhr = Universal Declaration of Human Rights

udhr = nltk.corpus.udhr.words('English-Latin1')
udhr[:20]


Out[42]:
['Universal',
 'Declaration',
 'of',
 'Human',
 'Rights',
 'Preamble',
 'Whereas',
 'recognition',
 'of',
 'the',
 'inherent',
 'dignity',
 'and',
 'of',
 'the',
 'equal',
 'and',
 'inalienable',
 'rights',
 'of']

In [43]:
[porter.stem(t) for t in udhr[:20]]


Out[43]:
['univers',
 'declar',
 'of',
 'human',
 'right',
 'preambl',
 'wherea',
 'recognit',
 'of',
 'the',
 'inher',
 'digniti',
 'and',
 'of',
 'the',
 'equal',
 'and',
 'inalien',
 'right',
 'of']

In [45]:
# Lemmatization : Stemming but resulting stems are all valid words
WNlemma = nltk.WordNetLemmatizer()
[WNlemma.lemmatize(t) for t in udhr[:20]]


Out[45]:
['Universal',
 'Declaration',
 'of',
 'Human',
 'Rights',
 'Preamble',
 'Whereas',
 'recognition',
 'of',
 'the',
 'inherent',
 'dignity',
 'and',
 'of',
 'the',
 'equal',
 'and',
 'inalienable',
 'right',
 'of']
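
By default lemmatize() treats every token as a noun, which is why 'Whereas' and 'inherent' pass through unchanged while 'rights' becomes 'right'. A minimal sketch of the pos parameter:

# sketch: the pos argument changes the WordNet lookup
WNlemma.lemmatize('rights')            # 'right' (noun is the default pos)
WNlemma.lemmatize('listed', pos='v')   # 'list'  (treated as a verb)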

In [46]:
# --------------------------------------------------------------------------------------- +
## 6) Tokenization - splitting a sentence into words or tokens
# --------------------------------------------------------------------------------------- +

text11 = "Children shouldn't drink a sugary drink before bed."
text11.split(' ')


Out[46]:
['Children', "shouldn't", 'drink', 'a', 'sugary', 'drink', 'before', 'bed.']

In [47]:
# 8 tokens > 'bed.' still has the period attached

# use NLTK in-built tokenizer
nltk.word_tokenize(text11)


Out[47]:
['Children',
 'should',
 "n't",
 'drink',
 'a',
 'sugary',
 'drink',
 'before',
 'bed',
 '.']

In [48]:
# ... the period is now its own token
# ... "n't" is split out > represents negation > a useful feature
# --------------------------------------------------------------------------------------- +
## 7) Sentence Splitting / boundaries
# --------------------------------------------------------------------------------------- +

text12 = "This is the first sentence. A gallon of milk in the U.S. costs $2.99. Is this the third sentence? Yes, it is!"

sentences = nltk.sent_tokenize(text12)
len(sentences)


Out[48]:
4

In [49]:
sentences


Out[49]:
['This is the first sentence.',
 'A gallon of milk in the U.S. costs $2.99.',
 'Is this the third sentence?',
 'Yes, it is!']

In [50]:
# end of basic text processing. Ref. Coursera Applied Text Mining in Python @ Univ. of Michigan

In [52]:
# --------------------------------------------------------------------------------------- +
## 8) Part-of-Speech (POS) Tagging
# --------------------------------------------------------------------------------------- +

nltk.help.upenn_tagset('MD')


MD: modal auxiliary
    can cannot could couldn't dare may might must need ought shall should
    shouldn't will would

In [53]:
# Steps : Tokenize > Tag

sentence = "At eight o'clock on Thursday morning Arthur didn't feel very good."
tokens = nltk.word_tokenize(sentence)
tokens


Out[53]:
['At',
 'eight',
 "o'clock",
 'on',
 'Thursday',
 'morning',
 'Arthur',
 'did',
 "n't",
 'feel',
 'very',
 'good',
 '.']

In [55]:
tagged = nltk.pos_tag(tokens)
tagged


Out[55]:
[('At', 'IN'),
 ('eight', 'CD'),
 ("o'clock", 'NN'),
 ('on', 'IN'),
 ('Thursday', 'NNP'),
 ('morning', 'NN'),
 ('Arthur', 'NNP'),
 ('did', 'VBD'),
 ("n't", 'RB'),
 ('feel', 'VB'),
 ('very', 'RB'),
 ('good', 'JJ'),
 ('.', '.')]
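
The (word, tag) pairs feed naturally into a FreqDist, giving a quick part-of-speech profile. A minimal sketch over the tagged tokens above:

# sketch: tag frequencies for the tagged sentence
tag_fd = nltk.FreqDist(tag for (word, tag) in tagged)
tag_fd.most_common()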

In [59]:
nltk.help.upenn_tagset('CD')


CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...

In [65]:
# --------------------------------------------------------------------------------------- +
## 9) Parsing Sentence Structure
# --------------------------------------------------------------------------------------- +

text13 = nltk.word_tokenize("John eats Mango")

# create a context free grammar
grammar = nltk.CFG.fromstring("""
    S -> NP VP
    VP -> V NP
    NP -> 'John' | 'Mango'
    V -> 'eats'
""")

parser = nltk.ChartParser(grammar)
trees = parser.parse_all(text13)
for tree in trees:
    print(tree)


(S (NP John) (VP (V eats) (NP Mango)))
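
With a richer grammar the same parser exposes ambiguity: one sentence, more than one valid tree. A minimal sketch using the classic PP-attachment example (this grammar and sentence are illustrative additions, not from the original notebook):

# sketch: "I saw the man with a telescope" parses two ways
text14 = nltk.word_tokenize("I saw the man with a telescope")
grammar2 = nltk.CFG.fromstring("""
    S -> NP VP
    VP -> V NP | VP PP
    PP -> P NP
    NP -> 'I' | Det N | Det N PP
    Det -> 'the' | 'a'
    N -> 'man' | 'telescope'
    V -> 'saw'
    P -> 'with'
""")
for tree in nltk.ChartParser(grammar2).parse_all(text14):
    print(tree)   # one tree attaches the PP to the VP, the other to the NP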

In [69]:
# --------------------------------------------------------------------------------------- +
## 10) Parse Tree Collection
# --------------------------------------------------------------------------------------- +
from nltk.corpus import treebank

In [70]:
treebank.parsed_sents('wsj_0001.mrg')[0]


[LookupError traceback elided: NLTK could not find the Ghostscript binary ('gs') needed to render the tree inline as a PNG; the text repr below is still returned.]
Out[70]:
Tree('S', [Tree('NP-SBJ', [Tree('NP', [Tree('NNP', ['Pierre']), Tree('NNP', ['Vinken'])]), Tree(',', [',']), Tree('ADJP', [Tree('NP', [Tree('CD', ['61']), Tree('NNS', ['years'])]), Tree('JJ', ['old'])]), Tree(',', [','])]), Tree('VP', [Tree('MD', ['will']), Tree('VP', [Tree('VB', ['join']), Tree('NP', [Tree('DT', ['the']), Tree('NN', ['board'])]), Tree('PP-CLR', [Tree('IN', ['as']), Tree('NP', [Tree('DT', ['a']), Tree('JJ', ['nonexecutive']), Tree('NN', ['director'])])]), Tree('NP-TMP', [Tree('NNP', ['Nov.']), Tree('CD', ['29'])])])]), Tree('.', ['.'])])
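
Since the inline PNG display needs Ghostscript, the tree can be rendered as text instead. A minimal sketch (pretty_print is available in NLTK 3.x):

# sketch: show the treebank parse without Ghostscript
tree = treebank.parsed_sents('wsj_0001.mrg')[0]
print(tree)           # bracketed text form
tree.pretty_print()   # ASCII-art tree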

In [ ]: