In [6]:
# --------------------------------------------------------------------------------------- +
# NLTK = Natural Language Toolkit
# --------------------------------------------------------------------------------------- +
# https://www.nltk.org/index.html
# https://www.nltk.org/book
# Ref. Coursera tutorial: Applied Text Mining in Python, Univ. of Michigan.
# --------------------------------------------------------------------------------------- +
import nltk

In [68]:
# --------------------------------------------------------------------------------------- +
# export NLTK_DATA=
# --------------------------------------------------------------------------------------- +
nltk.download()


showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
Out[68]:
True
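
The interactive downloader opens a chooser GUI; in a headless session the same data can be fetched by collection id instead. A minimal sketch using the standard 'book' collection:

# sketch: non-interactive download of everything nltk.book needs
nltk.download('book')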

In [8]:
from nltk.book import *

In [20]:
text7


Out[20]:
<Text: Wall Street Journal>

In [21]:
sents()


sent1: Call me Ishmael .
sent2: The family of Dashwood had long been settled in Sussex .
sent3: In the beginning God created the heaven and the earth .
sent4: Fellow - Citizens of the Senate and of the House of Representatives :
sent5: I have a problem with people PMing me to lol JOIN
sent6: SCENE 1 : [ wind ] [ clop clop clop ] KING ARTHUR : Whoa there !
sent7: Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 .
sent8: 25 SEXY MALE , seeks attrac older single lady , for discreet encounters .
sent9: THE suburb of Saffron Park lay on the sunset side of London , as red and ragged as a cloud of sunset .

In [22]:
sent1


Out[22]:
['Call', 'me', 'Ishmael', '.']

In [10]:
# --------------------------------------------------------------------------------------- +
# Brown Corpus
# https://en.wikipedia.org/wiki/Brown_Corpus
# https://en.wikipedia.org/wiki/Text_corpus
# --------------------------------------------------------------------------------------- +

from nltk.corpus import brown

In [11]:
# https://www.nltk.org/book/ch02.html

brown.categories()


Out[11]:
['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [12]:
brown.words(categories='fiction')


Out[12]:
['Thirty-three', 'Scotty', 'did', 'not', 'go', 'back', ...]

In [13]:
brown.words(categories=['fiction', 'news'])


Out[13]:
['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]
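
The Brown corpus reader also exposes sentence-level and POS-tagged views alongside words(). A minimal sketch using standard corpus-reader methods:

# sketch: other views of the same corpus
brown.sents(categories='news')[0]           # first news sentence, as a token list
brown.tagged_words(categories='news')[:5]   # (word, tag) pairs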

In [14]:
# --------------------------------------------------------------------------------------- +
# Inaugural Corpus
# --------------------------------------------------------------------------------------- +

from nltk.corpus import inaugural
inaugural.fileids()


Out[14]:
['1789-Washington.txt',
 '1793-Washington.txt',
 '1797-Adams.txt',
 '1801-Jefferson.txt',
 '1805-Jefferson.txt',
 '1809-Madison.txt',
 '1813-Madison.txt',
 '1817-Monroe.txt',
 '1821-Monroe.txt',
 '1825-Adams.txt',
 '1829-Jackson.txt',
 '1833-Jackson.txt',
 '1837-VanBuren.txt',
 '1841-Harrison.txt',
 '1845-Polk.txt',
 '1849-Taylor.txt',
 '1853-Pierce.txt',
 '1857-Buchanan.txt',
 '1861-Lincoln.txt',
 '1865-Lincoln.txt',
 '1869-Grant.txt',
 '1873-Grant.txt',
 '1877-Hayes.txt',
 '1881-Garfield.txt',
 '1885-Cleveland.txt',
 '1889-Harrison.txt',
 '1893-Cleveland.txt',
 '1897-McKinley.txt',
 '1901-McKinley.txt',
 '1905-Roosevelt.txt',
 '1909-Taft.txt',
 '1913-Wilson.txt',
 '1917-Wilson.txt',
 '1921-Harding.txt',
 '1925-Coolidge.txt',
 '1929-Hoover.txt',
 '1933-Roosevelt.txt',
 '1937-Roosevelt.txt',
 '1941-Roosevelt.txt',
 '1945-Roosevelt.txt',
 '1949-Truman.txt',
 '1953-Eisenhower.txt',
 '1957-Eisenhower.txt',
 '1961-Kennedy.txt',
 '1965-Johnson.txt',
 '1969-Nixon.txt',
 '1973-Nixon.txt',
 '1977-Carter.txt',
 '1981-Reagan.txt',
 '1985-Reagan.txt',
 '1989-Bush.txt',
 '1993-Clinton.txt',
 '1997-Clinton.txt',
 '2001-Bush.txt',
 '2005-Bush.txt',
 '2009-Obama.txt']

In [15]:
# --------------------------------------------------------------------------------------- +
## 1) CONDITIONAL FREQUENCY DISTRIBUTION
# --------------------------------------------------------------------------------------- +

cfd = nltk.ConditionalFreqDist(
	(target, fileid[:4])
	for fileid in inaugural.fileids()
	for w in inaugural.words(fileid)
	for target in ['america', 'citizen']
	if w.lower().startswith(target))
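
Before plotting, the distribution can be inspected directly. A minimal sketch querying the cfd just built (conditions are 'america'/'citizen', samples are the 4-digit years):

# sketch: query the ConditionalFreqDist directly
cfd.conditions()                                        # ['america', 'citizen']
cfd['america']['2009']                                  # count in the 2009 address
cfd.tabulate(samples=['1789', '1861', '1961', '2009'])  # compact text table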

In [19]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(30,20))
cfd.plot()


[plot: counts of words beginning with 'america' and 'citizen' in each inaugural address, by year]
In [23]:
# --------------------------------------------------------------------------------------- +
## 2) COUNTING VOCABULARY OF WORDS
# --------------------------------------------------------------------------------------- +

text7


Out[23]:
<Text: Wall Street Journal>

In [24]:
sent7


Out[24]:
['Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov.',
 '29',
 '.']

In [25]:
len(sent7)


Out[25]:
18

In [26]:
len(text7)


Out[26]:
100676

In [27]:
# --------------------------------------------------------------------------------------- +
## 3) UNIQUE WORDS, FREQUENCY OF WORDS
# --------------------------------------------------------------------------------------- +

len(set(text7))


Out[27]:
12408
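
A common derived statistic is lexical diversity, the ratio of unique tokens to total tokens. A minimal sketch:

# sketch: lexical diversity of text7
len(set(text7)) / len(text7)   # 12408 / 100676 ~= 0.12, i.e. each type appears ~8 times on average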

In [28]:
# ten of the unique words (set order is arbitrary, so this sample varies between runs)

list(set(text7))[:10]


Out[28]:
['NIH-appointed',
 '130.6',
 'crookery',
 'comparison',
 'District',
 '5,699',
 'scholars',
 'pins',
 'markup',
 'Alstyne']

In [29]:
# frequency of words
dist = FreqDist(text7)
len(dist)


Out[29]:
12408

In [37]:
# words longer than 5 characters that occur more than 100 times
vocab1 = dist.keys()
freqwords = [w for w in vocab1 if len(w) > 5 and dist[w] > 100]
freqwords


Out[37]:
['billion',
 'company',
 'president',
 'because',
 'market',
 'million',
 'shares',
 'trading',
 'program']
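
FreqDist can also report the most frequent tokens directly via most_common(). A minimal sketch (the top entries are dominated by punctuation and stopwords):

# sketch: raw top-10 tokens by count, as (token, count) pairs
dist.most_common(10)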

In [38]:
# --------------------------------------------------------------------------------------- +
## 4) NORMALIZATION and STEMMING
# --------------------------------------------------------------------------------------- +

# - different forms of the same word
# - normalization
input1 = "List listed lists listing litings"
words1 = input1.lower().split(' ')
words1


Out[38]:
['list', 'listed', 'lists', 'listing', 'litings']

In [41]:
# - stemming: reduce each word to its root form (the misspelled 'litings' stems to the non-word 'lite')

porter = nltk.PorterStemmer()
[porter.stem(t) for t in words1]


Out[41]:
['list', 'list', 'list', 'list', 'lite']
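
Porter is only one of the stemmers shipped with NLTK; the Snowball ("Porter2") stemmer is a common alternative. A minimal sketch:

# sketch: compare with the Snowball stemmer
snowball = nltk.SnowballStemmer('english')
[snowball.stem(t) for t in words1]   # largely agrees with Porter on simple inflections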

In [42]:
# --------------------------------------------------------------------------------------- +
## 5) LEMMATIZATION
# --------------------------------------------------------------------------------------- +

# - like stemming, but the output is a valid dictionary word
# - udhr = Universal Declaration of Human Rights

udhr = nltk.corpus.udhr.words('English-Latin1')
udhr[:20]


Out[42]:
['Universal',
 'Declaration',
 'of',
 'Human',
 'Rights',
 'Preamble',
 'Whereas',
 'recognition',
 'of',
 'the',
 'inherent',
 'dignity',
 'and',
 'of',
 'the',
 'equal',
 'and',
 'inalienable',
 'rights',
 'of']

In [43]:
[porter.stem(t) for t in udhr[:20]]


Out[43]:
['univers',
 'declar',
 'of',
 'human',
 'right',
 'preambl',
 'wherea',
 'recognit',
 'of',
 'the',
 'inher',
 'digniti',
 'and',
 'of',
 'the',
 'equal',
 'and',
 'inalien',
 'right',
 'of']

In [45]:
# Lemmatization : Stemming but resulting stems are all valid words
WNlemma = nltk.WordNetLemmatizer()
[WNlemma.lemmatize(t) for t in udhr[:20]]


Out[45]:
['Universal',
 'Declaration',
 'of',
 'Human',
 'Rights',
 'Preamble',
 'Whereas',
 'recognition',
 'of',
 'the',
 'inherent',
 'dignity',
 'and',
 'of',
 'the',
 'equal',
 'and',
 'inalienable',
 'right',
 'of']
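
By default lemmatize() treats every token as a noun, which is why 'Whereas' and 'inherent' pass through unchanged while 'rights' becomes 'right'. A minimal sketch of the pos parameter:

# sketch: the pos argument changes the WordNet lookup
WNlemma.lemmatize('rights')            # 'right' (noun is the default pos)
WNlemma.lemmatize('listed', pos='v')   # 'list'  (treated as a verb)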

In [46]:
# --------------------------------------------------------------------------------------- +
## 6) Tokenization - splitting a sentence into words or tokens
# --------------------------------------------------------------------------------------- +

text11 = "Children shouldn't drink a sugary drink before bed."
text11.split(' ')


Out[46]:
['Children', "shouldn't", 'drink', 'a', 'sugary', 'drink', 'before', 'bed.']

In [47]:
# 8 tokens > 'bed.' still has the period attached

# use NLTK in-built tokenizer
nltk.word_tokenize(text11)


Out[47]:
['Children',
 'should',
 "n't",
 'drink',
 'a',
 'sugary',
 'drink',
 'before',
 'bed',
 '.']

In [48]:
# ... the period is now its own token
# ... "n't" is split out > represents negation > a useful feature
# --------------------------------------------------------------------------------------- +
## 7) Sentence Splitting / boundaries
# --------------------------------------------------------------------------------------- +

text12 = "This is the first sentence. A gallon of milk in the U.S. costs $2.99. Is this the third sentence? Yes, it is!"

sentences = nltk.sent_tokenize(text12)
len(sentences)


Out[48]:
4

In [49]:
sentences


Out[49]:
['This is the first sentence.',
 'A gallon of milk in the U.S. costs $2.99.',
 'Is this the third sentence?',
 'Yes, it is!']

In [50]:
# end of basic text processing. Ref. Coursera Applied Text Mining in Python @ Univ. of Michigan

In [52]:
# --------------------------------------------------------------------------------------- +
## 8) Part-of-Speech (POS) Tagging
# --------------------------------------------------------------------------------------- +

nltk.help.upenn_tagset('MD')


MD: modal auxiliary
    can cannot could couldn't dare may might must need ought shall should
    shouldn't will would

In [53]:
# Steps : Tokenize > Tag

sentence = "At eight o'clock on Thursday morning Arthur didn't feel very good."
tokens = nltk.word_tokenize(sentence)
tokens


Out[53]:
['At',
 'eight',
 "o'clock",
 'on',
 'Thursday',
 'morning',
 'Arthur',
 'did',
 "n't",
 'feel',
 'very',
 'good',
 '.']

In [55]:
tagged = nltk.pos_tag(tokens)
tagged


Out[55]:
[('At', 'IN'),
 ('eight', 'CD'),
 ("o'clock", 'NN'),
 ('on', 'IN'),
 ('Thursday', 'NNP'),
 ('morning', 'NN'),
 ('Arthur', 'NNP'),
 ('did', 'VBD'),
 ("n't", 'RB'),
 ('feel', 'VB'),
 ('very', 'RB'),
 ('good', 'JJ'),
 ('.', '.')]
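
The (word, tag) pairs feed naturally into a FreqDist, giving a quick part-of-speech profile. A minimal sketch over the tagged tokens above:

# sketch: tag frequencies for the tagged sentence
tag_fd = nltk.FreqDist(tag for (word, tag) in tagged)
tag_fd.most_common()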

In [59]:
nltk.help.upenn_tagset('CD')


CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...

In [65]:
# --------------------------------------------------------------------------------------- +
## 9) Parsing Sentence Structure
# --------------------------------------------------------------------------------------- +

text13 = nltk.word_tokenize("John eats Mango")

# create a context free grammar
grammar = nltk.CFG.fromstring("""
    S -> NP VP
    VP -> V NP
    NP -> 'John' | 'Mango'
    V -> 'eats'
""")

parser = nltk.ChartParser(grammar)
trees = parser.parse_all(text13)
for tree in trees:
    print(tree)


(S (NP John) (VP (V eats) (NP Mango)))
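
With a richer grammar the same parser exposes ambiguity: one sentence, more than one valid tree. A minimal sketch using the classic PP-attachment example (this grammar and sentence are illustrative additions, not from the original notebook):

# sketch: "I saw the man with a telescope" parses two ways
text14 = nltk.word_tokenize("I saw the man with a telescope")
grammar2 = nltk.CFG.fromstring("""
    S -> NP VP
    VP -> V NP | VP PP
    PP -> P NP
    NP -> 'I' | Det N | Det N PP
    Det -> 'the' | 'a'
    N -> 'man' | 'telescope'
    V -> 'saw'
    P -> 'with'
""")
for tree in nltk.ChartParser(grammar2).parse_all(text14):
    print(tree)   # one tree attaches the PP to the VP, the other to the NP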

In [69]:
# --------------------------------------------------------------------------------------- +
## 10) Parse Tree Collection
# --------------------------------------------------------------------------------------- +
from nltk.corpus import treebank

In [70]:
treebank.parsed_sents('wsj_0001.mrg')[0]


[LookupError traceback elided: NLTK could not find the Ghostscript binary ('gs') needed to render the tree inline as a PNG; the text repr below is still returned.]
Out[70]:
Tree('S', [Tree('NP-SBJ', [Tree('NP', [Tree('NNP', ['Pierre']), Tree('NNP', ['Vinken'])]), Tree(',', [',']), Tree('ADJP', [Tree('NP', [Tree('CD', ['61']), Tree('NNS', ['years'])]), Tree('JJ', ['old'])]), Tree(',', [','])]), Tree('VP', [Tree('MD', ['will']), Tree('VP', [Tree('VB', ['join']), Tree('NP', [Tree('DT', ['the']), Tree('NN', ['board'])]), Tree('PP-CLR', [Tree('IN', ['as']), Tree('NP', [Tree('DT', ['a']), Tree('JJ', ['nonexecutive']), Tree('NN', ['director'])])]), Tree('NP-TMP', [Tree('NNP', ['Nov.']), Tree('CD', ['29'])])])]), Tree('.', ['.'])])
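
Since the inline PNG display needs Ghostscript, the tree can be rendered as text instead. A minimal sketch (pretty_print is available in NLTK 3.x):

# sketch: show the treebank parse without Ghostscript
tree = treebank.parsed_sents('wsj_0001.mrg')[0]
print(tree)           # bracketed text form
tree.pretty_print()   # ASCII-art tree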

In [ ]: