In [6]:
# --------------------------------------------------------------------------------------- +
# NLTK = Natural Language Toolkit
# --------------------------------------------------------------------------------------- +
# https://www.nltk.org/index.html
# https://www.nltk.org/book
# Ref. Coursera tutorial: Applied Text Mining in Python, Univ. of Michigan.
# --------------------------------------------------------------------------------------- +
import nltk
In [68]:
# --------------------------------------------------------------------------------------- +
# export NLTK_DATA=
# --------------------------------------------------------------------------------------- +
nltk.download()
Out[68]:
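In [ ]:
# Non-interactive alternative (a sketch): fetch the data by collection id
# instead of using the downloader GUI; 'book' is assumed to bundle the
# corpora used below (nltk.book texts, brown, inaugural, udhr, treebank).
nltk.download('book')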
In [8]:
from nltk.book import *
In [20]:
text7
Out[20]:
In [21]:
sents()
In [22]:
sent1
Out[22]:
In [10]:
# --------------------------------------------------------------------------------------- + 1
# https://en.wikipedia.org/wiki/Brown_Corpus
# https://en.wikipedia.org/wiki/Text_corpus
# --------------------------------------------------------------------------------------- +
from nltk.corpus import brown
In [11]:
# https://www.nltk.org/book/ch02.html
brown.categories()
Out[11]:
In [12]:
brown.words(categories='fiction')
Out[12]:
In [13]:
brown.words(categories=['fiction', 'news'])
Out[13]:
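In [ ]:
# Quick sketch: rough sizes of the two categories just loaded;
# len() over the corpus word lists is standard reader usage.
len(brown.words(categories='fiction')), len(brown.words(categories='news'))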
In [14]:
# --------------------------------------------------------------------------------------- + 2
# Inaugural Corpus
# --------------------------------------------------------------------------------------- +
from nltk.corpus import inaugural
inaugural.fileids()
Out[14]:
In [15]:
# --------------------------------------------------------------------------------------- +
## 1) CONDITIONAL FREQUENCY DISTRIBUTION
# --------------------------------------------------------------------------------------- +
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
In [19]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(30,20))
cfd.plot()
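In [ ]:
# Sketch of the same counts as a table via ConditionalFreqDist.tabulate();
# the sample years are illustrative picks, assuming those inaugurations
# are present in the corpus.
cfd.tabulate(conditions=['america', 'citizen'],
             samples=['1789', '1861', '1941', '2009'])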
In [23]:
# --------------------------------------------------------------------------------------- +
## 2) COUNTING VOCABULARY OF WORDS
# --------------------------------------------------------------------------------------- +
text7
Out[23]:
In [24]:
sent7
Out[24]:
In [25]:
len(sent7)
Out[25]:
In [26]:
len(text7)
Out[26]:
In [27]:
# --------------------------------------------------------------------------------------- +
## 3) UNIQUE WORDS, FREQUENCY of WORDS
# --------------------------------------------------------------------------------------- +
len(set(text7))
Out[27]:
In [28]:
# ten of the unique words (set order is arbitrary)
list(set(text7))[:10]  # Python 3 str (unicode)
Out[28]:
In [29]:
# frequency of words
dist = FreqDist(text7)
len(dist)
Out[29]:
In [37]:
# words longer than 5 characters that occur more than 100 times
vocab1 = dist.keys()
freqwords = [w for w in vocab1 if len(w) > 5 and dist[w] > 100]
freqwords
Out[37]:
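In [ ]:
# Sketch: FreqDist.most_common() gives the top-n tokens directly;
# punctuation and function words dominate the raw counts.
dist.most_common(10)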
In [38]:
# --------------------------------------------------------------------------------------- +
## 4) NORMALIZATION and STEMMING
# --------------------------------------------------------------------------------------- +
# - different inflected forms of the same word
# - normalization: lowercase, then split into tokens
input1 = "List listed lists listing listings"
words1 = input1.lower().split(' ')
words1
Out[38]:
In [41]:
# - stemming: reduce each word to its root form
porter = nltk.PorterStemmer()
[porter.stem(t) for t in words1]
Out[41]:
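In [ ]:
# Sketch for comparison: the Lancaster stemmer (also in NLTK) is more
# aggressive than Porter on the same inputs.
lancaster = nltk.LancasterStemmer()
[lancaster.stem(t) for t in words1]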
In [42]:
# --------------------------------------------------------------------------------------- +
## 5) LEMMATIZATION
# --------------------------------------------------------------------------------------- +
# - map inflected forms to a single dictionary form
# - udhr = Universal Declaration of Human Rights
udhr = nltk.corpus.udhr.words('English-Latin1')
udhr[:20]
Out[42]:
In [43]:
# Porter stems are not always valid words ('univers', 'declar', ...)
[porter.stem(t) for t in udhr[:20]]
Out[43]:
In [45]:
# Lemmatization: like stemming, but the results are valid dictionary words
WNlemma = nltk.WordNetLemmatizer()
[WNlemma.lemmatize(t) for t in udhr[:20]]
Out[45]:
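In [ ]:
# Sketch: lemmatize() treats tokens as nouns by default; passing a
# WordNet part-of-speech tag can change the result.
WNlemma.lemmatize('living'), WNlemma.lemmatize('living', pos='v')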
In [46]:
# --------------------------------------------------------------------------------------- +
## 6) Tokenization - splitting a sentence into words or tokens
# --------------------------------------------------------------------------------------- +
text11 = "Children shouldn't drink a sugary drink before bed."
text11.split(' ')
Out[46]:
In [47]:
# split(' ') gives 8 words, but 'bed.' has the period attached
# use NLTK's built-in tokenizer instead
nltk.word_tokenize(text11)
Out[47]:
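In [ ]:
# Sketch: word_tokenize() yields more tokens than split() because it
# separates punctuation and splits the contraction "shouldn't".
len(text11.split(' ')), len(nltk.word_tokenize(text11))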
In [48]:
# ... the period is now a separate token
# ... "shouldn't" splits into 'should' + "n't" > negation is a useful feature
# --------------------------------------------------------------------------------------- +
## 7) Sentence Splitting / boundaries
# --------------------------------------------------------------------------------------- +
text12 = "This is the first sentence. A gallon of milk in the U.S. costs $2.99. Is this the third sentence? Yes, it is!"
sentences = nltk.sent_tokenize(text12)
len(sentences)
Out[48]:
In [49]:
sentences
Out[49]:
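In [ ]:
# Sketch: a naive split on '. ' breaks inside "U.S." and "$2.99",
# which is why the trained sentence tokenizer above is preferred.
text12.split('. ')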
In [50]:
# end. Ref. Coursera Applied Text Mining in Python @ Univ. of Michigan
In [52]:
# --------------------------------------------------------------------------------------- +
## 8) Part-of-Speech (POS) Tagging
# --------------------------------------------------------------------------------------- +
nltk.help.upenn_tagset('MD')
In [53]:
# Steps : Tokenize > Tag
sentence = "At eight o'clock on Thursday morning Arthur didn't feel very good."
tokens = nltk.word_tokenize(sentence)
tokens
Out[53]:
In [55]:
tagged = nltk.pos_tag(tokens)
tagged
Out[55]:
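In [ ]:
# Sketch: tag frequencies for the tagged sentence; FreqDist accepts
# any iterable of hashable items.
nltk.FreqDist(tag for word, tag in tagged).most_common()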
In [59]:
nltk.help.upenn_tagset('CD')
In [65]:
# --------------------------------------------------------------------------------------- +
## 9) Parsing Sentence Structure
# --------------------------------------------------------------------------------------- +
text13 = nltk.word_tokenize("John eats Mango")
# create a context free grammar
grammar = nltk.CFG.fromstring("""
S -> NP VP
VP -> V NP
NP -> 'John' | 'Mango'
V -> 'eats'
""")
parser = nltk.ChartParser(grammar)
trees = parser.parse_all(text13)
for tree in trees:
    print(tree)
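In [ ]:
# Sketch: parse_all() returns a list of nltk.Tree objects; label() and
# leaves() are standard Tree methods.
tree = trees[0]
tree.label(), tree.leaves()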
In [69]:
# --------------------------------------------------------------------------------------- +
## 10) Parse Tree Collection
# --------------------------------------------------------------------------------------- +
from nltk.corpus import treebank
In [70]:
treebank.parsed_sents('wsj_0001.mrg')[0]
Out[70]:
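In [ ]:
# Sketch: the same file viewed through the flat corpus-reader methods.
treebank.words('wsj_0001.mrg')[:10], treebank.tagged_words('wsj_0001.mrg')[:5]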
In [ ]: