In [3]:
!pip install nltk


Collecting nltk
  Downloading nltk-3.2.1.tar.gz (1.1MB)
    100% |████████████████████████████████| 1.1MB 396kB/s 
Building wheels for collected packages: nltk
  Running setup.py bdist_wheel for nltk ... - \ | / done
  Stored in directory: /Users/dwdcw/Library/Caches/pip/wheels/55/0b/ce/960dcdaec7c9af5b1f81d471a90c8dae88374386efe6e54a50
Successfully built nltk
Installing collected packages: nltk
Successfully installed nltk-3.2.1

In [1]:
# Two statements were jammed onto one line in the original, which is a
# SyntaxError. They must be separate statements.
import nltk
nltk.download()  # opens the interactive downloader to fetch corpora/models (e.g. punkt, stopwords, wordnet)

In [2]:
# 1. Tokenizing text
# nltk.tokenize provides sentence- and word-level tokenizers.
# word_tokenize is imported here because later cells reuse it.
from nltk.tokenize import word_tokenize, sent_tokenize

text = "Mary had a little lamb. Her fleece was white as snow"
sents = sent_tokenize(text)  # split the raw string into a list of sentences
print(sents)


['Mary had a little lamb.', 'Her fleece was white as snow']

In [13]:
# Tokenize each sentence into its words (tokens).
words = [word_tokenize(sentence) for sentence in sents]
print(words)
# Note that punctuation marks (e.g. the trailing '.') come out as separate tokens.


[['Mary', 'had', 'a', 'little', 'lamb', '.'], ['Her', 'fleece', 'was', 'white', 'as', 'snow']]

In [4]:
# 2. Removing stop words
from nltk.corpus import stopwords   # pre-defined collection of stopwords shipped with nltk
from string import punctuation     # string containing all ASCII punctuation characters

# Combine the English stopword list with the punctuation characters.
# A set is used because order doesn't matter and membership tests are O(1).
customStopWords = set(stopwords.words('english')) | set(punctuation)

In [5]:
wordsWOStopwords=[word for word in word_tokenize(text) if word not in customStopWords] #Stopwords+punctuation removed
print(wordsWOStopwords) #words with out stopwords


['Mary', 'little', 'lamb', 'Her', 'fleece', 'white', 'snow']

In [6]:
# 3. Constructing bigrams: any pair of words that occur consecutively.
# Explicit imports instead of the original wildcard `from nltk.collocations import *`,
# which pollutes the namespace and hides where names come from.
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

bigram_measures = BigramAssocMeasures()  # scoring measures — unused below; kept for later scoring experiments
finder = BigramCollocationFinder.from_words(wordsWOStopwords)  # builds bigram frequency data from the token list
# finder.ngram_fd maps each bigram to its frequency; last expression is the
# cell's displayed output.
sorted(finder.ngram_fd.items())


Out[6]:
[(('Her', 'fleece'), 1),
 (('Mary', 'little'), 1),
 (('fleece', 'white'), 1),
 (('lamb', 'Her'), 1),
 (('little', 'lamb'), 1),
 (('white', 'snow'), 1)]

In [7]:
# 4. Stemming: strip word endings so that morphological variants collapse
# to a common stem (close, closed, closely, closer -> clos).
from nltk.stem.lancaster import LancasterStemmer  # one of several stemmers available in nltk.stem

text2 = "Mary closed on closing night when she was in the mood to close."  # "close" in several morphological forms
stemmer = LancasterStemmer()
stemmedWords = [stemmer.stem(token) for token in word_tokenize(text2)]
print(stemmedWords)  # every form of "close" reduces to the stem "clos"


['mary', 'clos', 'on', 'clos', 'night', 'when', 'she', 'was', 'in', 'the', 'mood', 'to', 'clos', '.']

In [8]:
nltk.pos_tag(word_tokenize(text2)) #part-of-speech tagging; last expression, so the (word, tag) list is displayed
# Penn Treebank tags seen in the output:
#NNP = proper noun, singular
#VBD = verb, past tense
#PRP = personal pronoun


Out[8]:
[('Mary', 'NNP'),
 ('closed', 'VBD'),
 ('on', 'IN'),
 ('closing', 'NN'),
 ('night', 'NN'),
 ('when', 'WRB'),
 ('she', 'PRP'),
 ('was', 'VBD'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mood', 'NN'),
 ('to', 'TO'),
 ('close', 'VB'),
 ('.', '.')]

In [6]:
# 5. Word-sense disambiguation
from nltk.corpus import wordnet as wn  # WordNet: a lexical database (like a thesaurus) of synonym sets

# Each synset represents one distinct sense (definition) of the word.
for synset in wn.synsets('bass'):
    print(synset, "===>", synset.definition())


Synset('bass.n.01') ===> the lowest part of the musical range
Synset('bass.n.02') ===> the lowest part in polyphonic music
Synset('bass.n.03') ===> an adult male singer with the lowest voice
Synset('sea_bass.n.01') ===> the lean flesh of a saltwater fish of the family Serranidae
Synset('freshwater_bass.n.01') ===> any of various North American freshwater fish with lean flesh (especially of the genus Micropterus)
Synset('bass.n.06') ===> the lowest adult male singing voice
Synset('bass.n.07') ===> the member with the lowest range of a family of musical instruments
Synset('bass.n.08') ===> nontechnical name for any of numerous edible marine and freshwater spiny-finned fishes
Synset('bass.s.01') ===> having or denoting a low vocal or instrumental range

In [9]:
from nltk.wsd import lesk  # Lesk algorithm: picks the sense whose definition best overlaps the context

# The same word, 'bass', resolves to different senses in different contexts.
contexts = (
    "Sing in a lower tone, along with the bass",
    "This sea bass was really hard to catch",
)
for context in contexts:
    sense = lesk(word_tokenize(context), 'bass')
    print(sense, "===>", sense.definition())


Synset('bass.n.07') ===> the member with the lowest range of a family of musical instruments
Synset('sea_bass.n.01') ===> the lean flesh of a saltwater fish of the family Serranidae

In [ ]: