In [3]:
!pip install nltk
In [1]:
import nltk
nltk.download() #opens the NLTK downloader so you can fetch corpora and models
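In [ ]:
#Alternatively (a sketch, not part of the original lesson): fetch just the
#resources this notebook needs instead of opening the interactive downloader
nltk.download('punkt') #tokenizer models used by word_tokenize/sent_tokenize
nltk.download('stopwords') #the stopword lists used in step 2
nltk.download('averaged_perceptron_tagger') #the default pos_tag model
nltk.download('wordnet') #the lexicon used in step 5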
In [2]:
#1. Tokenizing Text
text="Mary had a little lamb. Her fleece was white as snow"
from nltk.tokenize import word_tokenize, sent_tokenize #nltk.tokenize is the module; word_tokenize and sent_tokenize are functions
sents=sent_tokenize(text) #sent = sentence
print(sents) #list of sentences
In [13]:
words=[word_tokenize(sent) for sent in sents] #list of words from each sentence
print(words) #Sentences are tokenized into words(aka tokens)
#note that punctuation marks are treated as separate tokens
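In [ ]:
#A small sketch (not in the original lesson): different tokenizers split
#punctuation differently, which matters for contractions and possessives
from nltk.tokenize import wordpunct_tokenize
print(word_tokenize("Mary's lamb wasn't small.")) #splits "wasn't" into "was"/"n't"
print(wordpunct_tokenize("Mary's lamb wasn't small.")) #splits on every punctuation boundary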
In [4]:
#2. Removing Stop Words
from nltk.corpus import stopwords #importing a pre-defined collection of stopwords provided by nltk.corpus
from string import punctuation #importing all punctuation marks from the string module
customStopWords=set(stopwords.words('english')+list(punctuation)) #stopwords.words('english') = list of stopwords in english
#Note that we've combined english stopwords and punctuation marks into a "set": order doesn't matter here, and set membership tests are fast
In [5]:
wordsWOStopwords=[word for word in word_tokenize(text) if word not in customStopWords] #Stopwords+punctuation removed
print(wordsWOStopwords) #words without stopwords
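In [ ]:
#Caveat (a sketch, not in the original lesson): the stopword list is all
#lowercase, so capitalized stopwords like "Her" survive the filter above.
#Lowercasing the text before comparing catches them:
wordsLower=[word for word in word_tokenize(text.lower()) if word not in customStopWords]
print(wordsLower)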
In [6]:
#3. Constructing Bigrams = any pair of words that occur consecutively
from nltk.collocations import * #import all functions from collocations module of nltk
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(wordsWOStopwords) #we use a class called 'BigramCollocationFinder' which helps to construct bigrams from a list of words
sorted(finder.ngram_fd.items()) #the finder object exposes a frequency distribution, ngram_fd; sorting its items() lists each bigram with its count
#Below is the list of all bigrams(consecutive words) with their frequencies
Out[6]:
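In [ ]:
#The bigram_measures object created above goes unused so far; as a sketch (not
#in the original lesson), it can score and rank the bigrams, e.g. by pointwise
#mutual information (PMI):
finder.nbest(bigram_measures.pmi, 3) #the 3 highest-scoring bigrams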
In [7]:
#4. Stemming (stripping word endings so close, closed, closely, closer all reduce to a stem like "clos")
text2 = "Mary closed on closing night when she was in the mood to close." #word "close" occurs in different morphological forms
from nltk.stem.lancaster import LancasterStemmer #nltk.stem module has many stemming approaches. Here we use the LancasterStemmer class
st=LancasterStemmer() #Instantiate the class
stemmedWords=[st.stem(word) for word in word_tokenize(text2)] #Just call the stem(..) method
print(stemmedWords)
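In [ ]:
#Lancaster is an aggressive stemmer; as a sketch (not in the original lesson),
#the milder PorterStemmer keeps stems closer to real words:
from nltk.stem.porter import PorterStemmer
pst=PorterStemmer()
print([pst.stem(word) for word in word_tokenize(text2)])
#e.g. Porter maps "closed" and "closing" to "close", where Lancaster gives "clos"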
In [8]:
nltk.pos_tag(word_tokenize(text2)) #part-of-speech tagging
#NNP = proper noun, singular
#VBD = verb, past tense
#PRP = personal pronoun
Out[8]:
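In [ ]:
#A sketch (not in the original lesson): NLTK can describe any Penn Treebank
#tag; this needs the 'tagsets' resource (nltk.download('tagsets'))
nltk.help.upenn_tagset('VBD')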
In [6]:
#5. Word Sense Disambiguation
from nltk.corpus import wordnet as wn #wordnet is a lexical database that groups words into sets of synonyms (synsets)
for ss in wn.synsets('bass'): #each synset represents one single definition of a word
    print(ss,"===>", ss.definition())
In [9]:
from nltk.wsd import lesk #lesk is a function for word-sense disambiguation
sense1 = lesk(word_tokenize("Sing in a lower tone, along with the bass"),'bass') #asking for the sense of 'bass' in the context "Sing in a lower tone, along with the bass"
print(sense1,"===>", sense1.definition())
sense2 = lesk(word_tokenize("This sea bass was really hard to catch"),'bass')
print(sense2,"===>", sense2.definition())
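In [ ]:
#lesk can also be nudged with a part-of-speech hint; a sketch (not in the
#original lesson) where pos='n' restricts the candidate synsets to nouns:
sense3 = lesk(word_tokenize("This sea bass was really hard to catch"),'bass', pos='n')
print(sense3,"===>", sense3.definition())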
In [ ]: