notebook.community

Edit and run



In [1]:

    
# A bigram is a pair of consecutive words in a sentence.
# Likewise, a trigram is a sequence of three consecutive words in a sentence.
# More generally, an n-gram is a sequence of n consecutive words in a given sentence.



In [2]:

    
# Importing nltk
import nltk



In [3]:

    
# Sample sentence used as input for every n-gram example in this notebook
text1 = "I think it might rain today."



In [4]:

    
# Split the sentence into tokens. Punctuation becomes its own token, which is
# why the final '.' shows up as a separate element in the n-grams printed below.
# NOTE(review): word_tokenize relies on NLTK's Punkt tokenizer data; if it raises
# LookupError, download that resource first via nltk.download(...) - confirm the
# exact resource name for your NLTK version.
tokens = nltk.word_tokenize(text1)



In [5]:

    
# Build the bigrams (pairs of adjacent tokens) from the token list.
# NOTE(review): in current NLTK this appears to be a lazy, single-use iterator,
# so it can be looped over only once; wrap it in list() if it must be reused -
# verify against your NLTK version.
bigrams = nltk.bigrams(tokens)



In [6]:

    
# Iterate over the bigrams and print each one.
# Observation: every item is a tuple of two strings that are adjacent in the
# token list, so the 7 tokens of the sentence yield 6 bigrams.
for item in bigrams:
    print(item)









    



('I', 'think')
('think', 'it')
('it', 'might')
('might', 'rain')
('rain', 'today')
('today', '.')



In [7]:

    
# Now build the trigrams (triples of adjacent tokens) from the same token list
trigrams = nltk.trigrams(tokens)



In [8]:

    
# Iterate over the trigrams and print each one.
# Observation: every item is a tuple of three strings that are adjacent in the
# token list, so the 7 tokens of the sentence yield 5 trigrams.
for item in trigrams:
    print(item)









    



('I', 'think', 'it')
('think', 'it', 'might')
('it', 'might', 'rain')
('might', 'rain', 'today')
('rain', 'today', '.')



In [9]:

    
# Now using the generic ngrams
from nltk.util import ngrams



In [10]:

    
# ngrams() takes the token sequence and the n-gram size n,
# e.g. n=2 for bigrams and n=3 for trigrams.
# Here we get bigrams from ngrams(); this rebinds the name `bigrams`,
# replacing the result of nltk.bigrams() from the earlier cell.
bigrams = ngrams(tokens, 2)



In [11]:

    
# Print the bigrams produced by ngrams(tokens, 2); the output shows the same
# six pairs that nltk.bigrams() produced earlier in the notebook.
for item in bigrams:
    print(item)









    



('I', 'think')
('think', 'it')
('it', 'might')
('might', 'rain')
('rain', 'today')
('today', '.')



In [12]:

    
# Get trigrams from the generic ngrams() helper by passing n=3; this rebinds
# the name `trigrams`, replacing the earlier nltk.trigrams() result.
trigrams = ngrams(tokens, 3)



In [13]:

    
# Print the trigrams produced by ngrams(tokens, 3); the output shows the same
# five triples that nltk.trigrams() produced earlier in the notebook.
for item in trigrams:
    print(item)









    



('I', 'think', 'it')
('think', 'it', 'might')
('it', 'might', 'rain')
('might', 'rain', 'today')
('rain', 'today', '.')

In the same way, you can generate n-grams of any size with ngrams() by passing the desired n (2 for bigrams, 3 for trigrams, and so on) as the second argument.