In [1]:
# Bigrams are pairs of consecutive words (two words that appear next to each other) in a sentence.
# Likewise, trigrams are sequences of three consecutive words in a sentence.
# In general, n-grams are sequences of n consecutive words in a given sentence.
In [2]:
# Importing nltk
import nltk
In [3]:
# Sample sentence that every n-gram example below is built from
text1 = "I think it might rain today."
In [4]:
# Tokenizing all the words in the text.
# word_tokenize needs NLTK's Punkt tokenizer data; on a fresh environment the
# lookup fails with LookupError, so fetch the data once and retry.
try:
    tokens = nltk.word_tokenize(text1)
except LookupError:
    # Newer NLTK releases look for "punkt_tab", older ones for "punkt".
    nltk.download("punkt", quiet=True)
    nltk.download("punkt_tab", quiet=True)
    tokens = nltk.word_tokenize(text1)
In [5]:
# Getting the bigrams for the tokens.
# nltk.bigrams returns a one-shot generator; wrapping it in list() keeps the
# pairs available so cells that read `bigrams` can be re-run without
# recomputing it first.
bigrams = list(nltk.bigrams(tokens))
In [6]:
# Iterating through the bigrams and printing each one
# Obs: each item is a tuple of two strings, which are consecutive tokens
for item in bigrams:
    print(item)
In [7]:
# Now let's get the trigrams for the same tokens.
# nltk.trigrams returns a one-shot generator; wrapping it in list() keeps the
# triples available so cells that read `trigrams` can be re-run without
# recomputing it first.
trigrams = list(nltk.trigrams(tokens))
In [8]:
# Iterating through the trigrams and printing each one
# Obs: each item is a tuple of three strings, which are consecutive tokens
for item in trigrams:
    print(item)
In [9]:
# Now using the generic ngrams
from nltk.util import ngrams
In [10]:
# ngrams accepts the tokens and the n-gram size, e.g. 2 for bigrams, 3 for trigrams.
# Getting bigrams from ngrams; ngrams returns a one-shot iterator, so list()
# is used to keep the result available when later cells are re-run.
bigrams = list(ngrams(tokens, 2))
In [11]:
# Printing each bigram produced by ngrams
for item in bigrams:
    print(item)
In [12]:
# Getting trigrams from ngrams; ngrams returns a one-shot iterator, so list()
# is used to keep the result available when later cells are re-run.
trigrams = list(ngrams(tokens, 3))
In [13]:
# Printing each trigram produced by ngrams
for item in trigrams:
    print(item)
In the same way, you can generate n-grams of any size by passing the desired n to ngrams: 2 gives bigrams, 3 gives trigrams, and so on.