In [1]:
# Tokenization is splitting text into smaller units (tokens) such as words or sentences
# Use nltk.word_tokenize() to tokenize by words, nltk.sent_tokenize() to tokenize by sentences
In [2]:
import nltk
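# Note: on a fresh NLTK install, word_tokenize/sent_tokenize need the Punkt tokenizer
# models; if you hit a LookupError, uncomment the next line to download them.
# nltk.download('punkt')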
In [3]:
text1 = 'I am learning the Natural Language Processing.'
In [4]:
# using nltk.word_tokenize() to tokenize the sentence into words
# Note: it splits on whitespace and also separates punctuation, so the trailing '.' becomes its own token
text1_tokens = nltk.word_tokenize(text1)
In [5]:
# We got 8 word tokens here
text1_tokens
Out[5]:
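['I', 'am', 'learning', 'the', 'Natural', 'Language', 'Processing', '.']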
In [6]:
sent1 = 'I am learning the Natural Language Processing. It is fun to Learn.'
In [7]:
# using nltk.sent_tokenize() to tokenize the text into sentences
# Note: it splits at sentence-ending punctuation ('.', '!', '?') followed by whitespace
sent1_tokens = nltk.sent_tokenize(sent1)
In [8]:
# We got 2 sentence tokens here
sent1_tokens
Out[8]:
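['I am learning the Natural Language Processing.', 'It is fun to Learn.']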
In [9]:
# We can loop through the sent tokens and generate the word tokens for each sent token
for sent_tkn in sent1_tokens:
    word_tkns = nltk.word_tokenize(sent_tkn)
    print(word_tkns)
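In [10]:
# A minimal sketch (added for illustration, reusing sent1_tokens from above):
# the same per-sentence word tokenization collected into a nested list
# with a list comprehension instead of an explicit loop.
words_per_sent = [nltk.word_tokenize(s) for s in sent1_tokens]
words_per_sent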
In [ ]: