In [1]:
# Tokenization is splitting text into smaller units (tokens), such as words or sentences
# Use nltk.word_tokenize() to tokenize by words and nltk.sent_tokenize() to tokenize by sentences

In [2]:
import nltk
# nltk.download('punkt')  # uncomment on the first run to download the Punkt tokenizer models used below

In [3]:
text1 = 'I am learning the Natural Language Processing.'

In [4]:
# using nltk.word_tokenize() to tokenize the sentence into words
# Obs: it splits the string on whitespace and also separates punctuation (such as the final '.') into its own token
text1_tokens = nltk.word_tokenize(text1)

In [5]:
# We got 8 word tokens here
text1_tokens


Out[5]:
['I', 'am', 'learning', 'the', 'Natural', 'Language', 'Processing', '.']
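
In [ ]:
# A small illustrative sketch (not part of the original walkthrough): word_tokenize also
# separates punctuation such as parentheses and commas, and splits contractions,
# into their own tokens. 'extra_text' is just a made-up example string.
extra_text = "NLP (Natural Language Processing) isn't hard, it is fun!"
print(nltk.word_tokenize(extra_text))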

In [6]:
sent1 = 'I am learning the Natural Language Processing. It is fun to Learn.'

In [7]:
# using nltk.sent_tokenize() to tokenize the text into sentences
# Obs: it splits the string at sentence-ending punctuation (such as '.') followed by whitespace
sent1_tokens = nltk.sent_tokenize(sent1)

In [8]:
# We got 2 sentence tokens here
sent1_tokens


Out[8]:
['I am learning the Natural Language Processing.', 'It is fun to Learn.']
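
In [ ]:
# An illustrative sketch (not from the original notebook): sent_tokenize relies on a
# pre-trained Punkt model, so it is generally able to avoid breaking at common
# abbreviations such as 'Dr.' even though they end with a period.
# 'abbrev_text' is a made-up example string.
abbrev_text = 'Dr. Smith teaches NLP. The course starts today.'
print(nltk.sent_tokenize(abbrev_text))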

In [9]:
# We can loop through the sentence tokens and generate word tokens for each sentence
for sent_tkn in sent1_tokens:
    word_tkns = nltk.word_tokenize(sent_tkn)
    print(word_tkns)


['I', 'am', 'learning', 'the', 'Natural', 'Language', 'Processing', '.']
['It', 'is', 'fun', 'to', 'Learn', '.']
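
In [ ]:
# An equivalent sketch using a list comprehension instead of the loop above,
# collecting the word tokens of every sentence into one list of lists.
# 'tokens_per_sentence' is just an illustrative variable name.
tokens_per_sentence = [nltk.word_tokenize(s) for s in sent1_tokens]
print(tokens_per_sentence)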

In [ ]: