In [1]:
# Tokenization is splitting text into smaller units (tokens) such as words or sentences
# Use nltk.word_tokenize() to tokenize by words, nltk.sent_tokenize() to tokenize by sentences
In [2]:
import nltk
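# Note: on a fresh NLTK install, word_tokenize/sent_tokenize need the Punkt tokenizer
# models; if you hit a LookupError, uncomment the next line to download them.
# nltk.download('punkt')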
In [3]:
text1 = 'I am learning the Natural Language Processing.'
In [4]:
# using nltk.word_tokenize() to tokenize the sentence into words
# Note: it splits on whitespace and also separates punctuation, so the trailing '.' becomes its own token
text1_tokens = nltk.word_tokenize(text1)
In [5]:
# We got 8 word tokens here
text1_tokens
Out[5]:
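['I', 'am', 'learning', 'the', 'Natural', 'Language', 'Processing', '.']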
In [6]:
sent1 = 'I am learning the Natural Language Processing. It is fun to Learn.'
In [7]:
# using nltk.sent_tokenize() to tokenize the text into sentences
# Note: it splits at sentence-ending punctuation ('.', '!', '?') followed by whitespace
sent1_tokens = nltk.sent_tokenize(sent1)
In [8]:
# We got 2 sentence tokens here
sent1_tokens
Out[8]:
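['I am learning the Natural Language Processing.', 'It is fun to Learn.']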
In [9]:
# We can loop through the sent tokens and generate the word tokens for each sent token
for sent_tkn in sent1_tokens:
    word_tkns = nltk.word_tokenize(sent_tkn)
    print(word_tkns)
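In [10]:
# A minimal sketch (added for illustration, reusing sent1_tokens from above):
# the same per-sentence word tokenization collected into a nested list
# with a list comprehension instead of an explicit loop.
words_per_sent = [nltk.word_tokenize(s) for s in sent1_tokens]
words_per_sent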
In [ ]: