We define functions to tokenize and normalize: we tokenize the input and then normalize each token. This first example is here only to illustrate the general program logic. If you try to run it, it raises an error, because the functions are still empty stubs: tokenize() returns None, so the list comprehension has nothing to iterate over.
In [ ]:
def tokenize(input):
    pass

def normalize(input):
    pass
sample = "Hello, Mom!"
tokens = tokenize(sample)
# print(tokens)
normalized = [normalize(token) for token in tokens]
print(normalized)
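As written, running this cell should fail with something like TypeError: 'NoneType' object is not iterable, since tokenize() returns None rather than a list of tokens.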
In [ ]:
def tokenize(input):   # tokenize on white space
    return input.split()

def normalize(input):  # normalize as lower case
    return (input, input.lower())
sample = "Hello, Mom!"
tokens = tokenize(sample)
# print(tokens)
normalized = [normalize(token) for token in tokens]
print(normalized)
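This version should print [('Hello,', 'hello,'), ('Mom!', 'mom!')]: each token is paired with its lowercased form, but the punctuation stays attached to the words because we split only on whitespace. The next cell uses NLTK's tokenizer to separate the punctuation.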
In [ ]:
import nltk
def tokenize(input):   # use NLTK word tokenization
    return nltk.word_tokenize(input)

def normalize(input):  # normalize as POS
    pos = nltk.pos_tag([input])  # since it's a single word, make it a list
    print(pos)
    return pos
sample = "Hello, Mom!"
tokens = tokenize(sample)
# print(tokens)
normalized = [normalize(token) for token in tokens]
print(normalized)
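If the NLTK tokenizer and tagger data are installed (e.g. via nltk.download('punkt') and nltk.download('averaged_perceptron_tagger')), word_tokenize() splits the sample into ['Hello', ',', 'Mom', '!'], and each token gets a tag such as NNP for the words and a punctuation tag for ',' and '!'. The exact tags depend on the tagger model, and tagging isolated words out of context is not especially reliable.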
In [ ]:
import re

def tokenize(input):   # use NLTK word tokenization
    return nltk.word_tokenize(input)

def normalize(input):  # normalize by stripping out the vowels
    return (input, re.sub('[AEIOUaeiou]', '', input))
sample = "Hello, Mom!"
tokens = tokenize(sample)
# print(tokens)
normalized = [normalize(token) for token in tokens]
print(normalized)
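With NLTK tokenization, this should print something like [('Hello', 'Hll'), (',', ','), ('Mom', 'Mm'), ('!', '!')]: each token paired with a copy that has its vowels removed.

Putting the pieces together, here is a minimal sketch (assuming NLTK and its tokenizer data are available) that tokenizes once and gathers several normalized forms of each token; normalize_all() is a name introduced here only for illustration.

In [ ]:
import re
import nltk

def tokenize(input):        # use NLTK word tokenization
    return nltk.word_tokenize(input)

def normalize_all(token):   # collect several normalized forms of one token
    return (token, token.lower(), re.sub('[AEIOUaeiou]', '', token))

sample = "Hello, Mom!"
normalized = [normalize_all(token) for token in tokenize(sample)]
print(normalized)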