Normalizing is the act of cleaning text data to make it uniform. Example:
In [1]:
import nltk
In [2]:
# Load the word-tokenized text of "Alice in Wonderland" from NLTK's Gutenberg corpus.
# (Requires the 'gutenberg' corpus to be downloaded via nltk.download — TODO confirm in this environment.)
alice = nltk.corpus.gutenberg.words('carroll-alice.txt')
In [3]:
# Preview the first 20 tokens of the book (the notebook echoes the slice as Out[3]).
alice[:20]
Out[3]:
In [4]:
# Keep the first 20 tokens in a separate variable; the normalization
# examples below all operate on this small sample.
alice_20 = alice[:20]
In [5]:
# Filter out non-alphabetic tokens (punctuation, etc.) with str.isalpha()
# and print only the words that survive the check.
for token in alice_20:
    if not token.isalpha():
        continue
    print(token)
In [6]:
# Further normalization: fold every token to lower case via str.lower().
for lowered in (token.lower() for token in alice_20):
    print(lowered)
In [7]:
# Combine both steps: a) keep only alphabetic tokens, b) lowercase them.
# A generator expression feeds the cleaned tokens straight into the print loop.
for word in (token.lower() for token in alice_20 if token.isalpha()):
    print(word)
In [8]:
# Sample words for comparing stemmers against a lemmatizer.
# Stemmers: fast, rule-based suffix chopping — results are often unreliable.
# Lemmatizers: compute-intensive (slower) dictionary-backed analysis — better,
# though still not always reliable.
sample_data = ('cats cat lie lying fly flying run ran year yearly '
               'puppy puppies woman women fast faster').split()
In [9]:
# Instantiate the Porter stemmer (rule-based suffix stripper) for the runs below.
porter = nltk.PorterStemmer()
In [10]:
# Observation: some words are normalized correctly by the Porter rules,
# while others come out mangled.
stems = (porter.stem(token) for token in sample_data)
for stem in stems:
    print(stem)
In [11]:
# Try a second stemmer (Lancaster) to compare its output with Porter's.
lancaster = nltk.LancasterStemmer()
In [12]:
# Observation: same story as Porter — some words normalize correctly, some don't,
# so no single stemmer can be relied on to normalize all text.
stemmed = (lancaster.stem(token) for token in sample_data)
for result in stemmed:
    print(result)
In [13]:
# Build a WordNet-based lemmatizer to compare against the stemmers above.
# (Presumably requires the 'wordnet' corpus to be downloaded — verify.)
wnlem = nltk.WordNetLemmatizer()
In [14]:
# Observation: .lemmatize is a compute-intensive operation, and even the
# lemmatizer fails to fully normalize every word in the sample.
lemmas = (wnlem.lemmatize(token) for token in sample_data)
for lemma in lemmas:
    print(lemma)
In [ ]: