Code from Mastering Natural Language Processing with Python.
In [2]:
import nltk
# nltk.download()
In [2]:
text="Welcome readers. I hope you find it interesting. please fo reply."
In [3]:
from nltk.tokenize import sent_tokenize
In [4]:
sent_tokenize(text)
Out[4]:
In [5]:
tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
In [6]:
text='Hello everyone. Hope you are all fine and doing well. Hope that you will find the book interesting.'
In [7]:
tokenizer.tokenize(text)
Out[7]:
In [8]:
text=nltk.word_tokenize("PeirreVinken, 59 years old, will join as a nonexecutive director on Nov. 29.")
In [9]:
print(text)
In [10]:
r=input("Please provide some imput text:")
In [11]:
from nltk import word_tokenize
print("The length of this word is", len(word_tokenize(r)),"words.")
In [12]:
from nltk.tokenize import TreebankWordTokenizer
In [13]:
tokenizer = TreebankWordTokenizer()
print(tokenizer.tokenize("Have a nice day. I hope you find the book interesting"))
In [14]:
text=nltk.word_tokenize(" Don't hesitate to ask questions")
print(text)
In [15]:
from nltk.tokenize import WordPunctTokenizer
tokenizer=WordPunctTokenizer()
print(tokenizer.tokenize(" Don't hesitate to ask questions"))
But if you want to use regular expressions:
In [3]:
from nltk.tokenize import RegexpTokenizer
tokenizer=RegexpTokenizer("[\w']+")
print(tokenizer.tokenize("Don't hesitate to ask questions"))
Instead of instantiating a class, here is an alternative way to tokenize with regexes:
In [17]:
from nltk.tokenize import regexp_tokenize
sent="Don't hesitate to ask questions"
print(regexp_tokenize(sent, pattern=r'\w+|\$[\d\.]+|\S+'))
And here is how we can keep apostrophes and hyphens inside the tokens:
In [9]:
tokenizer = RegexpTokenizer(r"['\w\-]+", gaps=False)
print(tokenizer.tokenize("Don't hesitate to ask questions. Hi there! Antoio-b was 54 years old"))
How about selecting only the words that start with a capital letter:
In [19]:
sent=" She secured 90.56 % in class X . She is a meritorious student"
capt = RegexpTokenizer(r'[A-Z]\w+')
print(capt.tokenize(sent))
In [20]:
sent=" She secured 90.56 % in class X . She is a meritorious student"
from nltk.tokenize import BlanklineTokenizer
print(BlanklineTokenizer().tokenize(sent))
Tokenisation using a whitespace tokenizer:
In [21]:
sent=" She secured 90.56 % in class X . She is a meritorious student"
from nltk.tokenize import WhitespaceTokenizer
print(WhitespaceTokenizer().tokenize(sent))
The built-in split method can also be used, with whitespace (the default) or any other separator:
In [22]:
sent= "She secured 90.56 % in class X. She is a meritorious student"
print(sent.split())    # splits on any run of whitespace
print(sent.split(' ')) # splits on single spaces; gives the same result for this sentence
sent=" She secured 90.56 % in class X \n. She is a meritorious student\n"
print(sent.split('\n'))
The SpaceTokenizer works in much the same way as sent.split(' '):
In [23]:
sent=" She secured 90.56 % in class X \n. She is a meritorious student\n"
from nltk.tokenize import SpaceTokenizer
print(SpaceTokenizer().tokenize(sent))
What if all we want is to tokenize the text into lines?
In [24]:
from nltk.tokenize import BlanklineTokenizer
sent=" She secured 90.56 % in class X \n. She is a meritorious student\n"
print(BlanklineTokenizer().tokenize(sent))
from nltk.tokenize import LineTokenizer
print(LineTokenizer(blanklines='keep').tokenize(sent))
print(LineTokenizer(blanklines='discard').tokenize(sent))
Normalization is the process of cleaning up tokens: removing punctuation, converting to lowercase or uppercase, eliminating stopwords, and so on.
Let's start with punctuation:
In [25]:
import re
import string
text=[" It is a pleasant evening.","Guests, who came from US arrived at the venue","Food was tasty."]
from nltk.tokenize import word_tokenize
tokenized_docs=[word_tokenize(doc) for doc in text]
x=re.compile('[%s]' % re.escape(string.punctuation))
tokenized_docs_no_punctuation = []
for review in tokenized_docs:
    new_review = []
    for token in review:
        new_token = x.sub(u'', token)
        if not new_token == u'':
            new_review.append(new_token)
    tokenized_docs_no_punctuation.append(new_review)
print(tokenized_docs_no_punctuation)
Let's convert text to lowercase and uppercase:
In [26]:
text='HARdWork IS KEy to SUCCESS'
print(text.lower())
print(text.upper())
In [27]:
from nltk.corpus import stopwords
stops=set(stopwords.words('english'))
words=["Don't", 'hesitate','to','ask','questions']
print([word for word in words if word not in stops])
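The membership test above is case-sensitive and the English stopword list is all lowercase, so a capitalised token such as "Don't" is never filtered out. Lowercasing each token before the lookup is a common variant; a minimal sketch:
In [ ]:
print([word for word in words if word.lower() not in stops])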
In [28]:
# For other languages:
print(stopwords.fileids())
In [29]:
# Let's list the English stopwords
print(stopwords.words('english'))
In [30]:
def para_fraction(text):
    stopwords = nltk.corpus.stopwords.words('english')
    para = [w for w in text if w.lower() not in stopwords]
    return len(para) / len(text)
In [31]:
print(para_fraction(nltk.corpus.reuters.words()))
In [32]:
print(para_fraction(nltk.corpus.inaugural.words()))
In [33]:
# Using regular expressions
In [34]:
import re
replacement_patterns = [
    (r'won\'t', 'will not'),
    (r'can\'t', 'cannot'),
    (r'i\'m', 'i am'),
    (r'ain\'t', 'is not'),
    (r'(\w+)\'ll', r'\g<1> will'),
    (r'(\w+)n\'t', r'\g<1> not'),
    (r'(\w+)\'ve', r'\g<1> have'),
    (r'(\w+)\'s', r'\g<1> is'),
    (r'(\w+)\'re', r'\g<1> are'),
    (r'(\w+)\'d', r'\g<1> would')
]
class RegexpReplacer(object):
    def __init__(self, patterns=replacement_patterns):
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]
    def replace(self, text):
        s = text
        for (pattern, repl) in self.patterns:
            (s, count) = re.subn(pattern, repl, s)
        return s
rep = RegexpReplacer()
print(rep.replace("can't is a contradiction"))
In [35]:
replacer = RegexpReplacer()
print(replacer.replace("Don't hesitate to ask questions"))
print(replacer.replace("She must've gone to the market but she didn't go"))
In [36]:
# Substitution can take place before tokenisation
replacer = RegexpReplacer()
print(word_tokenize("Don't hesitate to ask questions"))                    # without replacement
print(word_tokenize(replacer.replace("Don't hesitate to ask questions")))  # with replacement
Have a look at this class, which strips repeated characters until a real word emerges:
In [37]:
from nltk.corpus import wordnet
class RepeatReplacer(object):
    """ Removes repeating characters until a valid word is found.
    >>> replacer = RepeatReplacer()
    >>> replacer.replace('looooove')
    'love'
    >>> replacer.replace('oooooh')
    'ooh'
    >>> replacer.replace('goose')
    'goose'
    """
    def __init__(self):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
    def replace(self, word):
        if wordnet.synsets(word):
            return word
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.replace(repl_word)
        else:
            return repl_word
In [38]:
replacer=RepeatReplacer()
print(replacer.replace('lotttt'))
print(replacer.replace('ohhhhh'))
print(replacer.replace('ooohhhhh'))
In [39]:
class WordReplacer(object):
    """ WordReplacer that replaces a given word with a word from the word_map,
    or if the word isn't found, returns the word as is.
    >>> replacer = WordReplacer({'bday': 'birthday'})
    >>> replacer.replace('bday')
    'birthday'
    >>> replacer.replace('happy')
    'happy'
    """
    def __init__(self, word_map):
        self.word_map = word_map
    def replace(self, word):
        return self.word_map.get(word, word)
In [40]:
replacer=WordReplacer({'congrats':'congratulations'})
print(replacer.replace('congrats'))
print(replacer.replace('maths'))
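In practice the word map is usually loaded from a file rather than written inline. A minimal sketch, assuming a hypothetical two-column CSV file synonyms.csv whose rows have the form word,replacement:
In [ ]:
import csv

class CsvWordReplacer(WordReplacer):
    # Build the word map from a two-column CSV file: word,replacement
    def __init__(self, fname):
        word_map = {}
        with open(fname) as f:
            for word, repl in csv.reader(f):
                word_map[word] = repl
        super().__init__(word_map)

# replacer = CsvWordReplacer('synonyms.csv')  # hypothetical file name
# print(replacer.replace('bday'))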
In [41]:
from nltk.corpus import gutenberg
from nltk.probability import FreqDist
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
fd = FreqDist()
for text in gutenberg.fileids():
    for word in gutenberg.words(text):
        fd[word] += 1
ranks = []
freqs = []
# Iterate in decreasing order of frequency so that rank 1 is the most frequent word
for rank, (word, freq) in enumerate(fd.most_common(), start=1):
    ranks.append(rank)
    freqs.append(freq)
# Zipf's law: on log-log axes, frequency versus rank is roughly a straight line
plt.loglog(ranks, freqs)
plt.xlabel('rank(r)', fontsize=14, fontweight='bold')
plt.ylabel('frequency(f)', fontsize=14, fontweight='bold')
plt.grid(True)
fig = plt.figure(figsize=(40,20), dpi= 80, facecolor='w', edgecolor='k')
fig
Out[41]:
In [44]:
from __future__ import print_function
from nltk.metrics import *
training='PERSON OTHER PERSON OTHER OTHER ORGANIZATION'.split()
testing='PERSON OTHER OTHER OTHER OTHER OTHER'.split()
print('Accuracy:' + str(accuracy(training,testing)))
trainset=set(training)
testset=set(testing)
print('Precision:' + str(precision(trainset,testset)))
print('Recall:' + str(recall(trainset,testset)))
print('F_measure:' + str(f_measure(trainset,testset)))
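Here accuracy compares the two label sequences position by position (4 of the 6 positions agree, giving about 0.667), while precision, recall, and F-measure operate on the sets of labels: the intersection of {PERSON, OTHER, ORGANIZATION} and {PERSON, OTHER} has 2 elements, so precision = 2/2 = 1.0, recall = 2/3 ≈ 0.667, and the F-measure (their harmonic mean) is 0.8.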
In [46]:
from nltk.metrics import *
print(edit_distance("relate","relation"))
print(edit_distance("suggestion","calculation"))
In [48]:
X=set([10,20,30,40])
Y=set([20,30,60])
print(jaccard_distance(X,Y))
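The Jaccard distance is 1 - |X ∩ Y| / |X ∪ Y|; here the intersection {20, 30} has 2 elements and the union has 5, so the distance is 1 - 2/5 = 0.6.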
In [58]:
X = set([10,20,30,40])
Y= set([30,50,70])
print(binary_distance(X, Y))
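binary_distance is the simplest of these metrics: it returns 0.0 when the two labels are identical and 1.0 otherwise, so any difference between the sets yields 1.0.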
Or the MASI distance, which is based on partial agreement when multiple labels are present:
In [62]:
X=set([10,20,30,40])
Y=set([30,50,70])
print(masi_distance(X,Y))