In [1]:
import nltk
from nltk.tokenize import word_tokenize
word_tokenize('Hello World.')
Out[1]: ['Hello', 'World', '.']
In [2]:
# This is equivalent to:
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize('Hello World.')
Out[2]: ['Hello', 'World', '.']
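In [ ]:
# Added for contrast (a minimal sketch, reusing the tokenizer above): the
# Treebank tokenizer splits contractions into two tokens rather than keeping
# them whole, which is worth seeing before the punctuation tokenizers below.
tokenizer.tokenize("Can't is a contraction.")
# -> ['Ca', "n't", 'is', 'a', 'contraction', '.']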
In [5]:
# WordPunctTokenizer
# An alternative word tokenizer is WordPunctTokenizer. It splits all punctuation into separate tokens, as shown in the following code:
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
tokenizer.tokenize("Can't is a contraction.")
Out[5]: ['Can', "'", 't', 'is', 'a', 'contraction', '.']
In [6]:
# Tokenizing using a regular expression
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r"[\w']+")
tokenizer.tokenize("Can't is a contraction.")
Out[6]: ["Can't", 'is', 'a', 'contraction']
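In [ ]:
# A sketch of the same class used the other way around: with gaps=True, the
# pattern matches the gaps between tokens instead of the tokens themselves,
# so this splits on whitespace and keeps punctuation attached:
tokenizer = RegexpTokenizer(r'\s+', gaps=True)
tokenizer.tokenize("Can't is a contraction.")
# -> ["Can't", 'is', 'a', 'contraction.']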
In [15]:
# Tokenizing data from corpora
from nltk.tokenize import sent_tokenize
from nltk.corpus import webtext
text = webtext.raw('overheard.txt')
sents2 = sent_tokenize(text)
print sents2[0]
sents2[678]
Out[15]:
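In [ ]:
# sent_tokenize uses an instance of PunktSentenceTokenizer internally. A sketch
# of loading that tokenizer directly (assuming the punkt model is downloaded),
# which avoids re-loading it on every call when tokenizing many texts:
import nltk.data
punkt = nltk.data.load('tokenizers/punkt/english.pickle')
punkt.tokenize(text)[0]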
In [17]:
# Filtering stopwords from corpora
from nltk.corpus import stopwords
english_stops = set(stopwords.words('english'))
words = ["Can't", 'is', 'a', 'contraction']
[word for word in words if word not in english_stops]
Out[17]: ["Can't", 'contraction']
In [18]:
stopwords.fileids()
Out[18]:
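In [ ]:
# Each fileid above is a language, and stopwords.words() accepts any of them.
# For example, the first few Dutch stopwords (the exact list contents depend
# on your corpus version):
stopwords.words('dutch')[:5]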
In [19]:
# WordNet is a lexical database for the English language. In other words, it's a dictionary designed specifically for natural language processing.
from nltk.corpus import wordnet
syn = wordnet.synsets('cookbook')[0]
print syn.name()
syn.definition()
Out[19]: 'a book of recipes and cooking directions'
In [20]:
# Each Synset in the list has a number of methods you can use to learn more about it. The name() method will give you a unique name for the Synset, which you can use to get the Synset directly:
wordnet.synset('cookbook.n.01')
Out[20]: Synset('cookbook.n.01')
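In [ ]:
# Synsets are organized in a hypernym tree. A small sketch of walking up the
# tree with the standard hypernyms() and root_hypernyms() methods:
syn = wordnet.synset('cookbook.n.01')
print syn.hypernyms()       # the more general Synsets one level up
print syn.root_hypernyms()  # the top of the tree (usually entity.n.01 for nouns)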
In [22]:
# Simplified POS
syn.pos()
Out[22]: u'n'
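In [ ]:
# synsets() also takes an optional pos argument, so you can restrict the
# lookup to a single part of speech, e.g. only the noun senses of 'great':
len(wordnet.synsets('great', pos='n'))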
In [24]:
#Synsets
wordnet.synsets('great')
Out[24]:
In [29]:
# All possible synonyms
# As mentioned earlier, many words have multiple Synsets because the word can have different meanings depending on the context.
# But let's say you didn't care about the context and wanted to get all the possible synonyms for a word:
synonyms = []
for syn in wordnet.synsets('book'):
    print syn
    for lemma in syn.lemmas():
        synonyms.append(lemma.name())
len(synonyms)
Out[29]:
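In [ ]:
# The list above can contain duplicates, since the same lemma name may appear
# in several Synsets; a set gives the number of distinct synonyms:
len(set(synonyms))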
In [32]:
# Stemming
# The PorterStemmer class knows a number of regular word forms and suffixes, and uses this knowledge..
# ..to transform your input word to a final stem through a series of steps.
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
stemmer.stem('cooking')
Out[32]: 'cook'
In [33]:
#The LancasterStemmer class
#The functions of the LancasterStemmer class are just like the functions of the PorterStemmer class..
#..but can produce slightly different results. It is known to be slightly more aggressive than the PorterStemmer functions:
from nltk.stem import LancasterStemmer
stemmer = LancasterStemmer()
print stemmer.stem('cooking')
print stemmer.stem('cookery')
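In [ ]:
# For contrast, a sketch of lemmatization instead of stemming: the
# WordNetLemmatizer returns a real dictionary word (a lemma), but only
# transforms the input when given the right part of speech:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print lemmatizer.lemmatize('cooking')           # 'cooking' (treated as a noun by default)
print lemmatizer.lemmatize('cooking', pos='v')  # 'cook'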
In [35]:
#The RegexpStemmer class
#You can also construct your own stemmer using the RegexpStemmer class.
# It takes a single regular expression (either compiled or as a string) and removes any prefix or suffix that matches the expression:
from nltk.stem import RegexpStemmer
stemmer = RegexpStemmer('ing')
print stemmer.stem('cooking')
print stemmer.stem('cookery')
print stemmer.stem('ingleside')
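In [ ]:
# Note how 'ingleside' above loses its leading 'ing', which shows the danger of
# a naive pattern. One more option (a sketch): SnowballStemmer supports several
# languages; pass any entry of SnowballStemmer.languages to the constructor.
from nltk.stem import SnowballStemmer
print SnowballStemmer.languages
stemmer = SnowballStemmer('english')
print stemmer.stem('cooking')
# -> 'cook'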
In [36]:
# DEFAULT TAGGER
from nltk.tag import DefaultTagger
tagger = DefaultTagger('NN')
tagger.tag(['Hello', 'World'])
Out[36]: [('Hello', 'NN'), ('World', 'NN')]
In [37]:
#Evaluation
from nltk.corpus import treebank
test_sents = treebank.tagged_sents()[3000:]
tagger.evaluate(test_sents)
Out[37]:
In [41]:
# So, by just choosing NN for every tag, we can achieve 14% accuracy testing on one-fourth of the treebank corpus.
#Of course, accuracy will be different if you choose a different default tag.
from nltk.tag import UnigramTagger
from nltk.corpus import treebank
train_sents = treebank.tagged_sents()[:3000]
tagger = UnigramTagger(train_sents)
treebank.sents()[0]
Out[41]: ['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.']
In [42]:
tagger.tag(treebank.sents()[0])
Out[42]: [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]
In [43]:
tagger.evaluate(test_sents)
Out[43]:
In [45]:
# Backoff tagging is one of the core features of SequentialBackoffTagger..
#..It allows you to chain taggers together so that if one tagger doesn't know how to tag a word, ..
#..it can pass the word on to the next backoff tagger. If that one can't do it, ..
#..it can pass the word on to the next backoff tagger, ..
# ..and so on until there are no backoff taggers left to check.
tagger1 = DefaultTagger('NN')
tagger2 = UnigramTagger(train_sents, backoff=tagger1)
tagger2.evaluate(test_sents)
Out[45]:
In [47]:
# How does this work
print tagger1._taggers == [tagger1]
print tagger2._taggers == [tagger2, tagger1]
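In [ ]:
# Training a tagger can be slow, so it is common to pickle the trained tagger
# and reload it later. A minimal sketch using the standard pickle module
# (the filename is just an example):
import pickle
with open('tagger.pickle', 'wb') as f:
    pickle.dump(tagger2, f)
with open('tagger.pickle', 'rb') as f:
    tagger2 = pickle.load(f)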
In [49]:
# By themselves, BigramTagger and TrigramTagger perform quite poorly.
# This is partly because they cannot learn context from the first word(s) in a sentence.
from nltk.tag import BigramTagger, TrigramTagger
bitagger = BigramTagger(train_sents)
print bitagger.evaluate(test_sents)
tritagger = TrigramTagger(train_sents)
print tritagger.evaluate(test_sents)
In [51]:
# Let's write a helper that chains a list of taggers together via backoff
def backoff_tagger(train_sents, tagger_classes, backoff=None):
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff
In [52]:
backoff = DefaultTagger('NN')
tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=backoff)
tagger.evaluate(test_sents)
Out[52]:
In [53]:
# Quadgram tagger
# The NgramTagger class can be used by itself to create a tagger that uses ngrams longer than three tokens as its context key.
from nltk.tag import NgramTagger
quadtagger = NgramTagger(4, train_sents)
quadtagger.evaluate(test_sents)
Out[53]:
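In [ ]:
# To use a quadgram tagger inside the backoff chain instead, one approach
# (a sketch; the class name is ours) is a small NgramTagger subclass that
# fixes n=4 so it matches the backoff_tagger helper's calling convention:
class QuadgramTagger(NgramTagger):
    def __init__(self, *args, **kwargs):
        NgramTagger.__init__(self, 4, *args, **kwargs)

quadtagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger, QuadgramTagger], backoff=backoff)
quadtagger.evaluate(test_sents)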
In [ ]: