In [1]:
import nltk
from nltk.tokenize import word_tokenize
word_tokenize('Hello World.')


Out[1]:
['Hello', 'World', '.']

In [2]:
#This is equivalent to using the TreebankWordTokenizer directly:
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize('Hello World.')


Out[2]:
['Hello', 'World', '.']

In [5]:
#WordPunctTokenizer
#An alternative word tokenizer is WordPunctTokenizer. It splits all punctuation
#into separate tokens, as shown in the following code:
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
tokenizer.tokenize("Can't is a contraction.")


Out[5]:
['Can', "'", 't', 'is', 'a', 'contraction', '.']
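
In [ ]:
# For comparison (a minimal sketch, not part of the original run): the default
# word_tokenize / TreebankWordTokenizer splits the same contraction into 'Ca'
# and "n't" rather than around the apostrophe.
word_tokenize("Can't is a contraction.")
# expected: ['Ca', "n't", 'is', 'a', 'contraction', '.']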

In [6]:
#Tokenizing using a regular expression
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r"[\w']+")
tokenizer.tokenize("Can't is a contraction.")


Out[6]:
["Can't", 'is', 'a', 'contraction']

In [15]:
#tokenizing data from corpora
from nltk.tokenize import sent_tokenize
from nltk.corpus import webtext
text = webtext.raw('overheard.txt')
sents2 = sent_tokenize(text)
sents2[0] 
sents2[678]


Out[15]:
u'Girl: But you already have a Big Mac...\nHobo: Oh, this is all theatrical.'
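
In [ ]:
# sent_tokenize is a convenience wrapper around a pretrained PunktSentenceTokenizer.
# A minimal sketch of loading that tokenizer directly (assumes the 'punkt' model
# has been downloaded, e.g. via nltk.download('punkt')):
import nltk.data
punkt_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
punkt_sents = punkt_tokenizer.tokenize(text)
punkt_sents[678]
# expected to match sents2[678] above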

In [17]:
# Filtering stopwords from corpora
from nltk.corpus import stopwords
english_stops = set(stopwords.words('english'))
words = ["Can't", 'is', 'a', 'contraction']
[word for word in words if word not in english_stops]


Out[17]:
["Can't", 'contraction']

In [18]:
stopwords.fileids()


Out[18]:
[u'danish',
 u'dutch',
 u'english',
 u'finnish',
 u'french',
 u'german',
 u'hungarian',
 u'italian',
 u'kazakh',
 u'norwegian',
 u'portuguese',
 u'russian',
 u'spanish',
 u'swedish',
 u'turkish']

In [19]:
# WordNet is a lexical database for the English language. In other words,
# it's a dictionary designed specifically for natural language processing.
from nltk.corpus import wordnet
syn = wordnet.synsets('cookbook')[0]
syn.name()
syn.definition()


Out[19]:
u'a book of recipes and cooking directions'

In [20]:
# Each Synset in the list has a number of methods you can use to learn more about it.
# The name() method will give you a unique name for the Synset, which you can use to get the Synset directly:
wordnet.synset('cookbook.n.01')


Out[20]:
Synset('cookbook.n.01')

In [22]:
#Simplified part-of-speech (POS) tag
syn.pos()


Out[22]:
u'n'

In [24]:
#Synsets
wordnet.synsets('great')


Out[24]:
[Synset('great.n.01'),
 Synset('great.s.01'),
 Synset('great.s.02'),
 Synset('great.s.03'),
 Synset('bang-up.s.01'),
 Synset('capital.s.03'),
 Synset('big.s.13')]
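
In [ ]:
# A minimal sketch of restricting the lookup by part of speech: synsets() accepts
# a pos argument, so only the noun sense of 'great' is returned here.
wordnet.synsets('great', pos='n')
# expected: [Synset('great.n.01')]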

In [29]:
# All possible synonyms .. 
#As mentioned earlier, many words have multiple Synsets because the word can have different meanings depending on the context. 
#But, let's say you didn't care about the context, and wanted to get all the possible synonyms for a word:
synonyms = []
for syn in wordnet.synsets('book'):
    print syn
    for lemma in syn.lemmas():
        synonyms.append(lemma.name())
len(synonyms)


Synset('book.n.01')
Synset('book.n.02')
Synset('record.n.05')
Synset('script.n.01')
Synset('ledger.n.01')
Synset('book.n.06')
Synset('book.n.07')
Synset('koran.n.01')
Synset('bible.n.01')
Synset('book.n.10')
Synset('book.n.11')
Synset('book.v.01')
Synset('reserve.v.04')
Synset('book.v.03')
Synset('book.v.04')
Out[29]:
38
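
In [ ]:
# Several of these synsets share lemmas, so the set of unique synonyms is smaller
# than the raw count above. A minimal follow-up sketch:
len(set(synonyms))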

In [32]:
# Stemming
#The PorterStemmer class knows a number of regular word forms and suffixes and uses this knowledge ..
# ..to transform your input word to a final stem through a series of steps.
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
stemmer.stem('cooking')


Out[32]:
u'cook'
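
In [ ]:
# A small sketch for comparison with the LancasterStemmer in the next cell:
# PorterStemmer reduces 'cookery' to a non-word stem.
stemmer.stem('cookery')
# expected: u'cookeri'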

In [33]:
#The LancasterStemmer class
#The functions of the LancasterStemmer class are just like the functions of the PorterStemmer class..
#..but can produce slightly different results. It is known to be slightly more aggressive than the PorterStemmer functions:
from nltk.stem import LancasterStemmer
stemmer = LancasterStemmer()
print stemmer.stem('cooking')
print stemmer.stem('cookery')


cook
cookery

In [35]:
#The RegexpStemmer class
#You can also construct your own stemmer using the RegexpStemmer class. 
#It takes a single regular expression (either compiled or as a string) and removes any prefix or suffix that matches the expression:
from nltk.stem import RegexpStemmer
stemmer = RegexpStemmer('ing')
print stemmer.stem('cooking')
print stemmer.stem('cookery')
print stemmer.stem('ingleside')


cook
cookery
leside
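
In [ ]:
# The 'ingleside' result shows the pitfall of an unanchored pattern. A minimal
# sketch of a safer variant: anchor the pattern to the end of the word and
# require a minimum word length (the min argument) before stemming.
stemmer = RegexpStemmer('ing$', min=4)
print stemmer.stem('cooking')
print stemmer.stem('ingleside')
# expected: 'cook' and 'ingleside' (left unchanged)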

In [36]:
# DEFAULT TAGGER
from nltk.tag import DefaultTagger
tagger = DefaultTagger('NN')
tagger.tag(['Hello', 'World'])


Out[36]:
[('Hello', 'NN'), ('World', 'NN')]

In [37]:
#Evaluation
from nltk.corpus import treebank
test_sents = treebank.tagged_sents()[3000:]
tagger.evaluate(test_sents)


Out[37]:
0.14331966328512843

In [41]:
# So, by just choosing NN for every tag, we achieve about 14% accuracy when testing on roughly the last quarter of the treebank corpus.
#Of course, accuracy will be different if you choose a different default tag. 
from nltk.tag import UnigramTagger
from nltk.corpus import treebank
train_sents = treebank.tagged_sents()[:3000]
tagger = UnigramTagger(train_sents)
treebank.sents()[0]


Out[41]:
[u'Pierre',
 u'Vinken',
 u',',
 u'61',
 u'years',
 u'old',
 u',',
 u'will',
 u'join',
 u'the',
 u'board',
 u'as',
 u'a',
 u'nonexecutive',
 u'director',
 u'Nov.',
 u'29',
 u'.']

In [42]:
tagger.tag(treebank.sents()[0])


Out[42]:
[(u'Pierre', u'NNP'),
 (u'Vinken', u'NNP'),
 (u',', u','),
 (u'61', u'CD'),
 (u'years', u'NNS'),
 (u'old', u'JJ'),
 (u',', u','),
 (u'will', u'MD'),
 (u'join', u'VB'),
 (u'the', u'DT'),
 (u'board', u'NN'),
 (u'as', u'IN'),
 (u'a', u'DT'),
 (u'nonexecutive', u'JJ'),
 (u'director', u'NN'),
 (u'Nov.', u'NNP'),
 (u'29', u'CD'),
 (u'.', u'.')]

In [43]:
tagger.evaluate(test_sents)


Out[43]:
0.8588819339520829
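
In [ ]:
# A sketch of the cutoff argument (an assumption worth verifying against your
# NLTK version): a context must occur more than cutoff times in training to be
# learned, which trades some coverage for more reliable tags.
cutoff_tagger = UnigramTagger(train_sents, cutoff=3)
cutoff_tagger.evaluate(test_sents)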

In [45]:
# Backoff tagging is one of the core features of SequentialBackoffTagger..
#..It allows you to chain taggers together so that if one tagger doesn't know how to tag a word, ..
#..it can pass the word on to the next backoff tagger. If that one can't do it, ..
#..it can pass the word on to the next backoff tagger, ..
# ..and so on until there are no backoff taggers left to check.

tagger1 = DefaultTagger('NN')
tagger2 = UnigramTagger(train_sents, backoff=tagger1)
tagger2.evaluate(test_sents)


Out[45]:
0.8758471832505935

In [47]:
# How does this work
print tagger1._taggers == [tagger1]
print tagger2._taggers == [tagger2, tagger1]


True
True

In [49]:
#By themselves, BigramTagger and TrigramTagger perform quite poorly.
#This is partly because they cannot learn context from the first word(s) in a sentence.

from nltk.tag import BigramTagger, TrigramTagger
bitagger = BigramTagger(train_sents)
print bitagger.evaluate(test_sents)
tritagger = TrigramTagger(train_sents)
print tritagger.evaluate(test_sents)


0.11310166199
0.0688107058062

In [51]:
# Let's write a helper that chains taggers together using backoff
def backoff_tagger(train_sents, tagger_classes, backoff=None):
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff

In [52]:
backoff = DefaultTagger('NN')
tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=backoff)
tagger.evaluate(test_sents)


Out[52]:
0.8806820634578028

In [53]:
#Quadgram tagger
#The NgramTagger class can be used by itself to create a tagger that uses a context of more than three tokens (here, four) for its context key.
from nltk.tag import NgramTagger
quadtagger = NgramTagger(4, train_sents)
quadtagger.evaluate(test_sents)


Out[53]:
0.058234405352903085
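
In [ ]:
# As with bigrams and trigrams, a 4-gram context is more useful inside a backoff
# chain than on its own. A sketch using the backoff_tagger helper defined above
# (QuadgramTagger is a small subclass introduced here just for illustration):
class QuadgramTagger(NgramTagger):
    def __init__(self, *args, **kwargs):
        NgramTagger.__init__(self, 4, *args, **kwargs)

quadtagger2 = backoff_tagger(train_sents,
                             [UnigramTagger, BigramTagger, TrigramTagger, QuadgramTagger],
                             backoff=DefaultTagger('NN'))
quadtagger2.evaluate(test_sents)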

In [ ]: