In [1]:
import nltk
from nltk.tokenize import word_tokenize
word_tokenize('Hello World.')


Out[1]:
['Hello', 'World', '.']

In [2]:
#This is equivalent to using the TreebankWordTokenizer directly:
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize('Hello World.')


Out[2]:
['Hello', 'World', '.']

In [5]:
#WordPunctTokenizer
#An alternative word tokenizer is WordPunctTokenizer. It splits all punctuation
#into separate tokens, as shown in the following code:
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
tokenizer.tokenize("Can't is a contraction.")


Out[5]:
['Can', "'", 't', 'is', 'a', 'contraction', '.']
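
In [ ]:
# For comparison (a minimal sketch, not part of the original run): the default
# word_tokenize / TreebankWordTokenizer splits the same contraction into 'Ca'
# and "n't" rather than around the apostrophe.
word_tokenize("Can't is a contraction.")
# expected: ['Ca', "n't", 'is', 'a', 'contraction', '.']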

In [6]:
#Tokenizing using a regular expression
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r"[\w']+")
tokenizer.tokenize("Can't is a contraction.")


Out[6]:
["Can't", 'is', 'a', 'contraction']

In [15]:
#tokenizing data from corpora
from nltk.tokenize import sent_tokenize
from nltk.corpus import webtext
text = webtext.raw('overheard.txt')
sents2 = sent_tokenize(text)
sents2[0] 
sents2[678]


Out[15]:
u'Girl: But you already have a Big Mac...\nHobo: Oh, this is all theatrical.'
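
In [ ]:
# sent_tokenize is a convenience wrapper around a pretrained PunktSentenceTokenizer.
# A minimal sketch of loading that tokenizer directly (assumes the 'punkt' model
# has been downloaded, e.g. via nltk.download('punkt')):
import nltk.data
punkt_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
punkt_sents = punkt_tokenizer.tokenize(text)
punkt_sents[678]
# expected to match sents2[678] above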

In [17]:
# Filtering stopwords from corpora
from nltk.corpus import stopwords
english_stops = set(stopwords.words('english'))
words = ["Can't", 'is', 'a', 'contraction']
[word for word in words if word not in english_stops]


Out[17]:
["Can't", 'contraction']

In [18]:
stopwords.fileids()


Out[18]:
[u'danish',
 u'dutch',
 u'english',
 u'finnish',
 u'french',
 u'german',
 u'hungarian',
 u'italian',
 u'kazakh',
 u'norwegian',
 u'portuguese',
 u'russian',
 u'spanish',
 u'swedish',
 u'turkish']

In [19]:
# WordNet is a lexical database for the English language. In other words,
# it's a dictionary designed specifically for natural language processing.
from nltk.corpus import wordnet
syn = wordnet.synsets('cookbook')[0]
syn.name()
syn.definition()


Out[19]:
u'a book of recipes and cooking directions'

In [20]:
# Each Synset in the list has a number of methods you can use to learn more about it.
# The name() method will give you a unique name for the Synset, which you can use to get the Synset directly:
wordnet.synset('cookbook.n.01')


Out[20]:
Synset('cookbook.n.01')

In [22]:
#Simplified part-of-speech (POS) tag
syn.pos()


Out[22]:
u'n'

In [24]:
#Synsets
wordnet.synsets('great')


Out[24]:
[Synset('great.n.01'),
 Synset('great.s.01'),
 Synset('great.s.02'),
 Synset('great.s.03'),
 Synset('bang-up.s.01'),
 Synset('capital.s.03'),
 Synset('big.s.13')]
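
In [ ]:
# A minimal sketch of restricting the lookup by part of speech: synsets() accepts
# a pos argument, so only the noun sense of 'great' is returned here.
wordnet.synsets('great', pos='n')
# expected: [Synset('great.n.01')]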

In [29]:
# All possible synonyms .. 
#As mentioned earlier, many words have multiple Synsets because the word can have different meanings depending on the context. 
#But, let's say you didn't care about the context, and wanted to get all the possible synonyms for a word:
synonyms = []
for syn in wordnet.synsets('book'):
    print syn
    for lemma in syn.lemmas():
        synonyms.append(lemma.name())
len(synonyms)


Synset('book.n.01')
Synset('book.n.02')
Synset('record.n.05')
Synset('script.n.01')
Synset('ledger.n.01')
Synset('book.n.06')
Synset('book.n.07')
Synset('koran.n.01')
Synset('bible.n.01')
Synset('book.n.10')
Synset('book.n.11')
Synset('book.v.01')
Synset('reserve.v.04')
Synset('book.v.03')
Synset('book.v.04')
Out[29]:
38
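
In [ ]:
# Several of these synsets share lemmas, so the set of unique synonyms is smaller
# than the raw count above. A minimal follow-up sketch:
len(set(synonyms))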

In [32]:
# Stemming
#The PorterStemmer class knows a number of regular word forms and suffixes and uses this knowledge ..
# ..to transform your input word to a final stem through a series of steps.
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
stemmer.stem('cooking')


Out[32]:
u'cook'
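
In [ ]:
# A small sketch for comparison with the LancasterStemmer in the next cell:
# PorterStemmer reduces 'cookery' to a non-word stem.
stemmer.stem('cookery')
# expected: u'cookeri'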

In [33]:
#The LancasterStemmer class
#The functions of the LancasterStemmer class are just like the functions of the PorterStemmer class..
#..but can produce slightly different results. It is known to be slightly more aggressive than the PorterStemmer functions:
from nltk.stem import LancasterStemmer
stemmer = LancasterStemmer()
print stemmer.stem('cooking')
print stemmer.stem('cookery')


cook
cookery

In [35]:
#The RegexpStemmer class
#You can also construct your own stemmer using the RegexpStemmer class. 
#It takes a single regular expression (either compiled or as a string) and removes any prefix or suffix that matches the expression:
from nltk.stem import RegexpStemmer
stemmer = RegexpStemmer('ing')
print stemmer.stem('cooking')
print stemmer.stem('cookery')
print stemmer.stem('ingleside')


cook
cookery
leside
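
In [ ]:
# The 'ingleside' result shows the pitfall of an unanchored pattern. A minimal
# sketch of a safer variant: anchor the pattern to the end of the word and
# require a minimum word length (the min argument) before stemming.
stemmer = RegexpStemmer('ing$', min=4)
print stemmer.stem('cooking')
print stemmer.stem('ingleside')
# expected: 'cook' and 'ingleside' (left unchanged)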

In [36]:
# DEFAULT TAGGER
from nltk.tag import DefaultTagger
tagger = DefaultTagger('NN')
tagger.tag(['Hello', 'World'])


Out[36]:
[('Hello', 'NN'), ('World', 'NN')]

In [37]:
#Evaluation
from nltk.corpus import treebank
test_sents = treebank.tagged_sents()[3000:]
tagger.evaluate(test_sents)


Out[37]:
0.14331966328512843

In [41]:
# So, by just choosing NN for every tag, we achieve about 14% accuracy when testing on roughly the last quarter of the treebank corpus.
#Of course, accuracy will be different if you choose a different default tag. 
from nltk.tag import UnigramTagger
from nltk.corpus import treebank
train_sents = treebank.tagged_sents()[:3000]
tagger = UnigramTagger(train_sents)
treebank.sents()[0]


Out[41]:
[u'Pierre',
 u'Vinken',
 u',',
 u'61',
 u'years',
 u'old',
 u',',
 u'will',
 u'join',
 u'the',
 u'board',
 u'as',
 u'a',
 u'nonexecutive',
 u'director',
 u'Nov.',
 u'29',
 u'.']

In [42]:
tagger.tag(treebank.sents()[0])


Out[42]:
[(u'Pierre', u'NNP'),
 (u'Vinken', u'NNP'),
 (u',', u','),
 (u'61', u'CD'),
 (u'years', u'NNS'),
 (u'old', u'JJ'),
 (u',', u','),
 (u'will', u'MD'),
 (u'join', u'VB'),
 (u'the', u'DT'),
 (u'board', u'NN'),
 (u'as', u'IN'),
 (u'a', u'DT'),
 (u'nonexecutive', u'JJ'),
 (u'director', u'NN'),
 (u'Nov.', u'NNP'),
 (u'29', u'CD'),
 (u'.', u'.')]

In [43]:
tagger.evaluate(test_sents)


Out[43]:
0.8588819339520829
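
In [ ]:
# A sketch of the cutoff argument (an assumption worth verifying against your
# NLTK version): a context must occur more than cutoff times in training to be
# learned, which trades some coverage for more reliable tags.
cutoff_tagger = UnigramTagger(train_sents, cutoff=3)
cutoff_tagger.evaluate(test_sents)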

In [45]:
# Backoff tagging is one of the core features of SequentialBackoffTagger..
#..It allows you to chain taggers together so that if one tagger doesn't know how to tag a word, ..
#..it can pass the word on to the next backoff tagger. If that one can't do it, ..
#..it can pass the word on to the next backoff tagger, ..
# ..and so on until there are no backoff taggers left to check.

tagger1 = DefaultTagger('NN')
tagger2 = UnigramTagger(train_sents, backoff=tagger1)
tagger2.evaluate(test_sents)


Out[45]:
0.8758471832505935

In [47]:
# How does this work
print tagger1._taggers == [tagger1]
print tagger2._taggers == [tagger2, tagger1]


True
True

In [49]:
#By themselves, BigramTagger and TrigramTagger perform quite poorly.
#This is partly because they cannot learn context from the first word(s) in a sentence.

from nltk.tag import BigramTagger, TrigramTagger
bitagger = BigramTagger(train_sents)
print bitagger.evaluate(test_sents)
tritagger = TrigramTagger(train_sents)
print tritagger.evaluate(test_sents)


0.11310166199
0.0688107058062

In [51]:
# Let's write a helper that chains taggers together using backoff
def backoff_tagger(train_sents, tagger_classes, backoff=None):
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff

In [52]:
backoff = DefaultTagger('NN')
tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=backoff)
tagger.evaluate(test_sents)


Out[52]:
0.8806820634578028

In [53]:
#Quadgram tagger
#The NgramTagger class can be used by itself to create a tagger that uses a context of more than three tokens (here, four) for its context key.
from nltk.tag import NgramTagger
quadtagger = NgramTagger(4, train_sents)
quadtagger.evaluate(test_sents)


Out[53]:
0.058234405352903085
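
In [ ]:
# As with bigrams and trigrams, a 4-gram context is more useful inside a backoff
# chain than on its own. A sketch using the backoff_tagger helper defined above
# (QuadgramTagger is a small subclass introduced here just for illustration):
class QuadgramTagger(NgramTagger):
    def __init__(self, *args, **kwargs):
        NgramTagger.__init__(self, 4, *args, **kwargs)

quadtagger2 = backoff_tagger(train_sents,
                             [UnigramTagger, BigramTagger, TrigramTagger, QuadgramTagger],
                             backoff=DefaultTagger('NN'))
quadtagger2.evaluate(test_sents)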

In [ ]: