notebook.community

Edit and run



In [16]:

    
'''
NLTK cookbook Naive Bayes Classifier.

Trains a Naive Bayes classifier on the 'neg'/'pos' files of NLTK's
movie_reviews corpus and reports its accuracy on a held-out split.
'''
__author__ = 'Xia Wang'



In [17]:

    
from nltk.classify import util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews



In [18]:

    
# Simplified feature extraction: bag-of-words presence features.
def word_feats(words):
    """Map every distinct item of ``words`` to ``True``.

    Args:
        words: any iterable of hashable tokens.

    Returns:
        A dict with one ``token: True`` entry per distinct token;
        duplicates collapse into a single key.
    """
    return dict.fromkeys(words, True)



In [19]:

    
# File ids of the reviews in each sentiment category of the corpus.
negids, posids = (movie_reviews.fileids(label) for label in ('neg', 'pos'))



In [24]:

    
# Build one (features, label) pair per review file: the features are the
# word-presence dict for that file's words, the label is its category.
negfeats = [
    (word_feats(movie_reviews.words(fileids=[fileid])), 'neg')
    for fileid in negids
]
posfeats = [
    (word_feats(movie_reviews.words(fileids=[fileid])), 'pos')
    for fileid in posids
]



In [25]:

    
# Split point at three quarters of each class. Floor division (`//`) keeps
# the cutoff an int on both Python 2 and Python 3; with `/` Python 3 would
# produce a float, which cannot be used as a slice index.
negcutoff = len(negfeats) * 3 // 4
poscutoff = len(posfeats) * 3 // 4



In [26]:

    
# Everything before each class's cutoff is used for training; everything
# from the cutoff onward is held out for testing.
neg_train, neg_test = negfeats[:negcutoff], negfeats[negcutoff:]
pos_train, pos_test = posfeats[:poscutoff], posfeats[poscutoff:]
trainfeats = neg_train + pos_train
testfeats = neg_test + pos_test



In [27]:

    
# Fit the classifier on the training split, then report its accuracy on the
# held-out split and list the most informative features.
nbmdl = NaiveBayesClassifier.train(trainfeats)
# Parenthesised single-argument print works as both the Python 2 statement
# and the Python 3 function, producing the same output.
print('The accuracy is {}'.format(util.accuracy(nbmdl, testfeats)))
nbmdl.show_most_informative_features()









    



The accuracy is 0.728
Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
              astounding = True              pos : neg    =     10.3 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
                 idiotic = True              neg : pos    =      9.8 : 1.0



In [ ]: