In [16]:
'''
NLTK cookbook Naive Bayes Classifier.
'''
__author__ = 'Xia Wang'

In [17]:
from nltk.classify import util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews

In [18]:
# Simplified feature extraction method.
def word_feats(words):
    """Return a dict that maps every item of ``words`` to ``True``.

    Repeated items collapse into a single key, so the result records only
    which items are present, not how many times each one occurs.
    """
    return {token: True for token in words}

In [19]:
# Look up the corpus file ids for each sentiment category, negative first.
negids, posids = (movie_reviews.fileids(category) for category in ('neg', 'pos'))

In [24]:
def _labeled_feats(fileids, label):
    """Build one ``(features, label)`` pair per file id in ``fileids``.

    The features of a file are ``word_feats`` applied to the words that
    ``movie_reviews.words`` yields for that single file.
    """
    return [(word_feats(movie_reviews.words(fileids=[fid])), label)
            for fid in fileids]


# One labeled feature set per review, grouped by sentiment.
negfeats = _labeled_feats(negids, 'neg')
posfeats = _labeled_feats(posids, 'pos')

In [25]:
# Use the first three quarters of each class for training.  Floor division
# keeps the cutoffs integral (they are used as slice indices) under both
# Python 2 and Python 3; plain `/` would produce a float on Python 3 and the
# slicing would raise a TypeError.
negcutoff = len(negfeats) * 3 // 4
poscutoff = len(posfeats) * 3 // 4

In [26]:
# Split each class at its cutoff, then pool the matching halves so that both
# the training and the test set contain negative and positive examples.
neg_train, neg_test = negfeats[:negcutoff], negfeats[negcutoff:]
pos_train, pos_test = posfeats[:poscutoff], posfeats[poscutoff:]
trainfeats = neg_train + pos_train
testfeats = neg_test + pos_test

In [27]:
# Fit the classifier on the training split, then report its accuracy on the
# held-out split and print the features it ranks as most informative.
nbmdl = NaiveBayesClassifier.train(trainfeats)
# A single parenthesised argument prints identically on Python 2 and is also
# valid on Python 3, where the bare `print '...'` statement is a SyntaxError.
print('The accuracy is {}'.format(util.accuracy(nbmdl, testfeats)))
nbmdl.show_most_informative_features()


The accuracy is 0.728
Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
              astounding = True              pos : neg    =     10.3 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
                 idiotic = True              neg : pos    =      9.8 : 1.0

In [ ]: