Naive Bayes classifier


In [1]:
import re
from os.path import join
from glob import glob
from random import shuffle, seed
from statistics import mode

# import regex
from tqdm import tqdm_notebook

import nltk
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer, PorterStemmer
from nltk.classify import ClassifierI
from nltk.classify.scikitlearn import SklearnClassifier

from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [2]:
# Snowball ("Porter2") stemmer for English; the original Porter stemmer is
# kept commented out for comparison.
STEMMER = SnowballStemmer('english')
# STEMMER = PorterStemmer()

SEED = 9745           # fixed RNG seed so the corpus shuffle is reproducible
TRAIN_FRACTION = 0.6  # fraction of the shuffled corpus used for training

# Seed the stdlib RNG now so random.shuffle() later gives the same
# train/test split on every run. (SEED was previously defined but never
# used, making every reported accuracy non-reproducible.)
seed(SEED)

Extract features from the documents.


In [3]:
def tokenize(file_name):
    """Read a text file and return a list of cleaned, stemmed tokens.

    Lower-cases the text, tokenizes it with NLTK, removes English stop
    words, stems every remaining token, and keeps only tokens that are
    at least 3 characters long and start with a letter.
    """
    # Use a context manager so the file handle is closed deterministically
    # (the original leaked the handle returned by open()).
    with open(file_name) as handle:
        text = handle.read().lower()
    words = nltk.word_tokenize(text)

    # Build the stop-word set once: calling stopwords.words('english')
    # inside the filter re-created the whole list for every token.
    stop_words = set(stopwords.words('english'))

    # Get stemmed tokens without stop words
    words = [STEMMER.stem(w) for w in words if w not in stop_words]

    # A word must have 3 or more characters and start with a letter
    # ([^\W\d\_] = word character that is neither a digit nor underscore).
    return [w for w in words if len(w) >= 3 and re.match(r'[^\W\d\_]', w)]

In [4]:
def build_corpus(locations):
    """Tokenize every ``*.txt`` file under each location.

    ``locations`` is an iterable of ``(directory, category)`` pairs.
    Returns a list of ``(tokens, category)`` tuples, one per file, with a
    progress bar per category.
    """
    corpus = []
    for directory, category in locations:
        text_files = glob(join(directory, '*.txt'))
        for path in tqdm_notebook(text_files, desc=category):
            corpus.append((tokenize(path), category))
    return corpus

In [5]:
def build_frequency_dist(corpus):
    """Return a FreqDist over every token in the corpus.

    The category label attached to each document is ignored; only the
    token lists contribute to the counts.
    """
    return FreqDist(word for words, _ in corpus for word in words)

In [6]:
def document_features(features, document):
    """Map each feature word to whether it occurs in the document.

    ``document`` is a ``(words, label)`` pair; only the word list is
    consulted. Returns a ``{feature: bool}`` dict suitable for NLTK
    classifiers.
    """
    document_words = set(document[0])
    presence = {}
    for feature in features:
        presence[feature] = feature in document_words
    return presence

In [7]:
# Build the labeled corpus from both category directories, shuffle it so
# the sequential train/test split below is not biased by directory order,
# then count word frequencies over the whole collection.
corpus = build_corpus([('data/Rel-Yes', 'Rel-Yes'),
                       ('data/Rel-No', 'Rel-No')])
shuffle(corpus)

all_words = build_frequency_dist(corpus)





In [8]:
word_features = list(all_words.keys())[:3000]

In [9]:
# Turn every document into a {feature: present?} dict paired with its
# label, then split the shuffled list into train and test portions.
feature_sets = [(document_features(word_features, document), document[1])
                for document in corpus]

train_test_split = int(len(feature_sets) * TRAIN_FRACTION)

train_set = feature_sets[:train_test_split]
test_set = feature_sets[train_test_split:]

In [10]:
nltk_classifier = nltk.NaiveBayesClassifier.train(train_set)

In [11]:
# Accuracy of the NLTK classifier on the held-out test split.
accuracy = nltk.classify.accuracy(nltk_classifier, test_set)
print(f'NLTK Accuracy: {accuracy:0.2f}')


NLTK Accuracy: 0.67

In [12]:
nltk_classifier.show_most_informative_features(10)


Most Informative Features
              figueiredo = True           Rel-Ye : Rel-No =      8.0 : 1.0
                unattain = True           Rel-Ye : Rel-No =      8.0 : 1.0
                isotherm = True           Rel-Ye : Rel-No =      8.0 : 1.0
               distribut = False          Rel-No : Rel-Ye =      7.3 : 1.0
                    krug = True           Rel-Ye : Rel-No =      6.2 : 1.0
               misrepres = True           Rel-Ye : Rel-No =      6.2 : 1.0
              villarroya = True           Rel-Ye : Rel-No =      6.2 : 1.0
                 quantil = True           Rel-Ye : Rel-No =      5.0 : 1.0
                  inform = False          Rel-No : Rel-Ye =      4.9 : 1.0
             misidentific = True           Rel-Ye : Rel-No =      4.8 : 1.0

In [13]:
# Multinomial naive Bayes via the NLTK -> scikit-learn wrapper.
# SklearnClassifier.train() returns the classifier itself, so
# construction and fitting chain into one expression.
mnb_classifier = SklearnClassifier(MultinomialNB()).train(train_set)

accuracy = nltk.classify.accuracy(mnb_classifier, test_set)
print(f'Sklearn MultinomialNB Accuracy: {accuracy:0.2f}')


Sklearn MultinomialNB Accuracy: 0.65

In [14]:
# NOTE(review): GaussianNB is left disabled. Presumably it fails here
# because the wrapper hands the estimator a sparse feature matrix and
# GaussianNB requires dense input — confirm before re-enabling.
# gnb_classifier = SklearnClassifier(GaussianNB())
# gnb_classifier.train(train_set)

# accuracy = nltk.classify.accuracy(gnb_classifier, test_set)
# print(f'Sklearn GaussianNB Accuracy: {accuracy:0.2f}')

In [15]:
# Bernoulli naive Bayes models per-word presence/absence, which matches
# the boolean feature dicts directly. train() returns self, so chain it.
bnb_classifier = SklearnClassifier(BernoulliNB()).train(train_set)

accuracy = nltk.classify.accuracy(bnb_classifier, test_set)
print(f'Sklearn BernoulliNB Accuracy: {accuracy:0.2f}')


Sklearn BernoulliNB Accuracy: 0.67

In [16]:
# Logistic regression baseline; train() returns the wrapper itself.
lr_classifier = SklearnClassifier(LogisticRegression()).train(train_set)

accuracy = nltk.classify.accuracy(lr_classifier, test_set)
print(f'Sklearn LogisticRegression Accuracy: {accuracy:0.2f}')


Sklearn LogisticRegression Accuracy: 0.71

In [17]:
# SGD-trained linear classifier. SGDClassifier reshuffles the data each
# epoch, so without a fixed random_state the printed accuracy changes on
# every run — pin it to SEED for reproducibility.
sgd_classifier = SklearnClassifier(SGDClassifier(random_state=SEED))
sgd_classifier.train(train_set)

accuracy = nltk.classify.accuracy(sgd_classifier, test_set)
print(f'Sklearn SGDClassifier Accuracy: {accuracy:0.2f}')


Sklearn SGDClassifier Accuracy: 0.73

In [18]:
# Kernel SVM with default parameters; train() returns the wrapper itself.
svc_classifier = SklearnClassifier(SVC()).train(train_set)

accuracy = nltk.classify.accuracy(svc_classifier, test_set)
print(f'Sklearn SVC Accuracy: {accuracy:0.2f}')


Sklearn SVC Accuracy: 0.67

In [19]:
# Linear-kernel SVM (liblinear backend); train() returns the wrapper itself.
lsvc_classifier = SklearnClassifier(LinearSVC()).train(train_set)

accuracy = nltk.classify.accuracy(lsvc_classifier, test_set)
print(f'Sklearn LinearSVC Accuracy: {accuracy:0.2f}')


Sklearn LinearSVC Accuracy: 0.71

In [20]:
# Nu-parameterised SVM variant; train() returns the wrapper itself.
nusvc_classifier = SklearnClassifier(NuSVC()).train(train_set)

accuracy = nltk.classify.accuracy(nusvc_classifier, test_set)
print(f'Sklearn NuSVC Accuracy: {accuracy:0.2f}')


Sklearn NuSVC Accuracy: 0.71

In [ ]: