In [1]:
import re
from os.path import join
from glob import glob
from random import shuffle, seed
from statistics import mode
# import regex
from tqdm.notebook import tqdm
import nltk
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer, PorterStemmer
from nltk.classify import ClassifierI
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
In [2]:
STEMMER = SnowballStemmer('english')
# STEMMER = PorterStemmer()
SEED = 9745
TRAIN_FRACTION = 0.6
Extract features from each document: tokenize, drop English stop words, stem, and keep only word-like tokens.
In [3]:
def tokenize(file_name):
    with open(file_name) as f:
        text = f.read().lower()
    words = nltk.word_tokenize(text)
    # Drop stop words and stem the remaining tokens
    words = [STEMMER.stem(w)
             for w in words if w not in stopwords.words('english')]
    # A word must have 3 or more characters and start with a letter
    words = [w for w in words if len(w) >= 3 and re.match(r'[^\W\d_]', w)]
    return words
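A quick sanity check of the pipeline on a throwaway file (the sentence is a made-up example, not corpus data):
In [ ]:
with open('sample.txt', 'w') as f:
    f.write('The believers were praying faithfully in the temple.')
tokenize('sample.txt')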
In [4]:
def build_corpus(locations):
    corpus = []
    for location, category in locations:
        files = glob(join(location, '*.txt'))
        for file_name in tqdm(files, desc=category):
            corpus.append((tokenize(file_name), category))
    return corpus
In [5]:
def build_frequency_dist(corpus):
    all_words = []
    for words, label in corpus:
        all_words += words
    return FreqDist(all_words)
In [6]:
def document_features(features, document):
    # Binary bag-of-words: map each feature word to whether it
    # occurs in the document's word list
    words = set(document[0])
    return {w: (w in words) for w in features}
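A tiny made-up example shows the shape of a feature dictionary; the second argument is a (words, label) tuple as produced by build_corpus:
In [ ]:
document_features(['god', 'faith', 'scienc'],
                  (['god', 'scienc', 'book'], 'Rel-Yes'))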
In [7]:
seed(SEED)
corpus = build_corpus([('data/Rel-Yes', 'Rel-Yes'),
                       ('data/Rel-No', 'Rel-No')])
shuffle(corpus)
all_words = build_frequency_dist(corpus)
In [8]:
# Use the 3000 most frequent stems as features (FreqDist.keys() is not
# sorted by frequency in NLTK 3, so take most_common() explicitly)
word_features = [w for w, _ in all_words.most_common(3000)]
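It is worth eyeballing the most frequent stems before training:
In [ ]:
all_words.most_common(10)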
In [9]:
feature_sets = [(document_features(word_features, d), d[1]) for d in corpus]
train_test_split = int(len(feature_sets) * TRAIN_FRACTION)
train_set = feature_sets[:train_test_split]
test_set = feature_sets[train_test_split:]
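A quick check of the split sizes:
In [ ]:
print(f'Train set: {len(train_set)}, test set: {len(test_set)}')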
In [10]:
nltk_classifier = nltk.NaiveBayesClassifier.train(train_set)
In [11]:
accuracy = nltk.classify.accuracy(nltk_classifier, test_set)
print(f'NLTK Accuracy: {accuracy:0.2f}')
In [12]:
nltk_classifier.show_most_informative_features(10)
In [13]:
mnb_classifier = SklearnClassifier(MultinomialNB())
mnb_classifier.train(train_set)
accuracy = nltk.classify.accuracy(mnb_classifier, test_set)
print(f'Sklearn MultinomialNB Accuracy: {accuracy:0.2f}')
In [14]:
# GaussianNB is skipped: it requires a dense feature matrix, but
# SklearnClassifier feeds it sparse input, which raises a TypeError.
# gnb_classifier = SklearnClassifier(GaussianNB())
# gnb_classifier.train(train_set)
# accuracy = nltk.classify.accuracy(gnb_classifier, test_set)
# print(f'Sklearn GaussianNB Accuracy: {accuracy:0.2f}')
In [15]:
bnb_classifier = SklearnClassifier(BernoulliNB())
bnb_classifier.train(train_set)
accuracy = nltk.classify.accuracy(bnb_classifier, test_set)
print(f'Sklearn BernoulliNB Accuracy: {accuracy:0.2f}')
In [16]:
lr_classifier = SklearnClassifier(LogisticRegression())
lr_classifier.train(train_set)
accuracy = nltk.classify.accuracy(lr_classifier, test_set)
print(f'Sklearn LogisticRegression Accuracy: {accuracy:0.2f}')
In [17]:
sgd_classifier = SklearnClassifier(SGDClassifier())
sgd_classifier.train(train_set)
accuracy = nltk.classify.accuracy(sgd_classifier, test_set)
print(f'Sklearn SGDClassifier Accuracy: {accuracy:0.2f}')
In [18]:
svc_classifier = SklearnClassifier(SVC())
svc_classifier.train(train_set)
accuracy = nltk.classify.accuracy(svc_classifier, test_set)
print(f'Sklearn SVC Accuracy: {accuracy:0.2f}')
In [19]:
lsvc_classifier = SklearnClassifier(LinearSVC())
lsvc_classifier.train(train_set)
accuracy = nltk.classify.accuracy(lsvc_classifier, test_set)
print(f'Sklearn LinearSVC Accuracy: {accuracy:0.2f}')
In [20]:
nusvc_classifier = SklearnClassifier(NuSVC())
nusvc_classifier.train(train_set)
accuracy = nltk.classify.accuracy(nusvc_classifier, test_set)
print(f'Sklearn NuSVC Accuracy: {accuracy:0.2f}')
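The ClassifierI and mode imports at the top suggest combining these models into a voting ensemble; a minimal sketch follows (the VoteClassifier name and the particular classifiers combined here are assumptions, not from the original):
In [ ]:
class VoteClassifier(ClassifierI):
    """Majority vote over already-trained classifiers (sketch)."""

    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = [c.classify(features) for c in self._classifiers]
        return mode(votes)

    def confidence(self, features):
        votes = [c.classify(features) for c in self._classifiers]
        return votes.count(mode(votes)) / len(votes)

# An odd number of classifiers avoids tied votes on a two-class problem
voted_classifier = VoteClassifier(nltk_classifier, mnb_classifier,
                                  bnb_classifier, lr_classifier,
                                  sgd_classifier, lsvc_classifier,
                                  nusvc_classifier)
accuracy = nltk.classify.accuracy(voted_classifier, test_set)
print(f'Voted Accuracy: {accuracy:0.2f}')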