In [1]:
%load_ext watermark
%watermark -a 'Vahid Mirjalili' -d -p scikit-learn,numpy,numexpr,pandas,matplotlib,plotly -v
In [2]:
from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np
import scipy
import logging
import numpy as np
from timeit import timeit
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
In [3]:
# Timestamped INFO-level logging for the rest of the notebook.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(message)s',
)

# No category filter is passed to the loader; the parts listed in `remove`
# are handed to fetch_20newsgroups so they are stripped from each post.
categories = None
remove = ('headers', 'footers', 'quotes')

# Fetch the train split first, then the test split, with identical settings.
data_train, data_test = (
    fetch_20newsgroups(
        subset=subset,
        categories=categories,
        shuffle=True,
        random_state=42,
        remove=remove,
    )
    for subset in ('train', 'test')
)

# From here on `categories` holds the label names reported by the training split.
categories = data_train.target_names
print("Categories: %s" % categories)

y_train = data_train.target
y_test = data_test.target
print("Dataset size: Training: %d Testing: %d" % (y_train.shape[0], y_test.shape[0]))
TF-IDF stands for term frequency–inverse document frequency. It is a statistical measure of how important a word is to a document within a corpus, which makes it a useful feature for classification. Term frequency measures how many times a term appears in a particular document, and inverse document frequency is the logarithm of the total number of documents divided by the number of documents that contain the term. Augmented term frequency is used to avoid a bias towards longer documents, and it is defined as below for a term $t$ appearing in a particular document $d$:
$$ TF(t, d) = 0.5 + \frac{0.5 \times freq(t, d)}{\max\{freq(w, d) \ :\ w \in d\}} $$$$IDF(t, D_{train}) = \log \frac{|D_{train}|}{|\{d\in D_{train} \ :\ t \in d\}|}$$and
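$$TFIDF(t, d, D_{train}) = TF(t,d) \times IDF(t, D_{train})$$where $D_{train}=\{set\ of\ all\ training\ documents\}$. It is important to note that TF-IDF depends on the entire set of documents that we are considering (for example the training set), and not just on the term in a single document. Note also that the code below uses `TfidfVectorizer` with `sublinear_tf=True`, which scales the term frequency as $1 + \log(freq(t, d))$ rather than using the augmented form shown above.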
In [4]:
### Fit the TF-IDF vectorizer on the training documents and vectorize them:
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.5,
    sublinear_tf=True,
)
X_train = vectorizer.fit_transform(data_train.data)
print("Number of samples N= %d, Number of features d= %d" % X_train.shape)

### Vectorize the test documents with the already-fitted vectorizer (no re-fitting):
X_test = vectorizer.transform(data_test.data)
print("Number of Test Documents: %d, Number of features: %d" % X_test.shape)
In [5]:
### Train a classifier object and test it on the test set:
def apply_classifier(clf, average='weighted'):
    """Fit ``clf`` on the training data and return its F1 score on the test data.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    average : str, optional
        Averaging mode forwarded to ``metrics.f1_score``. Defaults to
        ``'weighted'``.

    Returns
    -------
    The value returned by ``metrics.f1_score`` for the test predictions.
    """
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    # The labels here span more than two classes, so an explicit averaging mode
    # is required: f1_score's default (average='binary') rejects multiclass targets.
    return metrics.f1_score(y_test, pred, average=average)
In [6]:
scores = {}
%timeit scores["BernoulliNB"] = apply_classifier(BernoulliNB(alpha=.01))
%timeit scores["MultinomialNB"] = apply_classifier(MultinomialNB(alpha=.01))
%timeit scores["SGD-classification"] = apply_classifier(SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet"))
In [7]:
# Display the score stored for each classifier label by the timed runs above.
scores
Out[7]:
In [ ]: