In [4]:
# This is the Jupyter code for Text Classification with Naive Bayers
%pylab inline
from sklearn.datasets import fetch_20newsgroups
news = fetch_20newsgroups(subset='all')
In [8]:
print news.keys()
In [9]:
print news.description
In [11]:
print len(news.data)
print len(news.target_names)
In [12]:
print news.target_names
In [14]:
# have a look for one instance
print news.data[0]
print news.target_names[news.target[0]]
In [15]:
# Split the data into train and test
SPLIT_PREC = 0.75
split_size = int(len(news.data) * SPLIT_PREC)
X_train = news.data[:split_size]
X_test = news.data[split_size:]
y_train = news.target[:split_size]
y_test = news.target[split_size:]
In [20]:
# there are three different classes can transform text into numeric features
# - CountVectorizer
# - HashingVectorizer
# - TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
# 1st classifier with CountVectorizer
clf_1 = Pipeline([
('vect', CountVectorizer()),
('clf', MultinomialNB()),
])
# 2nd classifier with HashingVectorizer
clf_2 = Pipeline([
('vect', HashingVectorizer(non_negative=True)),
('clf', MultinomialNB()),
])
# 3rd classifier with TfidfVectorizer
clf_3 = Pipeline([
('vect', TfidfVectorizer()),
('clf', MultinomialNB()),
])
In [21]:
print clf_1
In [22]:
print clf_2
In [23]:
print clf_3
In [26]:
# define a validation function
from sklearn.cross_validation import cross_val_score, KFold
from scipy.stats import sem
def evaluate_cross_validation(clf, X, y, K):
cv = KFold(len(y), K, shuffle=True, random_state=0)
scores = cross_val_score(clf, X, y, cv = cv)
print scores
print ("Mean score: {0:.3f} (+/-{1:.3f})").format(np.mean(scores), sem(scores))
In [27]:
# run the validation
clfs = [clf_1, clf_2, clf_3]
for clf in clfs:
evaluate_cross_validation(clf, news.data, news.target, 5)
In [28]:
# create another clf with regular expression to limit the words being counted
clf_4 = Pipeline([
('vect', TfidfVectorizer(
token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",)),
('clf', MultinomialNB()),
])
evaluate_cross_validation(clf_4, news.data, news.target, 5)
In [32]:
# Build a stop-word list from a local file to filter common words out of the features.
# TODO: download stopwords_en.txt into the working directory before running this cell.
def get_stop_words():
    """Load stop words from 'stopwords_en.txt' (one word per line).

    Returns:
        set of stripped line strings (an empty line yields '' in the set,
        matching the original behavior).
    """
    # 'with' guarantees the file handle is closed; the original left it open
    with open('stopwords_en.txt', 'r') as stopword_file:
        return {line.strip() for line in stopword_file}
# create the clf
clf_5 = Pipeline([
('vect', TfidfVectorizer(
stop_words=get_stop_words(),
token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",)),
('clf', MultinomialNB()),
])
evaluate_cross_validation(clf_5, news.data, news.target, 5)
In [ ]: