In [1]:
# Newsgroup categories used to restrict the dataset loaded below.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]
In [2]:
from sklearn.datasets import fetch_20newsgroups

# Load the training subset for the selected categories; the fixed
# random_state keeps the shuffled order the same on every run.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)
In [3]:
# Display the target names attached to the loaded training set.
twenty_train.target_names
Out[3]:
In [5]:
# Print the number of loaded documents, then the number of file names.
for loaded in (twenty_train.data, twenty_train.filenames):
    print(len(loaded))
In [24]:
# Short alias for twenty_train.data, used for quick inspection below.
ttdata = twenty_train.data
In [29]:
# Peek at the first two entries of the training data.
ttdata[:2]
Out[29]:
In [6]:
# Show the first three lines of the first document, followed by the
# name of the category it is labelled with.
first_doc_lines = twenty_train.data[0].split("\n")
print("\n".join(first_doc_lines[:3]))

first_label = twenty_train.target[0]
print(twenty_train.target_names[first_label])
In [7]:
# Display the first ten target values (used below as indices into target_names).
twenty_train.target[:10]
Out[7]:
In [8]:
# Translate the first ten target indices into their category names.
for label_idx in twenty_train.target[:10]:
    label_name = twenty_train.target_names[label_idx]
    print(label_name)
In [9]:
from sklearn.feature_extraction.text import CountVectorizer
# Fit the vectorizer on the training documents and transform them in one call.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
# Display the shape of the resulting count matrix.
X_train_counts.shape
Out[9]:
In [10]:
# Look up the entry stored for the token 'algorithm' in the fitted vocabulary.
count_vect.vocabulary_.get(u'algorithm')
Out[10]:
In [11]:
from sklearn.feature_extraction.text import TfidfTransformer

# Term-frequency-only weighting: IDF is switched off for this transformer.
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(X_train_counts)

X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape
Out[11]:
In [12]:
# Default TfidfTransformer this time, fitted and applied to the counts in one call.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Display the shape of the transformed matrix.
X_train_tfidf.shape
Out[12]:
In [13]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the tf-idf features.
clf = MultinomialNB()
clf.fit(X_train_tfidf, twenty_train.target)
In [31]:
# Display the parameters the classifier was configured with.
clf.get_params()
Out[31]:
In [14]:
# Classify two unseen snippets. They must pass through the *already fitted*
# vectorizer and tf-idf transformer (transform only, no re-fitting).
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

for text, label_idx in zip(docs_new, predicted):
    label_name = twenty_train.target_names[label_idx]
    print('%r => %s' % (text, label_name))
In [16]:
from sklearn.pipeline import Pipeline

# Chain vectorizer -> tf-idf -> classifier into a single estimator.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
In [17]:
# Fit the whole pipeline on the training documents and their targets.
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)
In [18]:
import numpy as np

# Load the test subset with the same categories and seed as the training set.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data

predicted = text_clf.predict(docs_test)

# Fraction of test documents whose predicted target equals the true target.
np.mean(predicted == twenty_test.target)
Out[18]:
So the Naive Bayes approach (`MultinomialNB`) returned 83% accuracy.
In [19]:
from sklearn.linear_model import SGDClassifier

# Same vectorizer / tf-idf front end as the Naive Bayes pipeline, but with a
# linear classifier trained by SGD using hinge loss and an L2 penalty.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21, so
    # passing it raises TypeError on current versions. `max_iter=5` together
    # with `tol=None` (no early stopping) keeps the fixed five epochs.
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, max_iter=5, tol=None,
                          random_state=42)),
])

# Assign to `_` so the fitted pipeline's repr is not echoed as cell output.
_ = text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(docs_test)

# Fraction of test documents whose predicted target equals the true target.
np.mean(predicted == twenty_test.target)
Out[19]:
The linear SVM approach (`SGDClassifier` with hinge loss) returned 91% accuracy.
In [21]:
from sklearn import metrics

# Text report built from the true test targets and the latest predictions,
# labelled with the test set's target names.
report = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
)
print(report)
In [22]:
# Confusion matrix computed from the true test targets (first argument)
# and the latest predictions (second argument).
metrics.confusion_matrix(twenty_test.target, predicted)
Out[22]:
In [ ]: