In [1]:
from sklearn.datasets import fetch_20newsgroups

# Work with a four-category subset of the 20 newsgroups corpus.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Training split only; the fixed random_state keeps the shuffled order
# reproducible from run to run.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

# Quick inspection: container type, the first five targets together with the
# target names, and the first two raw documents.
print(type(twenty_train), "\n")
print(twenty_train.target[:5], twenty_train.target_names, "\n")
print(twenty_train.data[:2])
In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training documents and, in the same pass,
# build the document-term count matrix.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(raw_documents=twenty_train.data)

# Last expression of the cell: displayed as (n_documents, n_features).
X_train_counts.shape
Out[2]:
In [3]:
# Bare expression so the notebook displays the repr of the count matrix
# returned by count_vect.fit_transform.
X_train_counts
Out[3]:
In [4]:
# Look up the feature (column) index the fitted vectorizer assigned to the
# token 'algorithmic'; .get returns None if the token is not in the vocabulary.
count_vect.vocabulary_.get(u'algorithmic')
Out[4]:
In [5]:
from sklearn.feature_extraction.text import TfidfTransformer

# use_idf=False switches off the inverse-document-frequency reweighting, so
# this transformer yields term frequencies only.
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(X_train_counts)  # fit() returns the transformer itself

X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape
Out[5]:
In [6]:
# Same counts, this time with the default idf reweighting enabled.
# Fitting and transforming are done as two explicit steps on the same matrix.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)

X_train_tfidf.shape
Out[6]:
In [7]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial naive Bayes classifier on the tf-idf features.
clf = MultinomialNB()
clf.fit(X_train_tfidf, twenty_train.target)

# New documents go through the already-fitted vectorizer and tf-idf
# transformer (transform only, no refit) so they use the training vocabulary.
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# Map each predicted target index back to its newsgroup name.
for text, label_idx in zip(docs_new, predicted):
    label = twenty_train.target_names[label_idx]
    print('{doc} => {category}'.format(doc=text, category=label))
In [8]:
from sklearn.pipeline import Pipeline

# Bundle vectorizer -> tf-idf -> classifier into one estimator so raw text can
# be passed straight to fit/predict. fit() returns the pipeline itself, so the
# construction and the training are chained into a single assignment.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]).fit(twenty_train.data, twenty_train.target)
In [9]:
# numpy is used below but was never imported anywhere in the notebook; without
# this import the cell raises NameError on a fresh kernel.
import numpy as np

# Held-out split, restricted to the same categories as the training data.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data

# Accuracy of the pipeline: fraction of test documents whose predicted target
# equals the true target.
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)
Out[9]:
In [10]:
from sklearn import metrics

# Text report of the main per-class classification metrics for the current
# `predicted` array, labelled with the newsgroup names.
print(
    metrics.classification_report(
        y_true=twenty_test.target,
        y_pred=predicted,
        target_names=twenty_test.target_names,
    )
)
In [11]:
# Confusion matrix for the current predictions: rows are the true classes,
# columns are the predicted classes.
metrics.confusion_matrix(y_true=twenty_test.target, y_pred=predicted)
Out[11]:
In [12]:
# numpy is needed for np.mean below and is not imported elsewhere in the notebook.
import numpy as np
from sklearn.linear_model import SGDClassifier

# Linear SVM (hinge loss, L2 penalty) trained with stochastic gradient descent.
# The old `n_iter=5` argument was removed from SGDClassifier; `max_iter=5`
# together with `tol=None` (no early stopping) keeps the same fixed five passes
# over the training data.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                          max_iter=5, tol=None, random_state=42)),
])
# Assign to `_` so the fitted pipeline's repr is not echoed by the notebook.
_ = text_clf.fit(twenty_train.data, twenty_train.target)

# Accuracy on the held-out documents.
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)
Out[12]:
In [13]:
# numpy is needed for np.mean below and is not imported elsewhere in the notebook.
import numpy as np
from sklearn.svm import SVC

# Same feature extraction as before, with a linear-kernel SVC as the classifier.
text_clf1 = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SVC(kernel='linear', random_state=42)),
])
_ = text_clf1.fit(twenty_train.data, twenty_train.target)

# Score the pipeline that was just fitted. The original called
# text_clf.predict here, which silently re-reported the SGD pipeline's
# accuracy instead of evaluating this model.
predicted = text_clf1.predict(docs_test)
np.mean(predicted == twenty_test.target)
Out[13]:
In [14]:
# numpy is needed for np.mean below and is not imported elsewhere in the notebook.
import numpy as np
from sklearn.svm import SVC

# Same feature extraction, with an RBF-kernel SVC (gamma=0.10, C=10.0).
text_clf2 = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SVC(kernel='rbf', random_state=42, gamma=0.10, C=10.0)),
])
_ = text_clf2.fit(twenty_train.data, twenty_train.target)

# Score the pipeline that was just fitted. The original called
# text_clf.predict here, so the reported accuracy belonged to the SGD
# pipeline rather than to this RBF model.
predicted = text_clf2.predict(docs_test)
np.mean(predicted == twenty_test.target)
Out[14]:
In [21]:
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module has been removed from scikit-learn.
from sklearn.model_selection import GridSearchCV

# Grid over pipeline hyper-parameters, addressed as <step name>__<parameter>:
# unigrams vs. unigrams+bigrams, idf on/off, and two regularisation strengths
# for the classifier step.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

# n_jobs=-1 runs the candidate fits in parallel on all available cores.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)

# The full training set is too large for a quick search, so only the first
# 400 documents are used here.
gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])
In [22]:
# predict() returns an array with one entry per input document. A Python list
# cannot be indexed with an array, so take the single predicted target index
# out with [0] before looking up the newsgroup name.
twenty_train.target_names[gs_clf.predict(['God is love'])[0]]
Out[22]:
In [23]:
# `grid_scores_` has been removed from GridSearchCV. The fitted search object
# exposes the winning parameter combination and its mean cross-validated score
# directly, which is what the original max(...) over grid_scores_ selected.
best_parameters = gs_clf.best_params_
score = gs_clf.best_score_

# Print the chosen value for every parameter in the grid, in name order.
for param_name in sorted(parameters.keys()):
    print("{name}: {best}".format(
        name=param_name, best=best_parameters[param_name]))

# Last expression of the cell: display the best mean cross-validated score.
score
Out[23]: