In [16]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
In [17]:
from sklearn.datasets import fetch_20newsgroups
In [18]:
# Fetch the 20 newsgroups dataset, requesting the 'all' subset rather than a
# single portion. The returned object is used below via .data, .target and
# .target_names.
news = fetch_20newsgroups(subset='all')
In [19]:
# Show the dataset's description as the cell output. The original code read
# `news.description`; newer scikit-learn releases appear to expose the text
# only as `DESCR`, so fall back to that instead of raising AttributeError.
# NOTE(review): confirm the attribute names against the installed scikit-learn.
getattr(news, 'description', getattr(news, 'DESCR', None))
Out[19]:
In [20]:
# Number of entries in the dataset. Written as a single-argument print(...) call
# so the cell produces the same output on Python 2 and Python 3.
print(len(news.data))
In [21]:
# Show the first entry of news.data to get a feel for the input.
# print(...) with one argument behaves identically on Python 2 and Python 3.
print(news.data[0])
In [22]:
# Show the target labels that accompany news.data.
# print(...) with one argument behaves identically on Python 2 and Python 3.
print(news.target)
In [23]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`. Prefer the
# current location and fall back to the legacy module on old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
In [24]:
# Hold out 25% of the documents for evaluation; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    news.data,
    news.target,
    test_size=0.25,
    random_state=33,
)
In [26]:
# Convert the text into feature vectors, then use the naive Bayes model to estimate its parameters from the training data
from sklearn.feature_extraction.text import CountVectorizer
In [27]:
# Create a CountVectorizer with its default settings; it is used below to turn
# the raw text documents into feature vectors.
vec = CountVectorizer()
In [28]:
# Fit the vectorizer on the training text and convert it to feature vectors.
# NOTE: X_train is rebound from raw text to the vectorized result, so this cell
# is not safely re-runnable on its own -- re-run the train/test split first,
# otherwise the already-vectorized data would be fed back into fit_transform.
X_train = vec.fit_transform(X_train)
In [29]:
# Convert the test text with the already-fitted vectorizer (transform only, no
# refit). Like X_train above, X_test is rebound from raw text to the vectorized
# result, so re-running this cell alone would not start from raw text.
X_test = vec.transform(X_test)
In [31]:
# Import the naive Bayes model
from sklearn.naive_bayes import MultinomialNB
In [32]:
# Instantiate a multinomial naive Bayes classifier with its default parameters.
mnb = MultinomialNB()
In [33]:
# Bare expression: shows the (not yet fitted) estimator's repr as the cell output.
mnb
Out[33]:
In [34]:
# Fit the classifier on the vectorized training documents and their labels.
mnb.fit(X_train, Y_train)
Out[34]:
In [35]:
# Predict a label for every vectorized test document; the predictions feed the
# classification report printed later in the notebook.
y_predict = mnb.predict(X_test)
In [36]:
from sklearn.metrics import classification_report
In [37]:
# Score the fitted classifier on the held-out test set and report it.
# The message and value are formatted into one string so the single-argument
# print(...) call emits the same line on Python 2 and Python 3 (a two-item
# print(...) would display a tuple on Python 2).
accuracy = mnb.score(X_test, Y_test)
print('The Accuracy of NavieBayesClassifier is: {}'.format(accuracy))
In [38]:
# Print the classification report comparing the true test labels with the
# predictions, labelling each class with its newsgroup name.
# print(...) with one argument behaves identically on Python 2 and Python 3.
print(classification_report(Y_test, y_predict, target_names=news.target_names))
In [ ]: