In [10]:
from sklearn.datasets import fetch_20newsgroups
news=fetch_20newsgroups(subset='all')
print len(news.data)


18846

In [11]:
print news.data[0]


From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!



In [12]:
from sklearn.cross_validation import train_test_split
X_train,X_test,y_train,y_test=train_test_split(news.data,news.target,test_size=0.25,random_state=33)


/Users/ifeng/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
vec=CountVectorizer()
X_train=vec.fit_transform(X_train)
x_test=vec.transform(X_test)


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-18-e7d5fe3ba214> in <module>()
      1 from sklearn.feature_extraction.text import CountVectorizer
      2 vec=CountVectorizer()
----> 3 X_train=vec.fit_transform(X_train)
      4 x_test=vec.transform(X_test)

/Users/ifeng/anaconda2/lib/python2.7/site-packages/sklearn/feature_extraction/text.pyc in fit_transform(self, raw_documents, y)
    837 
    838         vocabulary, X = self._count_vocab(raw_documents,
--> 839                                           self.fixed_vocabulary_)
    840 
    841         if self.binary:

/Users/ifeng/anaconda2/lib/python2.7/site-packages/sklearn/feature_extraction/text.pyc in _count_vocab(self, raw_documents, fixed_vocab)
    760         for doc in raw_documents:
    761             feature_counter = {}
--> 762             for feature in analyze(doc):
    763                 try:
    764                     feature_idx = vocabulary[feature]

/Users/ifeng/anaconda2/lib/python2.7/site-packages/sklearn/feature_extraction/text.pyc in <lambda>(doc)
    239 
    240             return lambda doc: self._word_ngrams(
--> 241                 tokenize(preprocess(self.decode(doc))), stop_words)
    242 
    243         else:

/Users/ifeng/anaconda2/lib/python2.7/site-packages/sklearn/feature_extraction/text.pyc in <lambda>(x)
    205 
    206         if self.lowercase:
--> 207             return lambda x: strip_accents(x.lower())
    208         else:
    209             return strip_accents

/Users/ifeng/anaconda2/lib/python2.7/site-packages/scipy/sparse/base.pyc in __getattr__(self, attr)
    557             return self.getnnz()
    558         else:
--> 559             raise AttributeError(attr + " not found")
    560 
    561     def transpose(self, axes=None, copy=False):

AttributeError: lower not found

In [16]:
from sklearn.naive_bayes import MultinomialNB
mnb=MultinomialNB()
mnb.fit(X_train,y_train)
y_predict=mnb.predict(X_test)


/Users/ifeng/anaconda2/lib/python2.7/site-packages/sklearn/utils/validation.py:395: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-16-c95ed59eded8> in <module>()
      2 mnb=MultinomialNB()
      3 mnb.fit(X_train,y_train)
----> 4 y_predict=mnb.predict(X_test)

/Users/ifeng/anaconda2/lib/python2.7/site-packages/sklearn/naive_bayes.pyc in predict(self, X)
     63             Predicted target values for X
     64         """
---> 65         jll = self._joint_log_likelihood(X)
     66         return self.classes_[np.argmax(jll, axis=1)]
     67 

/Users/ifeng/anaconda2/lib/python2.7/site-packages/sklearn/naive_bayes.pyc in _joint_log_likelihood(self, X)
    705 
    706         X = check_array(X, accept_sparse='csr')
--> 707         return (safe_sparse_dot(X, self.feature_log_prob_.T) +
    708                 self.class_log_prior_)
    709 

/Users/ifeng/anaconda2/lib/python2.7/site-packages/sklearn/utils/extmath.pyc in safe_sparse_dot(a, b, dense_output)
    187         return ret
    188     else:
--> 189         return fast_dot(a, b)
    190 
    191 

TypeError: Cannot cast array data from dtype('float64') to dtype('<U32') according to the rule 'safe'

In [17]:
from sklearn.metrics import classification_report
print 'accuracy of Naive Bayes Classifier is ',mnb.score(X_test,y_test)
print Classification_report(y_test,y_predict,target_names=news.target_names)


accuracy of Naive Bayes Classifier is 
/Users/ifeng/anaconda2/lib/python2.7/site-packages/sklearn/utils/validation.py:395: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-17-cd97cfb22c14> in <module>()
      1 from sklearn.metrics import classification_report
----> 2 print 'accuracy of Naive Bayes Classifier is ',mnb.score(X_test,y_test)
      3 print Classification_report(y_test,y_predict,target_names=news.target_names)

/Users/ifeng/anaconda2/lib/python2.7/site-packages/sklearn/base.pyc in score(self, X, y, sample_weight)
    347         """
    348         from .metrics import accuracy_score
--> 349         return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
    350 
    351 

/Users/ifeng/anaconda2/lib/python2.7/site-packages/sklearn/naive_bayes.pyc in predict(self, X)
     63             Predicted target values for X
     64         """
---> 65         jll = self._joint_log_likelihood(X)
     66         return self.classes_[np.argmax(jll, axis=1)]
     67 

/Users/ifeng/anaconda2/lib/python2.7/site-packages/sklearn/naive_bayes.pyc in _joint_log_likelihood(self, X)
    705 
    706         X = check_array(X, accept_sparse='csr')
--> 707         return (safe_sparse_dot(X, self.feature_log_prob_.T) +
    708                 self.class_log_prior_)
    709 

/Users/ifeng/anaconda2/lib/python2.7/site-packages/sklearn/utils/extmath.pyc in safe_sparse_dot(a, b, dense_output)
    187         return ret
    188     else:
--> 189         return fast_dot(a, b)
    190 
    191 

TypeError: Cannot cast array data from dtype('float64') to dtype('<U32') according to the rule 'safe'