In [10]:
from sklearn.datasets import fetch_20newsgroups
news=fetch_20newsgroups(subset='all')
print len(news.data)
18846
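The Bunch returned by fetch_20newsgroups also carries the integer labels and the 20 category names, so a quick sanity check is possible before splitting; a minimal sketch in the notebook's Python 2 style:
print len(news.target)         # one integer label per document, same length as news.data
print len(news.target_names)   # the 20 newsgroup category names
print news.target_names[0]     # e.g. 'alt.atheism'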
In [11]:
print news.data[0]
From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu
I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game. PENS RULE!!!
In [12]:
from sklearn.cross_validation import train_test_split
X_train,X_test,y_train,y_test=train_test_split(news.data,news.target,test_size=0.25,random_state=33)
/Users/ifeng/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
"This module will be removed in 0.20.", DeprecationWarning)
In [18]:
from sklearn.feature_extraction.text import CountVectorizer
vec=CountVectorizer()
X_train=vec.fit_transform(X_train)
X_test=vec.transform(X_test)
Note: CountVectorizer.fit_transform expects raw text documents. Because this cell overwrites X_train with the sparse count matrix, re-running it raises AttributeError: lower not found (the analyzer tries to call .lower() on a sparse matrix row instead of a string); re-running from the train_test_split cell restores the raw text and the vectorization completes normally.
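One way to avoid the overwrite pitfall described above is to keep the raw text and the sparse count matrices in separate variables; a sketch assuming X_train and X_test still hold the raw documents from the split, with illustrative names (X_train_counts and X_test_counts are not used elsewhere in this notebook):
from sklearn.feature_extraction.text import CountVectorizer
vec=CountVectorizer()
X_train_counts=vec.fit_transform(X_train)   # learn the vocabulary from the training text only
X_test_counts=vec.transform(X_test)         # map the test text onto that same vocabulary
print X_train_counts.shape                  # (number of training documents, vocabulary size)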
In [16]:
from sklearn.naive_bayes import MultinomialNB
mnb=MultinomialNB()
mnb.fit(X_train,y_train)
y_predict=mnb.predict(X_test)
Note: MultinomialNB.predict expects the same bag-of-words representation that was used for fitting. If X_test still holds the raw documents (a 1-D array of strings) instead of the output of vec.transform, scikit-learn first emits a DeprecationWarning about passing 1-D arrays and then fails with TypeError: Cannot cast array data from dtype('float64') to dtype('<U32'); with the vectorized X_test from the previous cell, fit and predict run without error.
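As a quick check on the predictions, y_predict holds integer class indices that can be compared element-wise with y_test; a small sketch (numpy imported here just for the comparison):
import numpy as np
print y_predict[:10]               # first ten predicted class indices
print y_test[:10]                  # first ten true class indices
print np.mean(y_predict==y_test)   # fraction correct; should match mnb.score in the next cell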
In [17]:
from sklearn.metrics import classification_report
print 'The accuracy of the Naive Bayes classifier is',mnb.score(X_test,y_test)
print classification_report(y_test,y_predict,target_names=news.target_names)
Note: mnb.score calls predict internally, so it fails with the same TypeError whenever X_test has not been transformed by the fitted vectorizer, printing only the label string before the traceback. With the vectorized X_test, score returns the overall accuracy and classification_report prints per-class precision, recall and F1-score for all 20 newsgroup categories.
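The vectorizer and the classifier can also be chained into a single scikit-learn Pipeline, which applies the same bag-of-words transformation to training and test text automatically; a sketch under the same split, with illustrative variable names (raw_train, raw_test, label_train, label_test) so it does not touch the matrices above:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
raw_train,raw_test,label_train,label_test=train_test_split(news.data,news.target,test_size=0.25,random_state=33)
text_clf=Pipeline([('vec',CountVectorizer()),('mnb',MultinomialNB())])
text_clf.fit(raw_train,label_train)          # the pipeline vectorizes the raw text itself
print text_clf.score(raw_test,label_test)    # overall accuracy of the chained model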