notebook.community

Edit and run



In [3]:

    
% matplotlib nbagg
import itertools
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from scipy.spatial.distance import euclidean
from sklearn import datasets
import random
random.seed(3222)
np.random.seed(3222)









    



/usr/local/lib/python2.7/dist-packages/IPython/kernel/__init__.py:13: ShimWarning: The `IPython.kernel` package has been deprecated. You should import from ipykernel or jupyter_client instead.
  "You should import from ipykernel or jupyter_client instead.", ShimWarning)



In [4]:

    
# Restrict the corpus to four newsgroups.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Training split of the 20 newsgroups data, shuffled.
twenty_train = datasets.fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
)



In [5]:

    
# List the fields exposed by the loaded dataset object.
twenty_train.keys()









    Out[5]:





['description', 'DESCR', 'filenames', 'target_names', 'data', 'target']



In [7]:

    
from sklearn.feature_extraction.text import CountVectorizer



In [8]:

    
# Fit the vectorizer on the training documents and build the count matrix.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
# Display the matrix shape as the cell output.
X_train_counts.shape









    Out[8]:





(2257, 35788)



In [15]:

    
# Look up the entry stored for the token 'algorithm' in the fitted vocabulary
# (.get returns None when the token is absent).
count_vect.vocabulary_.get(u'algorithm')









    Out[15]:





4690



In [16]:

    
from sklearn.feature_extraction.text import TfidfTransformer

# Term-frequency scaling only: IDF weighting is switched off here.
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

# Display the shape of the transformed matrix as the cell output.
X_train_tf.shape









    Out[16]:





(2257, 35788)



In [17]:

    
# Same transformer with default settings, fitted and applied in one call.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Display the shape of the transformed matrix as the cell output.
X_train_tfidf.shape









    Out[17]:





(2257, 35788)



In [19]:

    
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes trained on the tf-idf features.
clf = MultinomialNB()
# Trailing ';' keeps the estimator repr out of the cell output.
clf.fit(X_train_tfidf, twenty_train.target);



In [20]:

    
docs_new = ['God is love', 'OpenGL on the GPU is fast']

# Reuse the already-fitted vectorizer and transformer: transform only, no refit.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# Map each predicted class index back to its newsgroup name.
for doc, category in zip(docs_new, predicted):
    label = twenty_train.target_names[category]
    print('%r => %s' % (doc, label))









    



'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics



In [21]:

    
from sklearn.pipeline import Pipeline

# Chain vectorizer -> tf-idf -> naive Bayes into a single estimator.
nb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(nb_steps)



In [22]:

    
# Fit the whole pipeline on the raw training documents and their labels.
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)



In [24]:

    
# Held-out test split for the same four categories.
twenty_test = datasets.fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)

# Accuracy: fraction of test documents whose predicted label matches the target.
(predicted == twenty_test.target).mean()









    Out[24]:





0.83488681757656458



In [25]:

    
from sklearn.linear_model import SGDClassifier

# Linear SVM (hinge loss, L2 penalty) trained with stochastic gradient descent.
# Newer scikit-learn releases replaced ``n_iter`` with ``max_iter``/``tol`` and
# later removed ``n_iter`` entirely, so try the current spelling first and fall
# back to the legacy keyword on installations that predate it. ``tol=None``
# disables early stopping so exactly five passes are run, as before.
sgd_params = dict(loss='hinge', penalty='l2', alpha=1e-3, random_state=42)
try:
    sgd_clf = SGDClassifier(max_iter=5, tol=None, **sgd_params)
except TypeError:
    sgd_clf = SGDClassifier(n_iter=5, **sgd_params)

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', sgd_clf),
])
_ = text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(docs_test)

# Accuracy on the held-out test documents.
np.mean(predicted == twenty_test.target)









    Out[25]:





0.9127829560585885



In [26]:

    
from sklearn import metrics

# Per-class precision / recall / F1 for the most recent predictions.
report = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
)
print(report)









    



                        precision    recall  f1-score   support

           alt.atheism       0.95      0.81      0.87       319
         comp.graphics       0.88      0.97      0.92       389
               sci.med       0.94      0.90      0.92       396
soc.religion.christian       0.90      0.95      0.93       398

           avg / total       0.92      0.91      0.91      1502



In [27]:

    
# Confusion matrix for the same predictions; the true labels are passed first,
# the predicted labels second.
metrics.confusion_matrix(twenty_test.target, predicted)









    Out[27]:





array([[258,  11,  15,  35],
       [  4, 379,   3,   3],
       [  5,  33, 355,   3],
       [  5,  10,   4, 379]])



In [28]:

    
# GridSearchCV moved to sklearn.model_selection (scikit-learn 0.18) and the old
# sklearn.grid_search module was removed afterwards. Prefer the current
# location and fall back to the legacy one on older installations.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

# Search space; keys use the '<pipeline step>__<parameter>' convention.
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
}



In [29]:

    
# Wrap the current pipeline in a grid search over `parameters`;
# n_jobs=-1 requests parallel execution on all available cores.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)



In [30]:

    
# Run the search on only the first 400 training documents and labels.
gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])



In [31]:

    
# predict() returns an array with one label per input document. Take its single
# element so the list is indexed with a plain integer; indexing a list with an
# array is deprecated and raises an error on current NumPy.
twenty_train.target_names[gs_clf.predict(['God is love'])[0]]









    



/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py:1: DeprecationWarning: converting an array with ndim > 0 to an index will result in an error in the future
  if __name__ == '__main__':






    Out[31]:





'soc.religion.christian'



In [32]:

    
# Read the winning configuration from best_params_ / best_score_. These
# attributes exist in both old and current scikit-learn, whereas grid_scores_
# was removed from newer releases.
best_parameters = gs_clf.best_params_
score = gs_clf.best_score_
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

# Best mean cross-validated score, shown as the cell output.
score









    



clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)






    Out[32]:





0.90000000000000002



In [44]:

    
from sklearn.neighbors import KNeighborsClassifier

# Same feature extraction as before, with a k-nearest-neighbours classifier.
knn_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', KNeighborsClassifier()),
]
text_clf = Pipeline(knn_steps)
_ = text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(docs_test)

# Accuracy on the held-out test documents.
(predicted == twenty_test.target).mean()









    Out[44]:





0.76564580559254325



In [41]:

    
# Search space for the kNN pipeline; keys use '<pipeline step>__<parameter>'.
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__n_neighbors': (5, 6, 7, 8, 9),
}
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(twenty_train.data, twenty_train.target)

# Read the winning configuration from best_params_ / best_score_. These
# attributes exist in both old and current scikit-learn, whereas grid_scores_
# was removed from newer releases.
best_parameters = gs_clf.best_params_
score = gs_clf.best_score_
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

# Best mean cross-validated score, shown as the cell output.
score









    



clf__n_neighbors: 5
tfidf__use_idf: True
vect__ngram_range: (1, 1)






    Out[41]:





0.88214443952148869



In [ ]: