In [1]:
from __future__ import print_function

import numpy as np

from sklearn import __version__ as sklearn_version
print('Sklearn version:', sklearn_version)


Sklearn version: 0.18.1

The data

The 20 newsgroups dataset comprises around 18000 newsgroups posts on 20 topics split in two subsets: one for training (or development) and the other one for testing (or for performance evaluation). The split between the train and test sets is based upon messages posted before and after a specific date.


In [2]:
from sklearn.datasets import fetch_20newsgroups

# Restrict the corpus to four topics to keep training fast
categories = ['alt.atheism', 'soc.religion.christian',
              'comp.graphics', 'sci.med']

# Strip headers/footers/quoted replies so models learn from body text only
twenty_train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
    shuffle=True,
    random_state=42,
)

# Class names come back sorted alphabetically
twenty_train.target_names


No handlers could be found for logger "sklearn.datasets.twenty_newsgroups"
Out[2]:
['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [3]:
# Inspect one training document together with its numeric label
sample_idx = 0
print(twenty_train.data[sample_idx])
print('---------------')
print('Target: ', twenty_train.target[sample_idx])


Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format.  We would also like to
do the same, converting to HPGL (HP plotter) files.

Please email any response.

Is this the correct group?

Thanks in advance.  Michael.
---------------
Target:  1

In [4]:
# Text preprocessing, tokenizing and filtering of stopwords

from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words: drop English stop words, terms in >95% of docs and
# terms seen in fewer than 2 docs; cap the vocabulary at 5000 features.
tf_vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=5000,
    stop_words='english',
)
X_train_counts = tf_vectorizer.fit_transform(twenty_train.data)

# (n_documents, n_features)
X_train_counts.shape


Out[4]:
(2257, 5000)

In [5]:
# Sparse-matrix views: term counts of the first document (row slice)
# and counts of the first vocabulary term across documents (column slice)
first_doc_counts = X_train_counts[0, :]
first_term_counts = X_train_counts[:, 0]
print(first_doc_counts)
print(first_term_counts)


  (0, 2866)	1
  (0, 238)	1
  (0, 4522)	1
  (0, 2058)	1
  (0, 1123)	1
  (0, 3867)	1
  (0, 1543)	1
  (0, 3385)	1
  (0, 2197)	1
  (0, 1094)	1
  (0, 2643)	1
  (0, 1865)	1
  (0, 2237)	1
  (0, 1795)	2
  (0, 4520)	1
  (0, 2251)	1
  (0, 1090)	1
  (0, 4744)	1
  (0, 3276)	1
  (0, 357)	1
  (0, 3273)	1
  (0, 4299)	1
  (0, 4869)	1
  (0, 2014)	1
  (0, 2550)	1
  (0, 1445)	1
  (232, 0)	2
  (272, 0)	1
  (282, 0)	1
  (400, 0)	1
  (433, 0)	2
  (581, 0)	2
  (588, 0)	1
  (766, 0)	1
  (768, 0)	2
  (837, 0)	3
  (844, 0)	1
  (859, 0)	1
  (880, 0)	1
  (1030, 0)	1
  (1056, 0)	6
  (1057, 0)	2
  (1263, 0)	1
  (1475, 0)	1
  (1665, 0)	16
  (1795, 0)	1
  (1802, 0)	1
  (1833, 0)	1
  (1890, 0)	2
  (2069, 0)	1
  (2144, 0)	1

In [6]:
# From occurrences to frequencies
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the tf-idf weighting on the training counts, then apply it
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tf = tfidf_transformer.transform(X_train_counts)

# Same shape as the count matrix; only the values are reweighted
X_train_tf.shape


Out[6]:
(2257, 5000)

In [7]:
# tf-idf weights for the first document and for the first vocabulary term
row_weights = X_train_tf[0, :]
col_weights = X_train_tf[:, 0]
print(row_weights)
print(col_weights)


  (0, 1445)	0.0998496101737
  (0, 2550)	0.0920875619201
  (0, 2014)	0.10905059472
  (0, 4869)	0.112409159775
  (0, 4299)	0.172232378831
  (0, 3273)	0.189497984618
  (0, 357)	0.196147304589
  (0, 3276)	0.239358101611
  (0, 4744)	0.242697172074
  (0, 1090)	0.185367646905
  (0, 2251)	0.281517460204
  (0, 4520)	0.239358101611
  (0, 1795)	0.326673936513
  (0, 2237)	0.217882788689
  (0, 1865)	0.182356290661
  (0, 2643)	0.0944312658437
  (0, 1094)	0.250397930473
  (0, 2197)	0.225991796704
  (0, 3385)	0.272954303671
  (0, 1543)	0.163780615995
  (0, 3867)	0.165608347231
  (0, 1123)	0.157610927262
  (0, 2058)	0.144807482284
  (0, 4522)	0.126533637604
  (0, 238)	0.170069829145
  (0, 2866)	0.190380209723
  (232, 0)	0.162673301572
  (272, 0)	0.0396045882998
  (282, 0)	0.0830143471237
  (400, 0)	0.00527736458963
  (433, 0)	0.00596499373539
  (581, 0)	0.150704657006
  (588, 0)	0.154296833105
  (766, 0)	0.126956846998
  (768, 0)	0.0117078298784
  (837, 0)	0.334685959895
  (844, 0)	0.207167396703
  (859, 0)	0.216506134034
  (880, 0)	0.0133502362916
  (1030, 0)	0.278741714593
  (1056, 0)	0.212612262278
  (1057, 0)	0.139865527738
  (1263, 0)	0.0889107355711
  (1475, 0)	0.275148224162
  (1665, 0)	0.22386057664
  (1795, 0)	0.0911799104224
  (1802, 0)	0.0165319212251
  (1833, 0)	0.11551299395
  (1890, 0)	0.0079616312192
  (2069, 0)	0.10844175005
  (2144, 0)	0.114983457384

First basic model


In [8]:
from sklearn.naive_bayes import MultinomialNB

# Baseline classifier: multinomial naive Bayes on the tf-idf features.
# Define and fit in one line.
clf = MultinomialNB().fit(X_train_tf, twenty_train.target)

In [9]:
# Score test data

from sklearn.metrics import accuracy_score

# Read test data (same cleaning options as the training split)
twenty_test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
    shuffle=True,
    random_state=42,
)

# Apply the transformers fitted on the training data: transform only, no refit
X_test_counts = tf_vectorizer.transform(twenty_test.data)
X_test_tf = tfidf_transformer.transform(X_test_counts)

# Prediction
predicted = clf.predict(X_test_tf)

# Accuracy
print('Accuracy test: ', accuracy_score(twenty_test.target, predicted))


Accuracy test:  0.798934753662

In [10]:
# Score 2 new docs
docs_new = ['God is love', 'OpenGL on the GPU is fast']

# Counts -> tf-idf -> class prediction, reusing the fitted transformers
X_new_counts = tf_vectorizer.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# Map each predicted label index back to its category name
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))


'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics

In [ ]:

Build a pipeline


In [11]:
# Define the pipeline

from sklearn.pipeline import Pipeline

# Chain vectorizer -> tf-idf -> classifier so one object handles raw text
text_clf = Pipeline([
    ('vect', CountVectorizer(max_df=0.95, min_df=2,
                             max_features=5000, stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Fit the whole pipeline in one call
text_clf.fit(twenty_train.data, twenty_train.target)


Out[11]:
Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.95, max_features=5000, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
    ...False,
         use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [12]:
# Evaluate test data
twenty_test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
    shuffle=True,
    random_state=42,
)

predicted = text_clf.predict(twenty_test.data)

# Fraction of correct predictions == accuracy
np.mean(predicted == twenty_test.target)


Out[12]:
0.79893475366178424

Change classifier in the pipeline


In [13]:
from sklearn.linear_model import SGDClassifier

# Linear SVM (hinge loss) trained with stochastic gradient descent.
# NOTE(review): n_iter was deprecated in scikit-learn 0.19 and removed in
# 0.21 (use max_iter/tol there); kept here because this notebook targets 0.18.x.
text_clf = Pipeline([
    ('vect', CountVectorizer(max_df=0.95, min_df=2,
                             max_features=5000, stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, n_iter=5, random_state=42)),
])

# Fit
_ = text_clf.fit(twenty_train.data, twenty_train.target)

# Predict
predicted = text_clf.predict(twenty_test.data)

# Evaluate accuracy
np.mean(predicted == twenty_test.target)


Out[13]:
0.80692410119840208

In [ ]:

Other classifier


In [14]:
from sklearn import svm

# Same pipeline, but with LinearSVC (liblinear-based SVM) as the classifier
text_clf_svm = Pipeline([
    ('vect', CountVectorizer(max_df=0.95, min_df=2,
                             max_features=5000, stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', svm.LinearSVC()),
])

_ = text_clf_svm.fit(twenty_train.data, twenty_train.target)

# Test-set accuracy
predicted = text_clf_svm.predict(twenty_test.data)
np.mean(predicted == twenty_test.target)


Out[14]:
0.80892143808255657

In [ ]:

Optimize a pipeline


In [15]:
from sklearn.model_selection import RandomizedSearchCV

# Define estimator. No parameters of the search.
clf = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2)),
                ('tfidf', TfidfTransformer()),
                ('clf', svm.LinearSVC()),
                ])

# Specify parameters and distributions to sample from.
# Parameters of pipelines can be set using '__' separated parameter names:
param_dist = {"vect__max_features": [1000, 2500, 5000, 7500, 10000, None],
              "vect__stop_words": ['english', None],
              "clf__C": [.1, .5, 1., 1.5, 2.]}

# Define randomized search: sample n_iter_search candidates from the grid.
# random_state makes the sampled candidates (and thus the tuning result)
# reproducible across notebook re-runs.
n_iter_search = 10
random_search = RandomizedSearchCV(clf,
                                   param_distributions=param_dist,
                                   n_iter=n_iter_search,
                                   random_state=42)

# Run the randomized search (fits n_iter_search x cv-folds pipelines)
random_search.fit(twenty_train.data, twenty_train.target)

print("Done!")


Done!

In [16]:
# Load dictionary of search results to a Pandas dataframe
import pandas as pd

df_cv_results = pd.DataFrame.from_dict(random_search.cv_results_)
df_cv_results


Out[16]:
mean_fit_time mean_score_time mean_test_score mean_train_score param_clf__C param_vect__max_features param_vect__stop_words params rank_test_score split0_test_score split0_train_score split1_test_score split1_train_score split2_test_score split2_train_score std_fit_time std_score_time std_test_score std_train_score
0 0.385353 0.161182 0.865751 0.982057 0.5 7500 english {u'clf__C': 0.5, u'vect__max_features': 7500, ... 2 0.881806 0.982048 0.855246 0.984043 0.860186 0.980080 0.007650 0.008654 0.011538 0.001618
1 0.394397 0.162430 0.818343 0.982499 2 2500 english {u'clf__C': 2.0, u'vect__max_features': 2500, ... 9 0.819389 0.983378 0.816733 0.982713 0.818908 0.981408 0.003227 0.009286 0.001156 0.000818
2 0.413017 0.162230 0.865308 0.982499 1 10000 None {u'clf__C': 1.0, u'vect__max_features': 10000,... 3 0.875166 0.982048 0.853918 0.984043 0.866844 0.981408 0.009257 0.008585 0.008746 0.001122
3 0.384515 0.162045 0.828977 0.951043 0.1 None None {u'clf__C': 0.1, u'vect__max_features': None, ... 7 0.843293 0.954122 0.816733 0.951463 0.826897 0.947543 0.007866 0.009316 0.010947 0.002702
4 0.383914 0.163052 0.826318 0.947499 0.1 7500 None {u'clf__C': 0.1, u'vect__max_features': 7500, ... 8 0.837981 0.952128 0.815405 0.947473 0.825566 0.942895 0.008919 0.007929 0.009236 0.003769
5 0.391355 0.161800 0.853345 0.980505 0.5 5000 None {u'clf__C': 0.5, u'vect__max_features': 5000, ... 6 0.861886 0.979388 0.841965 0.982048 0.856192 0.980080 0.007668 0.008330 0.008381 0.001127
6 0.437481 0.162024 0.858219 0.983164 2 None None {u'clf__C': 2.0, u'vect__max_features': None, ... 5 0.864542 0.984043 0.848606 0.984043 0.861518 0.981408 0.003648 0.008307 0.006913 0.001242
7 0.375346 0.163447 0.816128 0.932875 0.1 2500 None {u'clf__C': 0.1, u'vect__max_features': 2500, ... 10 0.833997 0.932846 0.800797 0.931516 0.813582 0.934263 0.008025 0.008995 0.013679 0.001122
8 0.413922 0.159846 0.864865 0.983607 2 10000 english {u'clf__C': 2.0, u'vect__max_features': 10000,... 4 0.873838 0.984707 0.852590 0.984043 0.868176 0.982072 0.007230 0.008104 0.008988 0.001119
9 0.395163 0.159460 0.868852 0.983164 1 10000 english {u'clf__C': 1.0, u'vect__max_features': 10000,... 1 0.880478 0.984043 0.853918 0.984043 0.872170 0.981408 0.008901 0.008428 0.011098 0.001242

In [17]:
# Score & evaluate test data using the best estimator

text_clf_svm = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2, max_features=10000, stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', svm.LinearSVC(C=1.5)),
                    ])

_ = text_clf_svm.fit(twenty_train.data, twenty_train.target)

predicted = text_clf_svm.predict(twenty_test.data)
np.mean(predicted == twenty_test.target)


Out[17]:
0.81424766977363516

In [ ]:

Additional metrics for multiclass classification


In [18]:
from sklearn import metrics

# Per-class precision/recall/F1 plus support counts for the test split
report = metrics.classification_report(twenty_test.target,
                                       predicted,
                                       target_names=twenty_test.target_names)
print(report)


                        precision    recall  f1-score   support

           alt.atheism       0.76      0.61      0.68       319
         comp.graphics       0.82      0.92      0.87       389
               sci.med       0.88      0.85      0.86       396
soc.religion.christian       0.78      0.84      0.81       398

           avg / total       0.81      0.81      0.81      1502


In [19]:
metrics.confusion_matrix(twenty_test.target, predicted)


Out[19]:
array([[196,  22,  24,  77],
       [ 16, 356,  14,   3],
       [ 14,  32, 337,  13],
       [ 32,  22,  10, 334]])