In [1]:
from __future__ import print_function

import numpy as np

from sklearn import __version__ as sklearn_version
print('Sklearn version:', sklearn_version)


Sklearn version: 0.18.1

The data

The 20 newsgroups dataset comprises around 18000 newsgroups posts on 20 topics split in two subsets: one for training (or development) and the other one for testing (or for performance evaluation). The split between the train and test sets is based upon messages posted before and after a specific date.


In [2]:
from sklearn.datasets import fetch_20newsgroups

# Restrict the corpus to four topics to keep training fast
categories = ['alt.atheism', 'soc.religion.christian',
              'comp.graphics', 'sci.med']

# Strip headers/footers/quoted replies so models learn from body text only
twenty_train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
    shuffle=True,
    random_state=42,
)

# Class names come back sorted alphabetically
twenty_train.target_names


No handlers could be found for logger "sklearn.datasets.twenty_newsgroups"
Out[2]:
['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [3]:
# Inspect one training document together with its numeric label
sample_idx = 0
print(twenty_train.data[sample_idx])
print('---------------')
print('Target: ', twenty_train.target[sample_idx])


Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format.  We would also like to
do the same, converting to HPGL (HP plotter) files.

Please email any response.

Is this the correct group?

Thanks in advance.  Michael.
---------------
Target:  1

In [4]:
# Text preprocessing, tokenizing and filtering of stopwords

from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words: drop English stop words, terms in >95% of docs and
# terms seen in fewer than 2 docs; cap the vocabulary at 5000 features.
tf_vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=5000,
    stop_words='english',
)
X_train_counts = tf_vectorizer.fit_transform(twenty_train.data)

# (n_documents, n_features)
X_train_counts.shape


Out[4]:
(2257, 5000)

In [5]:
# Sparse-matrix views: term counts of the first document (row slice)
# and counts of the first vocabulary term across documents (column slice)
first_doc_counts = X_train_counts[0, :]
first_term_counts = X_train_counts[:, 0]
print(first_doc_counts)
print(first_term_counts)


  (0, 2866)	1
  (0, 238)	1
  (0, 4522)	1
  (0, 2058)	1
  (0, 1123)	1
  (0, 3867)	1
  (0, 1543)	1
  (0, 3385)	1
  (0, 2197)	1
  (0, 1094)	1
  (0, 2643)	1
  (0, 1865)	1
  (0, 2237)	1
  (0, 1795)	2
  (0, 4520)	1
  (0, 2251)	1
  (0, 1090)	1
  (0, 4744)	1
  (0, 3276)	1
  (0, 357)	1
  (0, 3273)	1
  (0, 4299)	1
  (0, 4869)	1
  (0, 2014)	1
  (0, 2550)	1
  (0, 1445)	1
  (232, 0)	2
  (272, 0)	1
  (282, 0)	1
  (400, 0)	1
  (433, 0)	2
  (581, 0)	2
  (588, 0)	1
  (766, 0)	1
  (768, 0)	2
  (837, 0)	3
  (844, 0)	1
  (859, 0)	1
  (880, 0)	1
  (1030, 0)	1
  (1056, 0)	6
  (1057, 0)	2
  (1263, 0)	1
  (1475, 0)	1
  (1665, 0)	16
  (1795, 0)	1
  (1802, 0)	1
  (1833, 0)	1
  (1890, 0)	2
  (2069, 0)	1
  (2144, 0)	1

In [6]:
# From occurrences to frequencies
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the tf-idf weighting on the training counts, then apply it
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tf = tfidf_transformer.transform(X_train_counts)

# Same shape as the count matrix; only the values are reweighted
X_train_tf.shape


Out[6]:
(2257, 5000)

In [7]:
# tf-idf weights for the first document and for the first vocabulary term
row_weights = X_train_tf[0, :]
col_weights = X_train_tf[:, 0]
print(row_weights)
print(col_weights)


  (0, 1445)	0.0998496101737
  (0, 2550)	0.0920875619201
  (0, 2014)	0.10905059472
  (0, 4869)	0.112409159775
  (0, 4299)	0.172232378831
  (0, 3273)	0.189497984618
  (0, 357)	0.196147304589
  (0, 3276)	0.239358101611
  (0, 4744)	0.242697172074
  (0, 1090)	0.185367646905
  (0, 2251)	0.281517460204
  (0, 4520)	0.239358101611
  (0, 1795)	0.326673936513
  (0, 2237)	0.217882788689
  (0, 1865)	0.182356290661
  (0, 2643)	0.0944312658437
  (0, 1094)	0.250397930473
  (0, 2197)	0.225991796704
  (0, 3385)	0.272954303671
  (0, 1543)	0.163780615995
  (0, 3867)	0.165608347231
  (0, 1123)	0.157610927262
  (0, 2058)	0.144807482284
  (0, 4522)	0.126533637604
  (0, 238)	0.170069829145
  (0, 2866)	0.190380209723
  (232, 0)	0.162673301572
  (272, 0)	0.0396045882998
  (282, 0)	0.0830143471237
  (400, 0)	0.00527736458963
  (433, 0)	0.00596499373539
  (581, 0)	0.150704657006
  (588, 0)	0.154296833105
  (766, 0)	0.126956846998
  (768, 0)	0.0117078298784
  (837, 0)	0.334685959895
  (844, 0)	0.207167396703
  (859, 0)	0.216506134034
  (880, 0)	0.0133502362916
  (1030, 0)	0.278741714593
  (1056, 0)	0.212612262278
  (1057, 0)	0.139865527738
  (1263, 0)	0.0889107355711
  (1475, 0)	0.275148224162
  (1665, 0)	0.22386057664
  (1795, 0)	0.0911799104224
  (1802, 0)	0.0165319212251
  (1833, 0)	0.11551299395
  (1890, 0)	0.0079616312192
  (2069, 0)	0.10844175005
  (2144, 0)	0.114983457384

First basic model


In [8]:
from sklearn.naive_bayes import MultinomialNB

# Baseline classifier: multinomial naive Bayes on the tf-idf features.
# Define and fit in one line.
clf = MultinomialNB().fit(X_train_tf, twenty_train.target)

In [9]:
# Score test data

from sklearn.metrics import accuracy_score

# Read test data (same cleaning options as the training split)
twenty_test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
    shuffle=True,
    random_state=42,
)

# Apply the transformers fitted on the training data: transform only, no refit
X_test_counts = tf_vectorizer.transform(twenty_test.data)
X_test_tf = tfidf_transformer.transform(X_test_counts)

# Prediction
predicted = clf.predict(X_test_tf)

# Accuracy
print('Accuracy test: ', accuracy_score(twenty_test.target, predicted))


Accuracy test:  0.798934753662

In [10]:
# Score 2 new docs
docs_new = ['God is love', 'OpenGL on the GPU is fast']

# Counts -> tf-idf -> class prediction, reusing the fitted transformers
X_new_counts = tf_vectorizer.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# Map each predicted label index back to its category name
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))


'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics

In [ ]:

Build a pipeline


In [11]:
# Define the pipeline

from sklearn.pipeline import Pipeline

# Chain vectorizer -> tf-idf -> classifier so one object handles raw text
text_clf = Pipeline([
    ('vect', CountVectorizer(max_df=0.95, min_df=2,
                             max_features=5000, stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Fit the whole pipeline in one call
text_clf.fit(twenty_train.data, twenty_train.target)


Out[11]:
Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.95, max_features=5000, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
    ...False,
         use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [12]:
# Evaluate test data
twenty_test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
    shuffle=True,
    random_state=42,
)

predicted = text_clf.predict(twenty_test.data)

# Fraction of correct predictions == accuracy
np.mean(predicted == twenty_test.target)


Out[12]:
0.79893475366178424

Change classifier in the pipeline


In [13]:
from sklearn.linear_model import SGDClassifier

# Linear SVM (hinge loss) trained with stochastic gradient descent.
# NOTE(review): n_iter was deprecated in scikit-learn 0.19 and removed in
# 0.21 (use max_iter/tol there); kept here because this notebook targets 0.18.x.
text_clf = Pipeline([
    ('vect', CountVectorizer(max_df=0.95, min_df=2,
                             max_features=5000, stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, n_iter=5, random_state=42)),
])

# Fit
_ = text_clf.fit(twenty_train.data, twenty_train.target)

# Predict
predicted = text_clf.predict(twenty_test.data)

# Evaluate accuracy
np.mean(predicted == twenty_test.target)


Out[13]:
0.80692410119840208

In [ ]:

Other classifier


In [14]:
from sklearn import svm

# Same pipeline, but with LinearSVC (liblinear-based SVM) as the classifier
text_clf_svm = Pipeline([
    ('vect', CountVectorizer(max_df=0.95, min_df=2,
                             max_features=5000, stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', svm.LinearSVC()),
])

_ = text_clf_svm.fit(twenty_train.data, twenty_train.target)

# Test-set accuracy
predicted = text_clf_svm.predict(twenty_test.data)
np.mean(predicted == twenty_test.target)


Out[14]:
0.80892143808255657

In [ ]:

Optimize a pipeline


In [15]:
from sklearn.model_selection import RandomizedSearchCV

# Define estimator. No parameters of the search.
clf = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2)),
                ('tfidf', TfidfTransformer()),
                ('clf', svm.LinearSVC()),
                ])

# Specify parameters and distributions to sample from.
# Parameters of pipelines can be set using '__' separated parameter names:
param_dist = {"vect__max_features": [1000, 2500, 5000, 7500, 10000, None],
              "vect__stop_words": ['english', None],
              "clf__C": [.1, .5, 1., 1.5, 2.]}

# Define randomized search: sample n_iter_search candidates from the grid.
# random_state makes the sampled candidates (and thus the tuning result)
# reproducible across notebook re-runs.
n_iter_search = 10
random_search = RandomizedSearchCV(clf,
                                   param_distributions=param_dist,
                                   n_iter=n_iter_search,
                                   random_state=42)

# Run the randomized search (fits n_iter_search x cv-folds pipelines)
random_search.fit(twenty_train.data, twenty_train.target)

print("Done!")


Done!

In [16]:
# Load dictionary of search results to a Pandas dataframe
import pandas as pd

df_cv_results = pd.DataFrame.from_dict(random_search.cv_results_)
df_cv_results


Out[16]:
mean_fit_time mean_score_time mean_test_score mean_train_score param_clf__C param_vect__max_features param_vect__stop_words params rank_test_score split0_test_score split0_train_score split1_test_score split1_train_score split2_test_score split2_train_score std_fit_time std_score_time std_test_score std_train_score
0 0.385353 0.161182 0.865751 0.982057 0.5 7500 english {u'clf__C': 0.5, u'vect__max_features': 7500, ... 2 0.881806 0.982048 0.855246 0.984043 0.860186 0.980080 0.007650 0.008654 0.011538 0.001618
1 0.394397 0.162430 0.818343 0.982499 2 2500 english {u'clf__C': 2.0, u'vect__max_features': 2500, ... 9 0.819389 0.983378 0.816733 0.982713 0.818908 0.981408 0.003227 0.009286 0.001156 0.000818
2 0.413017 0.162230 0.865308 0.982499 1 10000 None {u'clf__C': 1.0, u'vect__max_features': 10000,... 3 0.875166 0.982048 0.853918 0.984043 0.866844 0.981408 0.009257 0.008585 0.008746 0.001122
3 0.384515 0.162045 0.828977 0.951043 0.1 None None {u'clf__C': 0.1, u'vect__max_features': None, ... 7 0.843293 0.954122 0.816733 0.951463 0.826897 0.947543 0.007866 0.009316 0.010947 0.002702
4 0.383914 0.163052 0.826318 0.947499 0.1 7500 None {u'clf__C': 0.1, u'vect__max_features': 7500, ... 8 0.837981 0.952128 0.815405 0.947473 0.825566 0.942895 0.008919 0.007929 0.009236 0.003769
5 0.391355 0.161800 0.853345 0.980505 0.5 5000 None {u'clf__C': 0.5, u'vect__max_features': 5000, ... 6 0.861886 0.979388 0.841965 0.982048 0.856192 0.980080 0.007668 0.008330 0.008381 0.001127
6 0.437481 0.162024 0.858219 0.983164 2 None None {u'clf__C': 2.0, u'vect__max_features': None, ... 5 0.864542 0.984043 0.848606 0.984043 0.861518 0.981408 0.003648 0.008307 0.006913 0.001242
7 0.375346 0.163447 0.816128 0.932875 0.1 2500 None {u'clf__C': 0.1, u'vect__max_features': 2500, ... 10 0.833997 0.932846 0.800797 0.931516 0.813582 0.934263 0.008025 0.008995 0.013679 0.001122
8 0.413922 0.159846 0.864865 0.983607 2 10000 english {u'clf__C': 2.0, u'vect__max_features': 10000,... 4 0.873838 0.984707 0.852590 0.984043 0.868176 0.982072 0.007230 0.008104 0.008988 0.001119
9 0.395163 0.159460 0.868852 0.983164 1 10000 english {u'clf__C': 1.0, u'vect__max_features': 10000,... 1 0.880478 0.984043 0.853918 0.984043 0.872170 0.981408 0.008901 0.008428 0.011098 0.001242

In [17]:
# Score & evaluate test data using the best estimator

text_clf_svm = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2, max_features=10000, stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', svm.LinearSVC(C=1.5)),
                    ])

_ = text_clf_svm.fit(twenty_train.data, twenty_train.target)

predicted = text_clf_svm.predict(twenty_test.data)
np.mean(predicted == twenty_test.target)


Out[17]:
0.81424766977363516

In [ ]:

Additional metrics for multiclass classification


In [18]:
from sklearn import metrics

# Per-class precision/recall/F1 plus support counts for the test split
report = metrics.classification_report(twenty_test.target,
                                       predicted,
                                       target_names=twenty_test.target_names)
print(report)


                        precision    recall  f1-score   support

           alt.atheism       0.76      0.61      0.68       319
         comp.graphics       0.82      0.92      0.87       389
               sci.med       0.88      0.85      0.86       396
soc.religion.christian       0.78      0.84      0.81       398

           avg / total       0.81      0.81      0.81      1502


In [19]:
metrics.confusion_matrix(twenty_test.target, predicted)


Out[19]:
array([[196,  22,  24,  77],
       [ 16, 356,  14,   3],
       [ 14,  32, 337,  13],
       [ 32,  22,  10, 334]])