In [1]:
from sklearn.datasets import fetch_20newsgroups

# The four newsgroups used throughout this notebook.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]


def fetch_subset(subset):
    """Fetch one split of the 20 newsgroups data, limited to ``categories``.

    Parameters
    ----------
    subset : str
        Name of the split, passed straight through as ``subset=`` to
        ``fetch_20newsgroups`` (this notebook uses 'train' and 'test').

    Returns
    -------
    The object returned by ``fetch_20newsgroups``; later cells index it with
    'data', 'target' and 'target_names'.
    """
    # Headers, footers and quoted replies are removed from each post, and the
    # shuffle is seeded so the document order is the same on every run.
    return fetch_20newsgroups(
        subset=subset, categories=categories,
        shuffle=True, random_state=42,
        remove=('headers', 'footers', 'quotes'))


train = fetch_subset('train')
test = fetch_subset('test')

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features feeding a cross-validated logistic regression.
# `vec` and `clf` are kept as separate names because the cells below hand
# them to eli5 individually rather than passing `pipeline`.
vec = TfidfVectorizer()
clf = LogisticRegressionCV()
pipeline = Pipeline(steps=[
    ('vec', vec),
    ('clf', clf),
])

# Fit on the training split; the fitted pipeline is the cell's displayed value.
pipeline.fit(train['data'], y=train['target'])


Out[2]:
Pipeline(steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
   ...2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0))])

In [12]:
# eli5 entry points used by the cells below: the explanation builders
# (explain_weights / explain_prediction) and the formatters that render them.
# NOTE(review): this cell carries execution count 12 although it precedes
# cell 4; it has to run before any cell that uses these names.
import eli5
from eli5 import explain_weights, explain_prediction
from eli5.formatters import format_as_html, format_as_text, format_html_styles, fields

# Plain-text rendering of the classifier weights, left disabled. In the
# visible cells this is the only place format_as_text and explain_weights
# are referenced.
# print(format_as_text(explain_weights(clf, vec, target_names=train['target_names'])))

In [4]:
# `display` and `HTML` are part of the public `IPython.display` API; importing
# them from `IPython.core.display` is a deprecated path.
from IPython.display import display, HTML


def show_html(html):
    """Render the HTML string ``html`` as rich output in the notebook."""
    display(HTML(html))


def show_html_expl(expl, **kwargs):
    """Format an eli5 explanation as HTML and display it.

    Parameters
    ----------
    expl
        Explanation object, e.g. the result of ``explain_prediction``.
    **kwargs
        Forwarded unchanged to ``format_as_html`` (the cells below pass
        ``force_weights`` and ``horizontal_layout``).

    The explanation is formatted with ``include_styles=False``; the styles
    are displayed separately, once, by the ``format_html_styles()`` call below.
    """
    show_html(format_as_html(expl, include_styles=False, **kwargs))


# Display the eli5 styles once, since show_html_expl leaves them out of
# every individual explanation.
show_html(format_html_styles())



In [5]:
# Top weighted features of the fitted classifier for each target class,
# with the horizontal (side-by-side) layout switched off.
eli5.show_weights(
    clf,
    vec=vec,
    target_names=train['target_names'],
    horizontal_layout=False,
)


Out[5]:

y=alt.atheism top features

Weight? Feature
+18.117 atheism
+16.558 atheists
+14.393 religion
+14.380 bobby
+14.325 matthew
+13.389 motto
+13.215 atheist
+13.010 islam
+12.800 nanci
+12.216 enviroleague
+12.109 loans
+11.672 satan
+11.488 posting
+11.173 enlightening
+11.108 natural
… 6382 more positive …
… 20478 more negative …
-11.259 fake
-11.526 order
-12.169 christian
-12.253 hudson
-18.551 space

y=comp.graphics top features

Weight? Feature
+25.897 graphics
+18.957 image
+17.298 computer
+16.843 3d
+16.190 file
+14.020 points
+13.269 sgi
+13.180 42
+12.428 hi
+11.835 3do
+11.175 animation
+11.146 using
+10.877 code
+10.792 package
+10.681 video
+10.585 screen
+10.571 sphere
+10.570 68070
+10.553 files
… 7893 more positive …
… 18967 more negative …
-18.127 space

y=sci.space top features

Weight? Feature
+35.983 space
+17.907 orbit
+15.269 nasa
+15.173 launch
+13.235 spacecraft
+12.872 mars
+12.369 nick
+12.117 moon
+12.064 allen
+11.800 shuttle
+11.799 dc
+10.934 sci
+10.726 solar
+10.716 earth
… 10083 more positive …
… 16777 more negative …
-10.976 file
-11.109 wrong
-11.886 image
-12.109 religion
-13.500 god
-18.002 graphics

y=talk.religion.misc top features

Weight? Feature
+19.215 christian
+16.667 blood
+14.907 fbi
+14.185 christians
+12.783 hudson
+12.746 order
+12.338 christ
+12.126 ekr
+11.972 terrorist
+11.608 koresh
+11.549 dead
+11.185 cult
… 6600 more positive …
… 20260 more negative …
-11.206 anyone
-11.567 could
-11.699 get
-12.212 thanks
-12.230 edu
-12.319 it
-13.026 atheists
-17.289 space

In [6]:
# Explain the prediction for test document 2 without forcing the weight
# tables to be shown (force_weights=False), using the horizontal layout.
show_html_expl(
    explain_prediction(
        clf,
        test['data'][2],
        vec=vec,
        target_names=train['target_names'],
    ),
    horizontal_layout=True,
    force_weights=False,
)


Explained as: linear model

y=alt.atheism (probability 0.000, score -16.171) top features

Contribution? Feature
-1.394 <BIAS>
-14.777 Highlighted in text (sum)

hi there, i am here looking for some help. my friend is a interior decor designer. he is from thailand. he is trying to find some graphics software on pc. any suggestion on which software to buy,where to buy and how much it costs ? he likes the most sophisticated software(the more features it has,the better)

y=comp.graphics (probability 0.999, score 8.616) top features

Contribution? Feature
+9.631 Highlighted in text (sum)
-1.015 <BIAS>

hi there, i am here looking for some help. my friend is a interior decor designer. he is from thailand. he is trying to find some graphics software on pc. any suggestion on which software to buy,where to buy and how much it costs ? he likes the most sophisticated software(the more features it has,the better)

y=sci.space (probability 0.001, score -6.824) top features

Contribution? Feature
-1.016 <BIAS>
-5.808 Highlighted in text (sum)

hi there, i am here looking for some help. my friend is a interior decor designer. he is from thailand. he is trying to find some graphics software on pc. any suggestion on which software to buy,where to buy and how much it costs ? he likes the most sophisticated software(the more features it has,the better)

y=talk.religion.misc (probability 0.000, score -11.885) top features

Contribution? Feature
-1.019 <BIAS>
-10.865 Highlighted in text (sum)

hi there, i am here looking for some help. my friend is a interior decor designer. he is from thailand. he is trying to find some graphics software on pc. any suggestion on which software to buy,where to buy and how much it costs ? he likes the most sophisticated software(the more features it has,the better)

`horizontal_layout=True` is supported for prediction explanations too (as in the `explain_prediction` call above), and shows just the top prediction highlighting.


In [7]:
# Same document as above, this time with the weight tables forced on
# (force_weights=True).
show_html_expl(
    explain_prediction(
        clf,
        test['data'][2],
        vec=vec,
        target_names=train['target_names'],
    ),
    force_weights=True,
)


Explained as: linear model

y=alt.atheism (probability 0.000, score -16.171) top features y=comp.graphics (probability 0.999, score 8.616) top features y=sci.space (probability 0.001, score -6.824) top features y=talk.religion.misc (probability 0.000, score -11.885) top features
Contribution? Feature
+0.889 some
+0.539 much
+0.278 is
+0.266 which
+0.225 designer
+0.218 it
+0.161 most
+0.107 trying
-0.005 interior
-0.009 has
-0.053 likes
-0.058 sophisticated
-0.060 any
-0.111 my
-0.171 find
-0.216 there
-0.244 for
-0.276 more
-0.317 suggestion
-0.371 here
-0.385 to
-0.390 and
-0.397 better
-0.400 he
-0.407 how
-0.432 am
-0.462 from
-0.509 features
-0.606 where
-0.615 pc
-0.649 hi
-0.664 on
-0.698 the
-0.755 friend
-0.800 help
-0.854 costs
-0.892 looking
-0.931 buy
-1.269 graphics
-1.394 <BIAS>
-3.451 software
Contribution? Feature
+3.120 graphics
+2.661 software
+1.707 hi
+1.180 looking
+1.127 buy
+0.906 features
+0.850 pc
+0.672 help
+0.530 any
+0.520 it
+0.474 on
+0.459 find
+0.347 am
+0.346 where
+0.314 has
+0.311 there
+0.233 for
+0.225 which
+0.078 from
+0.046 friend
+0.034 trying
+0.028 interior
+0.005 costs
-0.018 likes
-0.019 better
-0.026 my
-0.058 here
-0.103 designer
-0.109 sophisticated
-0.155 and
-0.172 is
-0.217 some
-0.244 how
-0.251 to
-0.254 most
-0.370 more
-0.534 much
-0.760 suggestion
-0.863 the
-1.015 <BIAS>
-2.388 he
Contribution? Feature
+0.870 costs
+0.637 buy
+0.606 software
+0.500 most
+0.393 the
+0.287 on
+0.281 some
+0.259 better
+0.249 likes
+0.241 sophisticated
+0.213 more
+0.183 much
+0.176 friend
+0.122 from
+0.118 there
+0.049 here
+0.028 and
+0.018 to
+0.016 how
-0.013 interior
-0.041 designer
-0.051 where
-0.090 it
-0.094 has
-0.203 suggestion
-0.234 for
-0.295 help
-0.316 am
-0.327 trying
-0.356 any
-0.418 which
-0.506 pc
-0.519 find
-0.529 my
-0.567 looking
-0.654 features
-0.961 hi
-1.016 <BIAS>
-1.140 is
-1.568 he
-2.169 graphics
Contribution? Feature
+2.181 he
+0.528 my
+0.481 more
+0.345 and
+0.313 friend
+0.287 suggestion
+0.228 trying
+0.137 find
+0.110 here
+0.104 from
-0.001 interior
-0.028 designer
-0.032 likes
-0.044 how
-0.052 am
-0.057 help
-0.139 sophisticated
-0.140 better
-0.202 to
-0.204 the
-0.218 where
-0.231 which
-0.236 features
-0.278 buy
-0.394 there
-0.399 costs
-0.473 pc
-0.500 much
-0.514 looking
-0.556 for
-0.621 is
-0.692 any
-0.699 has
-0.732 most
-1.006 on
-1.019 <BIAS>
-1.098 hi
-1.212 some
-1.218 it
-1.295 graphics
-2.308 software

y=alt.atheism (probability 0.000, score -16.171) top features

Contribution? Feature
-1.394 <BIAS>
-14.777 Highlighted in text (sum)

hi there, i am here looking for some help. my friend is a interior decor designer. he is from thailand. he is trying to find some graphics software on pc. any suggestion on which software to buy,where to buy and how much it costs ? he likes the most sophisticated software(the more features it has,the better)

y=comp.graphics (probability 0.999, score 8.616) top features

Contribution? Feature
+9.631 Highlighted in text (sum)
-1.015 <BIAS>

hi there, i am here looking for some help. my friend is a interior decor designer. he is from thailand. he is trying to find some graphics software on pc. any suggestion on which software to buy,where to buy and how much it costs ? he likes the most sophisticated software(the more features it has,the better)

y=sci.space (probability 0.001, score -6.824) top features

Contribution? Feature
-1.016 <BIAS>
-5.808 Highlighted in text (sum)

hi there, i am here looking for some help. my friend is a interior decor designer. he is from thailand. he is trying to find some graphics software on pc. any suggestion on which software to buy,where to buy and how much it costs ? he likes the most sophisticated software(the more features it has,the better)

y=talk.religion.misc (probability 0.000, score -11.885) top features

Contribution? Feature
-1.019 <BIAS>
-10.865 Highlighted in text (sum)

hi there, i am here looking for some help. my friend is a interior decor designer. he is from thailand. he is trying to find some graphics software on pc. any suggestion on which software to buy,where to buy and how much it costs ? he likes the most sophisticated software(the more features it has,the better)

We can hide the weights by passing `force_weights=False` (they will still be shown if it is impossible to highlight the text).


In [8]:
# Explain the prediction for test document 4 without forcing the weight tables.
show_html_expl(
    explain_prediction(
        clf,
        test['data'][4],
        vec=vec,
        target_names=train['target_names'],
    ),
    force_weights=False,
)


Explained as: linear model

y=alt.atheism (probability 0.001, score -7.516) top features

Contribution? Feature
-1.394 <BIAS>
-6.122 Highlighted in text (sum)

i am interested in finding 3d animation programs for the mac. i am especially interested in any programs that don't exist in a pc port and are so good that they would make me go buy a mac. do any such exist?

y=comp.graphics (probability 0.999, score 6.432) top features

Contribution? Feature
+7.447 Highlighted in text (sum)
-1.015 <BIAS>

i am interested in finding 3d animation programs for the mac. i am especially interested in any programs that don't exist in a pc port and are so good that they would make me go buy a mac. do any such exist?

y=sci.space (probability 0.000, score -10.113) top features

Contribution? Feature
-1.016 <BIAS>
-9.098 Highlighted in text (sum)

i am interested in finding 3d animation programs for the mac. i am especially interested in any programs that don't exist in a pc port and are so good that they would make me go buy a mac. do any such exist?

y=talk.religion.misc (probability 0.000, score -11.681) top features

Contribution? Feature
-1.019 <BIAS>
-10.662 Highlighted in text (sum)

i am interested in finding 3d animation programs for the mac. i am especially interested in any programs that don't exist in a pc port and are so good that they would make me go buy a mac. do any such exist?

Show explanations for the winning class for the first 10 documents from the test data:


In [9]:
import numpy as np  # NOTE(review): not used in this cell

# One explanation per document for the first ten test posts; top_targets=1
# limits each explanation to a single target class.
for doc in test['data'][:10]:
    expl = explain_prediction(
        clf,
        doc,
        vec=vec,
        target_names=train['target_names'],
        top_targets=1,
    )
    show_html_expl(expl, force_weights=False)


Explained as: linear model

y=sci.space (probability 0.979, score 5.057) top features

Contribution? Feature
+6.073 Highlighted in text (sum)
-1.016 <BIAS>

trry the skywatch project in arizona.

Explained as: linear model

y=comp.graphics (probability 0.999, score 6.193) top features

Contribution? Feature
+7.208 Highlighted in text (sum)
-1.015 <BIAS>

the vatican library recently made a tour of the us. can anyone help me in finding a ftp site where this collection is available.

Explained as: linear model

y=comp.graphics (probability 0.999, score 8.616) top features

Contribution? Feature
+9.631 Highlighted in text (sum)
-1.015 <BIAS>

hi there, i am here looking for some help. my friend is a interior decor designer. he is from thailand. he is trying to find some graphics software on pc. any suggestion on which software to buy,where to buy and how much it costs ? he likes the most sophisticated software(the more features it has,the better)

Explained as: linear model

y=comp.graphics (probability 0.994, score 3.280) top features

Contribution? Feature
+4.294 Highlighted in text (sum)
-1.015 <BIAS>

rfd request for discussion for the open telematic group otg i have proposed the forming of a consortium/task force for the promotion of naplps/jpeg, fif to openly discuss ways, method, procedures,algorythms, applications, implementation, extensions of naplps/jpeg standards. these standards should facilitate the creation of real_time online applications that make use of voice, video, telecommuting, hires graphics, conferencing, distant learning, online order entry, fax,in addition these dicussion would assist all to better understand how sgml, cals, oda, mime, oodbms, jpeg, mpeg, fractals, sql, cdrom, cdromxa, kodak photocd, tcl, v.fast, and eia/tia562, can best be incorporated and implemented to develop telematic/multimedia applications. we want to be able to support dos, unix, mac, windows, nt, os/2 platforms. it is our hope that individuals, developers, corporations, universities, r & d labs would join in in supporting such an endeavor. this would be a not_for_profit group with bylaws and charter. already many corporations have decided to support otg (open telematic group) so do not delay joining if you are a developer an rfd has been posted to form a usenet newsgroup and a faq will soon be be composed to start promulgating what is known on the subject. if you would like to be added to the maillist send email or mail to the address below. this group would publish an electronic quarterly naplps/jpeg newsletter as well as a hardcopy version. we urge all who wants to see cmcs hires based applications & the naplps/jpeg g r o w, decide to join and mutually benefit from this not-for_profit endeavor. note: telematic has been defined by mr. james martin as the marriage of voice, video, hi-res graphics, fax, ivr, music over telephone lines/lan. if you would like to get involve write to me at: img inter-multimedia group| internet: epimntl@world.std.com p.o. 
box 95901 | ed.pimentel@gisatl.fidonet.org atlanta, georgia, us | cis : 70611,3703 | fidonet : 1:133/407 | bbs : +1-404-985-1198 zyxel 14.4k

Explained as: linear model

y=comp.graphics (probability 0.999, score 6.432) top features

Contribution? Feature
+7.447 Highlighted in text (sum)
-1.015 <BIAS>

i am interested in finding 3d animation programs for the mac. i am especially interested in any programs that don't exist in a pc port and are so good that they would make me go buy a mac. do any such exist?

Explained as: linear model

y=comp.graphics (probability 0.643, score 0.494) top features

Contribution? Feature
+1.509 Highlighted in text (sum)
-1.015 <BIAS>

i'm also interested in such a program. but most of all i'd like to know wich program is able to convert gif or pcx to dxf !!! when i have this program, i can scan pictures and frase (or something like that !) them. this will be beyond the limit !!!

Explained as: linear model

y=comp.graphics (probability 0.412, score -0.996) top features

Contribution? Feature
+0.019 Highlighted in text (sum)
-1.015 <BIAS>

or how about: "end light pollution now!!" your banner would have no effect on its subject, but my banner would.

Explained as: linear model

y=sci.space (probability 1.000, score 10.393) top features

Contribution? Feature
+11.409 Highlighted in text (sum)
-1.016 <BIAS>

: while i'm sure sagan considers it sacrilegious, that wouldn't be : because of his doubtfull credibility as an astronomer. modern, : ground-based, visible light astronomy (what these proposed : orbiting billboards would upset) is already a dying field: the : opacity and distortions caused by the atmosphere itself have : driven most of the field to use radio, far infrared or space-based : telescopes. hardly. the keck telescope in hawaii has taken its first pictures; they're nearly as good as hubble for a tiny fraction of the cost. : in any case, a bright point of light passing through : the field doesn't ruin observations. if that were the case, the : thousands of existing satellites would have already done so (satelliets : might not seem so bright to the eyes, but as far as astronomy is concerned, : they are extremely bright.) i believe that this orbiting space junk will be far brighter still; more like the full moon. the moon upsets deep-sky observation all over the sky (and not just looking at it) because of scattered light. this is a known problem, but of course two weeks out of every four are ok. what happens when this billboard circles every 90 minutes? what would be a good time then? : frank crary : cu boulder

Explained as: linear model

y=alt.atheism (probability 0.991, score 8.925) top features

Contribution? Feature
+10.319 Highlighted in text (sum)
-1.394 <BIAS>

not if you show that these hypothetical atheists are gullible, excitable and easily led from some concrete cause. in that case we would also have to discuss if that concrete cause, rather than atheism, was the factor that caused their subsequent behaviour.

Explained as: linear model

y=sci.space (probability 0.850, score -0.580) top features

Contribution? Feature
+0.436 Highlighted in text (sum)
-1.016 <BIAS>

picture our universe floating like a log in a river. as the log floats down the river, it occasionally strikes rocks, the bank, the bottom, other logs. when this collission occurs, kinetic energy is translated into heat, the log degrades, gets scraped up, and other energy translaions occur. the distribution of damage to the log depends on the shape of the log. however, to a very small virus in a mite on the head of a termite in the center of the log, the shock waves from the collissions would appear uniformly random in direction. this is my theory for grb. they are evidence of our universe interacting with other universes! why not! makes just as much sense as the grb coming from the oort cloud! the log theory of universes can't be ruled out! of course, i'm a layman in the physics world. you physicists out there, tell me about this !!!!

Now use a vectorizer that skips stop words:


In [10]:
# Same pipeline as before, but the vectorizer is created with
# stop_words='english'. The steps are kept under their own names because
# the next cell passes `clf_stop` and `vec_stop` to eli5 directly.
vec_stop = TfidfVectorizer(stop_words='english')
clf_stop = LogisticRegressionCV()
pipeline_stop = Pipeline(steps=[
    ('vec', vec_stop),
    ('clf', clf_stop),
])

# Fit on the training split; the fitted pipeline is the cell's displayed value.
pipeline_stop.fit(train['data'], y=train['target'])


Out[10]:
Pipeline(steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
   ...2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0))])

Words such as "the", "in" and "of" are not used as features and are not highlighted.


In [11]:
# Explain test document 4 again, now with the stop-word-filtering vectorizer
# and the classifier fitted on its features.
show_html_expl(
    explain_prediction(
        clf_stop,
        test['data'][4],
        vec=vec_stop,
        target_names=train['target_names'],
    ),
    force_weights=False,
)


Explained as: linear model

y=alt.atheism (probability 0.000, score -7.794) top features

Contribution? Feature
-1.395 <BIAS>
-6.399 Highlighted in text (sum)

i am interested in finding 3d animation programs for the mac. i am especially interested in any programs that don't exist in a pc port and are so good that they would make me go buy a mac. do any such exist?

y=comp.graphics (probability 0.999, score 5.992) top features

Contribution? Feature
+7.011 Highlighted in text (sum)
-1.018 <BIAS>

i am interested in finding 3d animation programs for the mac. i am especially interested in any programs that don't exist in a pc port and are so good that they would make me go buy a mac. do any such exist?

y=sci.space (probability 0.000, score -7.692) top features

Contribution? Feature
-1.017 <BIAS>
-6.675 Highlighted in text (sum)

i am interested in finding 3d animation programs for the mac. i am especially interested in any programs that don't exist in a pc port and are so good that they would make me go buy a mac. do any such exist?

y=talk.religion.misc (probability 0.000, score -10.365) top features

Contribution? Feature
-1.070 <BIAS>
-9.294 Highlighted in text (sum)

i am interested in finding 3d animation programs for the mac. i am especially interested in any programs that don't exist in a pc port and are so good that they would make me go buy a mac. do any such exist?