In [1]:
from sklearn.datasets import fetch_20newsgroups
# Newsgroups kept for this walkthrough; every later cell works on these four classes.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]


def fetch_subset(subset):
    """Load one split of the 20 Newsgroups data restricted to ``categories``.

    Args:
        subset: Forwarded unchanged as the ``subset`` argument of
            ``fetch_20newsgroups``; this notebook calls it with 'train'
            and 'test'.

    Returns:
        The object returned by ``fetch_20newsgroups``. Later cells index it
        with 'data', 'target' and 'target_names'.
    """
    return fetch_20newsgroups(
        subset=subset,
        categories=categories,
        # Fixed seed so the shuffled document order is the same on every run.
        shuffle=True,
        random_state=42,
        # Ask the loader to drop headers, footers and quoted replies.
        remove=('headers', 'footers', 'quotes'),
    )


train = fetch_subset('train')
test = fetch_subset('test')
In [2]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_extraction.text import TfidfVectorizer
# Keep direct handles on both steps: later cells pass `clf` and `vec`
# to eli5 separately rather than the pipeline object.
vec = TfidfVectorizer()
clf = LogisticRegressionCV()
pipeline = Pipeline(steps=[
    ('vec', vec),
    ('clf', clf),
])
# Last expression of the cell, so the fitted pipeline's repr is displayed.
pipeline.fit(train['data'], train['target'])
Out[2]:
Pipeline(steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
lowercase=True, max_df=1.0, max_features=None, min_df=1,
ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...2', random_state=None,
refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0))])
In [12]:
import eli5
from eli5 import explain_weights, explain_prediction
from eli5.formatters import format_as_html, format_as_text, format_html_styles, fields
# print(format_as_text(explain_weights(clf, vec, target_names=train['target_names'])))
In [4]:
# `IPython.display` is the public import location for these names;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML


def show_html(html):
    """Display a raw HTML string as rich output.

    Args:
        html: HTML markup, wrapped in ``IPython.display.HTML`` before display.

    Returns:
        Whatever ``display`` returns.
    """
    return display(HTML(html))


def show_html_expl(expl, **kwargs):
    """Format an explanation with ``format_as_html`` and display it.

    ``include_styles=False`` is always passed, so the markup produced here
    does not carry its own styles.

    Args:
        expl: Explanation object handed to ``format_as_html``.
        **kwargs: Forwarded unchanged to ``format_as_html`` (this notebook
            passes ``force_weights`` and ``horizontal_layout``).

    Returns:
        Whatever ``show_html`` returns.
    """
    return show_html(format_as_html(expl, include_styles=False, **kwargs))


# show_html_expl passes include_styles=False, so display the output of
# format_html_styles() once here.
show_html(format_html_styles())
In [5]:
# Per-class feature weights of the fitted classifier, with feature names
# taken from the vectorizer and class labels from the training set.
eli5.show_weights(
    clf,
    vec=vec,
    target_names=train['target_names'],
    horizontal_layout=False,
)
Out[5]:
y=alt.atheism
top features
Weight?
Feature
+18.117
atheism
+16.558
atheists
+14.393
religion
+14.380
bobby
+14.325
matthew
+13.389
motto
+13.215
atheist
+13.010
islam
+12.800
nanci
+12.216
enviroleague
+12.109
loans
+11.672
satan
+11.488
posting
+11.173
enlightening
+11.108
natural
… 6382 more positive …
… 20478 more negative …
-11.259
fake
-11.526
order
-12.169
christian
-12.253
hudson
-18.551
space
y=comp.graphics
top features
Weight?
Feature
+25.897
graphics
+18.957
image
+17.298
computer
+16.843
3d
+16.190
file
+14.020
points
+13.269
sgi
+13.180
42
+12.428
hi
+11.835
3do
+11.175
animation
+11.146
using
+10.877
code
+10.792
package
+10.681
video
+10.585
screen
+10.571
sphere
+10.570
68070
+10.553
files
… 7893 more positive …
… 18967 more negative …
-18.127
space
y=sci.space
top features
Weight?
Feature
+35.983
space
+17.907
orbit
+15.269
nasa
+15.173
launch
+13.235
spacecraft
+12.872
mars
+12.369
nick
+12.117
moon
+12.064
allen
+11.800
shuttle
+11.799
dc
+10.934
sci
+10.726
solar
+10.716
earth
… 10083 more positive …
… 16777 more negative …
-10.976
file
-11.109
wrong
-11.886
image
-12.109
religion
-13.500
god
-18.002
graphics
y=talk.religion.misc
top features
Weight?
Feature
+19.215
christian
+16.667
blood
+14.907
fbi
+14.185
christians
+12.783
hudson
+12.746
order
+12.338
christ
+12.126
ekr
+11.972
terrorist
+11.608
koresh
+11.549
dead
+11.185
cult
… 6600 more positive …
… 20260 more negative …
-11.206
anyone
-11.567
could
-11.699
get
-12.212
thanks
-12.230
edu
-12.319
it
-13.026
atheists
-17.289
space
In [6]:
# Explain the prediction for test document 2; the formatter is asked for
# horizontal_layout=True and force_weights=False.
show_html_expl(
    explain_prediction(
        clf,
        test['data'][2],
        vec,
        target_names=train['target_names'],
    ),
    horizontal_layout=True,
    force_weights=False,
)
Explained as: linear model
y=alt.atheism
(probability 0.000, score -16.171)
top features
Contribution?
Feature
-1.394
<BIAS>
-14.777
Highlighted in text (sum)
hi there,
i am here looking for some help.
my friend is a interior decor designer. he is from thailand. he is
trying to find some graphics software on pc. any suggestion on which
software to buy,where to buy and how much it costs ? he likes the most
sophisticated
software(the more features it has,the better)
y=comp.graphics
(probability 0.999, score 8.616)
top features
Contribution?
Feature
+9.631
Highlighted in text (sum)
-1.015
<BIAS>
hi there,
i am here looking for some help.
my friend is a interior decor designer. he is from thailand. he is
trying to find some graphics software on pc. any suggestion on which
software to buy,where to buy and how much it costs ? he likes the most
sophisticated
software(the more features it has,the better)
y=sci.space
(probability 0.001, score -6.824)
top features
Contribution?
Feature
-1.016
<BIAS>
-5.808
Highlighted in text (sum)
hi there,
i am here looking for some help.
my friend is a interior decor designer. he is from thailand. he is
trying to find some graphics software on pc. any suggestion on which
software to buy,where to buy and how much it costs ? he likes the most
sophisticated
software(the more features it has,the better)
y=talk.religion.misc
(probability 0.000, score -11.885)
top features
Contribution?
Feature
-1.019
<BIAS>
-10.865
Highlighted in text (sum)
hi there,
i am here looking for some help.
my friend is a interior decor designer. he is from thailand. he is
trying to find some graphics software on pc. any suggestion on which
software to buy,where to buy and how much it costs ? he likes the most
sophisticated
software(the more features it has,the better)
dense_multitarget=True is supported for prediction explanations too, and shows just the top prediction highlighting.
In [7]:
# Explain test document 2 again, this time passing force_weights=True
# to the HTML formatter.
show_html_expl(
    explain_prediction(
        clf,
        test['data'][2],
        vec,
        target_names=train['target_names'],
    ),
    force_weights=True,
)
Explained as: linear model
y=alt.atheism
(probability 0.000, score -16.171)
top features
y=comp.graphics
(probability 0.999, score 8.616)
top features
y=sci.space
(probability 0.001, score -6.824)
top features
y=talk.religion.misc
(probability 0.000, score -11.885)
top features
Contribution?
Feature
+0.889
some
+0.539
much
+0.278
is
+0.266
which
+0.225
designer
+0.218
it
+0.161
most
+0.107
trying
-0.005
interior
-0.009
has
-0.053
likes
-0.058
sophisticated
-0.060
any
-0.111
my
-0.171
find
-0.216
there
-0.244
for
-0.276
more
-0.317
suggestion
-0.371
here
-0.385
to
-0.390
and
-0.397
better
-0.400
he
-0.407
how
-0.432
am
-0.462
from
-0.509
features
-0.606
where
-0.615
pc
-0.649
hi
-0.664
on
-0.698
the
-0.755
friend
-0.800
help
-0.854
costs
-0.892
looking
-0.931
buy
-1.269
graphics
-1.394
<BIAS>
-3.451
software
Contribution?
Feature
+3.120
graphics
+2.661
software
+1.707
hi
+1.180
looking
+1.127
buy
+0.906
features
+0.850
pc
+0.672
help
+0.530
any
+0.520
it
+0.474
on
+0.459
find
+0.347
am
+0.346
where
+0.314
has
+0.311
there
+0.233
for
+0.225
which
+0.078
from
+0.046
friend
+0.034
trying
+0.028
interior
+0.005
costs
-0.018
likes
-0.019
better
-0.026
my
-0.058
here
-0.103
designer
-0.109
sophisticated
-0.155
and
-0.172
is
-0.217
some
-0.244
how
-0.251
to
-0.254
most
-0.370
more
-0.534
much
-0.760
suggestion
-0.863
the
-1.015
<BIAS>
-2.388
he
Contribution?
Feature
+0.870
costs
+0.637
buy
+0.606
software
+0.500
most
+0.393
the
+0.287
on
+0.281
some
+0.259
better
+0.249
likes
+0.241
sophisticated
+0.213
more
+0.183
much
+0.176
friend
+0.122
from
+0.118
there
+0.049
here
+0.028
and
+0.018
to
+0.016
how
-0.013
interior
-0.041
designer
-0.051
where
-0.090
it
-0.094
has
-0.203
suggestion
-0.234
for
-0.295
help
-0.316
am
-0.327
trying
-0.356
any
-0.418
which
-0.506
pc
-0.519
find
-0.529
my
-0.567
looking
-0.654
features
-0.961
hi
-1.016
<BIAS>
-1.140
is
-1.568
he
-2.169
graphics
Contribution?
Feature
+2.181
he
+0.528
my
+0.481
more
+0.345
and
+0.313
friend
+0.287
suggestion
+0.228
trying
+0.137
find
+0.110
here
+0.104
from
-0.001
interior
-0.028
designer
-0.032
likes
-0.044
how
-0.052
am
-0.057
help
-0.139
sophisticated
-0.140
better
-0.202
to
-0.204
the
-0.218
where
-0.231
which
-0.236
features
-0.278
buy
-0.394
there
-0.399
costs
-0.473
pc
-0.500
much
-0.514
looking
-0.556
for
-0.621
is
-0.692
any
-0.699
has
-0.732
most
-1.006
on
-1.019
<BIAS>
-1.098
hi
-1.212
some
-1.218
it
-1.295
graphics
-2.308
software
y=alt.atheism
(probability 0.000, score -16.171)
top features
Contribution?
Feature
-1.394
<BIAS>
-14.777
Highlighted in text (sum)
hi there,
i am here looking for some help.
my friend is a interior decor designer. he is from thailand. he is
trying to find some graphics software on pc. any suggestion on which
software to buy,where to buy and how much it costs ? he likes the most
sophisticated
software(the more features it has,the better)
y=comp.graphics
(probability 0.999, score 8.616)
top features
Contribution?
Feature
+9.631
Highlighted in text (sum)
-1.015
<BIAS>
hi there,
i am here looking for some help.
my friend is a interior decor designer. he is from thailand. he is
trying to find some graphics software on pc. any suggestion on which
software to buy,where to buy and how much it costs ? he likes the most
sophisticated
software(the more features it has,the better)
y=sci.space
(probability 0.001, score -6.824)
top features
Contribution?
Feature
-1.016
<BIAS>
-5.808
Highlighted in text (sum)
hi there,
i am here looking for some help.
my friend is a interior decor designer. he is from thailand. he is
trying to find some graphics software on pc. any suggestion on which
software to buy,where to buy and how much it costs ? he likes the most
sophisticated
software(the more features it has,the better)
y=talk.religion.misc
(probability 0.000, score -11.885)
top features
Contribution?
Feature
-1.019
<BIAS>
-10.865
Highlighted in text (sum)
hi there,
i am here looking for some help.
my friend is a interior decor designer. he is from thailand. he is
trying to find some graphics software on pc. any suggestion on which
software to buy,where to buy and how much it costs ? he likes the most
sophisticated
software(the more features it has,the better)
We can hide weights by passing force_weights=False (they will still be shown if it's impossible to highlight text).
In [8]:
# Explain test document 4 with force_weights=False passed to the formatter.
show_html_expl(
    explain_prediction(
        clf,
        test['data'][4],
        vec,
        target_names=train['target_names'],
    ),
    force_weights=False,
)
Explained as: linear model
y=alt.atheism
(probability 0.001, score -7.516)
top features
Contribution?
Feature
-1.394
<BIAS>
-6.122
Highlighted in text (sum)
i am interested in finding 3d animation programs for the mac.
i am especially interested in any programs that don't exist
in a pc port and are so good that they would make me go buy
a mac. do any such exist?
y=comp.graphics
(probability 0.999, score 6.432)
top features
Contribution?
Feature
+7.447
Highlighted in text (sum)
-1.015
<BIAS>
i am interested in finding 3d animation programs for the mac.
i am especially interested in any programs that don't exist
in a pc port and are so good that they would make me go buy
a mac. do any such exist?
y=sci.space
(probability 0.000, score -10.113)
top features
Contribution?
Feature
-1.016
<BIAS>
-9.098
Highlighted in text (sum)
i am interested in finding 3d animation programs for the mac.
i am especially interested in any programs that don't exist
in a pc port and are so good that they would make me go buy
a mac. do any such exist?
y=talk.religion.misc
(probability 0.000, score -11.681)
top features
Contribution?
Feature
-1.019
<BIAS>
-10.662
Highlighted in text (sum)
i am interested in finding 3d animation programs for the mac.
i am especially interested in any programs that don't exist
in a pc port and are so good that they would make me go buy
a mac. do any such exist?
Show explanations for the winning class for the first 10 documents from the test data.
In [9]:
import numpy as np
# Render one explanation per document for the first ten test documents.
for doc in test['data'][:10]:
    # top_targets=1 is passed so that only one class is explained per document.
    expl = explain_prediction(
        clf,
        doc,
        vec,
        target_names=train['target_names'],
        top_targets=1,
    )
    show_html_expl(expl, force_weights=False)
Explained as: linear model
y=sci.space
(probability 0.979, score 5.057)
top features
Contribution?
Feature
+6.073
Highlighted in text (sum)
-1.016
<BIAS>
trry the skywatch project in arizona.
Explained as: linear model
y=comp.graphics
(probability 0.999, score 6.193)
top features
Contribution?
Feature
+7.208
Highlighted in text (sum)
-1.015
<BIAS>
the vatican library recently made a tour of the us.
can anyone help me in finding a ftp site where this collection is
available.
Explained as: linear model
y=comp.graphics
(probability 0.999, score 8.616)
top features
Contribution?
Feature
+9.631
Highlighted in text (sum)
-1.015
<BIAS>
hi there,
i am here looking for some help.
my friend is a interior decor designer. he is from thailand. he is
trying to find some graphics software on pc. any suggestion on which
software to buy,where to buy and how much it costs ? he likes the most
sophisticated
software(the more features it has,the better)
Explained as: linear model
y=comp.graphics
(probability 0.994, score 3.280)
top features
Contribution?
Feature
+4.294
Highlighted in text (sum)
-1.015
<BIAS>
rfd
request for discussion
for the
open telematic group
otg
i have proposed the forming of a consortium/task force for the
promotion of naplps/jpeg, fif to openly discuss ways, method,
procedures,algorythms, applications, implementation, extensions of
naplps/jpeg standards. these standards should facilitate the creation
of real_time online applications that make use of voice, video,
telecommuting, hires graphics, conferencing, distant learning, online
order entry, fax,in addition these dicussion would assist all to
better understand how sgml, cals, oda, mime, oodbms, jpeg, mpeg,
fractals, sql, cdrom, cdromxa, kodak photocd, tcl, v.fast, and
eia/tia562, can best be incorporated and implemented to develop
telematic/multimedia applications.
we want to be able to support dos, unix, mac, windows, nt, os/2
platforms. it is our hope that individuals, developers, corporations,
universities, r & d labs would join in in supporting such an endeavor.
this would be a not_for_profit group with bylaws and charter. already
many corporations have decided to support otg (open telematic group) so
do not delay joining if you are a developer
an rfd has been posted to form a usenet newsgroup and a faq will soon
be be composed to start promulgating what is known on the subject. if
you would like to be added to the maillist send email or mail to the
address below.
this group would publish an electronic quarterly naplps/jpeg
newsletter as well as a hardcopy version. we urge all who wants to
see cmcs hires based applications & the naplps/jpeg g r o w, decide to
join and mutually benefit from this not-for_profit endeavor.
note: telematic has been defined by mr. james martin as the marriage
of voice, video, hi-res graphics, fax, ivr, music over telephone
lines/lan.
if you would like to get involve write to me at:
img inter-multimedia group| internet: epimntl@world.std.com
p.o. box 95901 | ed.pimentel@gisatl.fidonet.org
atlanta, georgia, us | cis : 70611,3703
| fidonet : 1:133/407
| bbs : +1-404-985-1198 zyxel 14.4k
Explained as: linear model
y=comp.graphics
(probability 0.999, score 6.432)
top features
Contribution?
Feature
+7.447
Highlighted in text (sum)
-1.015
<BIAS>
i am interested in finding 3d animation programs for the mac.
i am especially interested in any programs that don't exist
in a pc port and are so good that they would make me go buy
a mac. do any such exist?
Explained as: linear model
y=comp.graphics
(probability 0.643, score 0.494)
top features
Contribution?
Feature
+1.509
Highlighted in text (sum)
-1.015
<BIAS>
i'm also interested in such a program. but most of all i'd like to know
wich program is able to convert gif or pcx to dxf !!! when i have this
program, i can scan pictures and frase (or something like that !) them.
this will be beyond the limit !!!
Explained as: linear model
y=comp.graphics
(probability 0.412, score -0.996)
top features
Contribution?
Feature
+0.019
Highlighted in text (sum)
-1.015
<BIAS>
or how about:
"end light pollution now!!"
your banner would have no effect on its subject, but my banner would.
Explained as: linear model
y=sci.space
(probability 1.000, score 10.393)
top features
Contribution?
Feature
+11.409
Highlighted in text (sum)
-1.016
<BIAS>
: while i'm sure sagan considers it sacrilegious, that wouldn't be
: because of his doubtfull credibility as an astronomer. modern,
: ground-based, visible light astronomy (what these proposed
: orbiting billboards would upset) is already a dying field: the
: opacity and distortions caused by the atmosphere itself have
: driven most of the field to use radio, far infrared or space-based
: telescopes.
hardly. the keck telescope in hawaii has taken its first pictures; they're
nearly as good as hubble for a tiny fraction of the cost.
: in any case, a bright point of light passing through
: the field doesn't ruin observations. if that were the case, the
: thousands of existing satellites would have already done so (satelliets
: might not seem so bright to the eyes, but as far as astronomy is concerned,
: they are extremely bright.)
i believe that this orbiting space junk will be far brighter still;
more like the full moon. the moon upsets deep-sky observation all
over the sky (and not just looking at it) because of scattered light.
this is a known problem, but of course two weeks out of every four are
ok. what happens when this billboard circles every 90 minutes? what
would be a good time then?
: frank crary
: cu boulder
Explained as: linear model
y=alt.atheism
(probability 0.991, score 8.925)
top features
Contribution?
Feature
+10.319
Highlighted in text (sum)
-1.394
<BIAS>
not if you show that these hypothetical atheists are gullible, excitable
and easily led from some concrete cause. in that case we would also
have to discuss if that concrete cause, rather than atheism, was the
factor that caused their subsequent behaviour.
Explained as: linear model
y=sci.space
(probability 0.850, score -0.580)
top features
Contribution?
Feature
+0.436
Highlighted in text (sum)
-1.016
<BIAS>
picture our universe floating like a log
in a river. as the log floats down the
river, it occasionally strikes rocks, the
bank, the bottom, other logs. when this collission
occurs, kinetic energy is translated into heat, the
log degrades, gets scraped up, and other energy
translaions occur. the distribution of damage to
the log depends on the shape of the log.
however, to a very small virus in a mite on the head of a
termite in the center of the log, the shock waves from the
collissions would appear uniformly random in direction.
this is my theory for grb. they are evidence of our universe
interacting with other universes! why not! makes
just as much sense as the grb coming from the oort cloud!
the log theory of universes can't be ruled out!
of course, i'm a layman in the physics world. you
physicists out there, tell me about this !!!!
Now use a vectorizer that skips stopwords
In [10]:
# Second pipeline: same two steps as before, but the vectorizer is
# constructed with stop_words='english'.
vec_stop = TfidfVectorizer(stop_words='english')
clf_stop = LogisticRegressionCV()
pipeline_stop = Pipeline(steps=[
    ('vec', vec_stop),
    ('clf', clf_stop),
])
# Last expression of the cell, so the fitted pipeline's repr is displayed.
pipeline_stop.fit(train['data'], train['target'])
Out[10]:
Pipeline(steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
lowercase=True, max_df=1.0, max_features=None, min_df=1,
ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...2', random_state=None,
refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0))])
Words such as "the", "in", "of" are not used as features and are not highlighted
In [11]:
# Explain test document 4 with the stop-word classifier and vectorizer
# (clf_stop / vec_stop) instead of the first pair.
show_html_expl(
    explain_prediction(
        clf_stop,
        test['data'][4],
        vec_stop,
        target_names=train['target_names'],
    ),
    force_weights=False,
)
Explained as: linear model
y=alt.atheism
(probability 0.000, score -7.794)
top features
Contribution?
Feature
-1.395
<BIAS>
-6.399
Highlighted in text (sum)
i am interested in finding 3d animation programs for the mac.
i am especially interested in any programs that don't exist
in a pc port and are so good that they would make me go buy
a mac. do any such exist?
y=comp.graphics
(probability 0.999, score 5.992)
top features
Contribution?
Feature
+7.011
Highlighted in text (sum)
-1.018
<BIAS>
i am interested in finding 3d animation programs for the mac.
i am especially interested in any programs that don't exist
in a pc port and are so good that they would make me go buy
a mac. do any such exist?
y=sci.space
(probability 0.000, score -7.692)
top features
Contribution?
Feature
-1.017
<BIAS>
-6.675
Highlighted in text (sum)
i am interested in finding 3d animation programs for the mac.
i am especially interested in any programs that don't exist
in a pc port and are so good that they would make me go buy
a mac. do any such exist?
y=talk.religion.misc
(probability 0.000, score -10.365)
top features
Contribution?
Feature
-1.070
<BIAS>
-9.294
Highlighted in text (sum)
i am interested in finding 3d animation programs for the mac.
i am especially interested in any programs that don't exist
in a pc port and are so good that they would make me go buy
a mac. do any such exist?
Content source: TeamHG-Memex/eli5
Similar notebooks: