In [108]:
import numpy as np
import pandas as pd
import re
import codecs
import matplotlib as mpl
from matplotlib.cm import get_cmap
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import fetch_20newsgroups
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [3]:
categories = [
        'alt.atheism',
        'talk.religion.misc',
        'comp.graphics',
        'sci.space',
    ]
remove = ('headers', 'footers', 'quotes')

In [4]:
data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42,
                                remove=remove)

In [6]:
data_train.data[0]


Out[6]:
"Hi,\n\nI've noticed that if you only save a model (with all your mapping planes\npositioned carefully) to a .3DS file that when you reload it after restarting\n3DS, they are given a default position and orientation.  But if you save\nto a .PRJ file their positions/orientation are preserved.  Does anyone\nknow why this information is not stored in the .3DS file?  Nothing is\nexplicitly said in the manual about saving texture rules in the .PRJ file. \nI'd like to be able to read the texture rule information, does anyone have \nthe format for the .PRJ file?\n\nIs the .CEL file format available from somewhere?\n\nRych"

In [128]:
text_dict = dict(enumerate(data_train.data))   # doc index -> raw text
doc_cat = dict(enumerate(data_train.target))   # doc index -> class label

In [133]:
vectorizer = TfidfVectorizer(sublinear_tf=True,  # use 1 + log(tf) instead of raw term counts
                             max_df=0.5,         # ignore terms that appear in more than half the documents
                             stop_words='english')

In [109]:
# Alternative: the hashing trick (no stored vocabulary, so no feature names for the inspection below)
# hasher = HashingVectorizer(stop_words='english', alternate_sign=False, norm=None, binary=False)
# vectorizer = make_pipeline(hasher, TfidfTransformer())

In [134]:
X_train = vectorizer.fit_transform(data_train.data)
print("n_samples: %d, n_features: %d" % X_train.shape)


n_samples: 2034, n_features: 26576

In [135]:
feature_names = vectorizer.get_feature_names() 
print(len(feature_names))


26576

In [30]:
pd.DataFrame(X_train[0].toarray())


Out[30]:
0 1 2 3 4 5 6 7 8 9 ... 26566 26567 26568 26569 26570 26571 26572 26573 26574 26575
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

1 rows × 26576 columns


In [161]:
X_train


Out[161]:
<2034x26576 sparse matrix of type '<class 'numpy.float64'>'
	with 133634 stored elements in Compressed Sparse Row format>

In [31]:
def top_tfidf_feats(row, features, top_n=25):
    ''' Get the top n tf-idf values in a row and return them with their corresponding feature names. '''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

In [33]:
def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Top tfidf features in specific document (matrix row) '''
    row = np.squeeze(Xtr[row_id].toarray())
    return top_tfidf_feats(row, features, top_n)

In [37]:
# labels index into the alphabetically sorted target_names, not the original
# (unsorted) `categories` list, so use data_train.target_names here
print(data_train.target_names[doc_cat.get(1)])
print(top_feats_in_doc(X_train, feature_names, 1))


talk.religion.misc
          feature     tfidf
0          koresh  0.260513
1      delusional  0.246107
2        deranged  0.246107
3      fruitcakes  0.246107
4         barring  0.246107
5           mania  0.246107
6           circa  0.233515
7      neccessary  0.233515
8         fanatic  0.224582
9         satisfy  0.211990
10          jones  0.211990
11  demonstrating  0.207203
12     corruption  0.207203
13           nope  0.193167
14      centuries  0.179649
15       contrary  0.171642
16          bunch  0.171642
17          folks  0.163063
18           evil  0.153863
19            jim  0.152310
20       children  0.143188
21           1993  0.132338
22       evidence  0.131166
23         simply  0.129669
24        thought  0.117693

In [42]:
feature_tfidf_wts = top_feats_in_doc(X_train, feature_names, 1)
print(type(feature_tfidf_wts))


<class 'pandas.core.frame.DataFrame'>

In [69]:
scores = feature_tfidf_wts.set_index('feature').to_dict()['tfidf']

In [121]:
print(type(data_train.data[1]))
print(data_train.target.shape)


<class 'str'>
(2034,)

In [129]:
def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that on average are most important amongst documents in rows
        identified by indices in grp_ids. '''
    if grp_ids is not None:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

In [130]:
def top_feats_by_class(Xtr, y, features, min_tfidf=0.1, top_n=25):
    ''' Return a list of dfs, where each df holds top_n features and their mean tfidf value
        calculated across documents with the same class label. '''
    dfs = []
    labels = np.unique(y)
    for label in labels:
        ids = np.where(y == label)[0]  # row indices of the documents in this class
        feats_df = top_mean_feats(Xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n)
        feats_df.label = label
        dfs.append(feats_df)
    return dfs

In [142]:
result = top_feats_by_class(X_train, data_train.target, features=feature_names)
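
The per-class summaries in result are never displayed above; here is a minimal peek, assuming the cells above have run (top_feats_by_class attaches the class label as the label attribute of each frame):

In [ ]:
for df in result:
    print(data_train.target_names[df.label])
    print(df.head(10))
    print()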

In [83]:
def preprocess_text(text, to_lower=True, norm_num=False):
    # remove links (other HTML markup is assumed to have been stripped already, e.g. by BeautifulSoup)
    text = re.sub(r"http(s)?://\S*", " ", text)
    if to_lower:
        text = text.lower()
    if norm_num:
        text = re.sub(r"[0-9]", "1", text)  # normalize numbers
    # clean out non-alphanumeric characters (keeping hyphens) and normalize whitespace
    text = re.sub(r"[^A-Za-z0-9-]+", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()
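
A quick sanity check of the preprocessing on a made-up string (not from the dataset):

In [ ]:
preprocess_text("Visit https://example.com for DETAILS! Call 555-1234.", norm_num=True)
# expected: 'visit for details call 111-1111'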

In [99]:
def scores2html(text, scores, fname='testfile', metainf='', pos_clr_name='Blues', 
                neg_clr_name='Reds', highlight_oov=False):
    """
    Reference: http://matplotlib.org/examples/color/colormaps_reference.html
    Based on the original text and relevance scores, generate an HTML doc highlighting positive / negative words.
    Inputs:
        - text: the raw text in which the words should be highlighted
        - scores: a dictionary with {word: score} or a list of tuples [(word, score)]
        - fname: the name (path) of the file
        - metainf: an optional string which will be added at the top of the file (e.g. true class of the document)
        - pos_clr_name / neg_clr_name: matplotlib colormap names for positive / negative scores
        - highlight_oov: if True, out-of-vocabulary words will be highlighted in yellow (default False)
    Saves the visualization in 'fname.html' (you probably want to make this a full path to avoid cluttering your working directory).
    """
    # colormaps
    cmap_pos = get_cmap(pos_clr_name)
    cmap_neg = get_cmap(neg_clr_name)
    norm = mpl.colors.Normalize(0., 1.)

    # normalize score by absolute max value
    if isinstance(scores, dict):
        N = np.max(np.abs(list(scores.values())))
        scores_dict = {word: scores[word] / N for word in scores}
        # transform dict into word list with scores
        scores = []
        for word in re.findall(r'[\w-]+', text, re.UNICODE):
            word_pp = preprocess_text(word)
            if word_pp in scores_dict:
                scores.append((word, scores_dict[word_pp]))
            else:
                scores.append((word, None))
    else:
        N = np.max(np.abs([t[1] for t in scores if t[1] is not None]))
        scores = [(w, s / N) if s is not None else (w, None) for w, s in scores]

    htmlstr = u'<body><div style="white-space: pre-wrap; font-family: monospace;">'
    if metainf:
        htmlstr += '%s\n\n' % metainf
    resttext = text
    for word, score in scores:
        # append any text preceding the identified word unchanged to the html
        idx = resttext.find(word)
        htmlstr += resttext[:idx]
        # cut off the identified word
        resttext = resttext[idx + len(word):]
        # get the colorcode of the word
        rgbac = (1., 1., 0.)  # for unknown words
        if highlight_oov:
            alpha = 0.3
        else:
            alpha = 0.
        if score is not None:
            if score < 0:
                rgbac = cmap_neg(norm(-score))
            else:
                rgbac = cmap_pos(norm(score))
            alpha = 0.5
        htmlstr += u'<span style="background-color: rgba(%i, %i, %i, %.1f)">%s</span>'\
            % (round(255 * rgbac[0]), round(255 * rgbac[1]), round(255 * rgbac[2]), alpha, word)
    # after the last word, add the rest of the text
    htmlstr += resttext
    htmlstr += u'</div></body>'
    with codecs.open('%s.html' % fname, 'w', encoding='utf8') as f:
        f.write(htmlstr)

In [100]:
scores2html(data_train.data[1], scores, highlight_oov=True)

In [101]:
print(data_train.data[1])



Seems to be, barring evidence to the contrary, that Koresh was simply
another deranged fanatic who thought it neccessary to take a whole bunch of
folks with him, children and all, to satisfy his delusional mania. Jim
Jones, circa 1993.


Nope - fruitcakes like Koresh have been demonstrating such evil corruption
for centuries.

In [116]:
def show_in_notebook(fname='./testfile.html'):
    from IPython.display import HTML
    return HTML(filename=fname)  # render the generated html file inline
show_in_notebook()


Out[116]:
Seems to be, barring evidence to the contrary, that Koresh was simply another deranged fanatic who thought it neccessary to take a whole bunch of folks with him, children and all, to satisfy his delusional mania. Jim Jones, circa 1993. Nope - fruitcakes like Koresh have been demonstrating such evil corruption for centuries.

In [145]:
## Build a classifier and see if we can visualize feature contributions ...

In [147]:
from sklearn.linear_model import SGDClassifier
# note: n_iter was deprecated in sklearn 0.19 (see the warning below); newer versions use max_iter/tol
clf = SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet")

In [148]:
y = data_train.target
clf.fit(X_train, y)


/home/ubuntu/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/stochastic_gradient.py:117: DeprecationWarning: n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.
  DeprecationWarning)
Out[148]:
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=50,
       n_jobs=1, penalty='elasticnet', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [163]:
np.squeeze(clf.coef_[0])


Out[163]:
array([-0.24152341,  0.        ,  0.        , ...,  0.        ,
        0.        ,  0.        ])
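
Since top_tfidf_feats just ranks a dense vector, it can be reused on the classifier weights. A minimal sketch (the 'tfidf' column then actually holds SGD coefficients; label 0 corresponds to alt.atheism after sklearn's alphabetical sorting):

In [ ]:
coefs = np.squeeze(clf.coef_[0])  # dense weight vector for class 0
print(top_tfidf_feats(coefs, feature_names, top_n=10))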

In [155]:
data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42,
                               remove=remove)
print('data loaded')


data loaded
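
The test split is fetched above but never scored; a minimal sketch of an accuracy check, reusing the fitted vectorizer and classifier:

In [ ]:
from sklearn import metrics
X_test = vectorizer.transform(data_test.data)  # transform only; the vocabulary is already fitted
pred = clf.predict(X_test)
print("test accuracy: %0.3f" % metrics.accuracy_score(data_test.target, pred))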

In [171]:
print(len(feature_names))
clf.predict(X_train[0])


26576
Out[171]:
array([1])

In [180]:
temp = [(a, b) for a, b in zip(feature_names, np.squeeze(clf.coef_[0]))]
clf_wts_df = pd.DataFrame(temp, columns=['feature', 'wts'])
clf_dict = clf_wts_df.set_index('feature').to_dict()['wts']

In [182]:
type(clf_dict)


Out[182]:
dict

In [184]:
scores2html(data_train.data[1], clf_dict, highlight_oov=True)
show_in_notebook()


Out[184]:
Seems to be, barring evidence to the contrary, that Koresh was simply another deranged fanatic who thought it neccessary to take a whole bunch of folks with him, children and all, to satisfy his delusional mania. Jim Jones, circa 1993. Nope - fruitcakes like Koresh have been demonstrating such evil corruption for centuries.
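
A natural follow-up, assuming the test-set cells above have run: render the classifier weights over a test document, passing its true class as metainf.

In [ ]:
doc_id = 0  # arbitrary test document for illustration
scores2html(data_test.data[doc_id], clf_dict,
            metainf=data_test.target_names[data_test.target[doc_id]],
            highlight_oov=True)
show_in_notebook()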

In [ ]: