In [108]:
import re
import codecs

import numpy as np
import pandas as pd

import matplotlib as mpl
from matplotlib.cm import get_cmap
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import fetch_20newsgroups
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
In [3]:
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
remove = ('headers', 'footers', 'quotes')
In [4]:
data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42,
                                remove=remove)
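A quick sanity check on what came back. Note that fetch_20newsgroups sorts the categories, so label indices follow data_train.target_names, not the order of the list above:
In [ ]:
# fetch_20newsgroups sorts categories alphabetically, so label indices
# map to data_train.target_names, not the `categories` list above
print("%d documents" % len(data_train.data))
print(data_train.target_names)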
In [6]:
data_train.data[0]
Out[6]:
In [128]:
# map document index -> raw text and document index -> class label
text_dict = {k: v for k, v in enumerate(data_train.data)}
doc_cat = {k: v for k, v in enumerate(data_train.target)}
In [133]:
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
In [109]:
# Alternative: a stateless hashing vectorizer followed by tf-idf weighting
# (no vocabulary kept in memory, but feature names are lost)
# hasher = HashingVectorizer(stop_words='english', alternate_sign=False, norm=None, binary=False)
# vectorizer = make_pipeline(hasher, TfidfTransformer())
In [134]:
X_train = vectorizer.fit_transform(data_train.data)
print("n_samples: %d, n_features: %d" % X_train.shape)
In [135]:
feature_names = vectorizer.get_feature_names_out()  # get_feature_names() in older scikit-learn
print(len(feature_names))
In [30]:
pd.DataFrame(X_train[0].toarray())
Out[30]:
In [161]:
X_train
Out[161]:
In [31]:
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names. '''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats, columns=['feature', 'tfidf'])
    return df
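A quick sanity check of the sorting on a toy row (the feature names here are made up, just for illustration):
In [ ]:
# toy example: should return banana (0.9) and cherry (0.3)
top_tfidf_feats(np.array([0.1, 0.9, 0.3]), ['apple', 'banana', 'cherry'], top_n=2)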
In [33]:
def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Top tfidf features in a specific document (matrix row). '''
    row = np.squeeze(Xtr[row_id].toarray())
    return top_tfidf_feats(row, features, top_n)
In [37]:
# label indices follow the sorted data_train.target_names, not the order of `categories`
print(data_train.target_names[doc_cat.get(1)])
print(top_feats_in_doc(X_train, feature_names, 1))
In [42]:
feature_tfidf_wts = top_feats_in_doc(X_train, feature_names, 1)
print(type(feature_tfidf_wts))
In [69]:
scores = feature_tfidf_wts.set_index('feature').to_dict()['tfidf']
In [121]:
print(type(data_train.data[1]))
print(data_train.target.shape)
In [129]:
def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that on average are most important amongst documents in rows
    identified by indices in grp_ids. '''
    if grp_ids is not None:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()
    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)
In [130]:
def top_feats_by_class(Xtr, y, features, min_tfidf=0.1, top_n=25):
    ''' Return a list of dfs, where each df holds top_n features and their mean tfidf value
    calculated across documents with the same class label. '''
    dfs = []
    labels = np.unique(y)
    for label in labels:
        ids = np.where(y == label)[0]
        feats_df = top_mean_feats(Xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n)
        feats_df.label = label  # stash the class label on the frame for later use
        dfs.append(feats_df)
    return dfs
In [142]:
result = top_feats_by_class(X_train, data_train.target, features=feature_names)
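The per-class results are easier to compare side by side. A minimal plotting sketch (the chart layout is my own choice, not part of the original analysis); it relies on the .label attribute stashed on each frame above:
In [ ]:
# sketch: one horizontal bar chart of top mean tf-idf features per class
fig, axes = plt.subplots(1, len(result), figsize=(4 * len(result), 7))
for ax, df in zip(axes, result):
    ax.barh(range(len(df)), df.tfidf, align='center')
    ax.set_yticks(range(len(df)))
    ax.set_yticklabels(df.feature)
    ax.invert_yaxis()  # highest-scoring feature on top
    ax.set_title(data_train.target_names[df.label])
plt.tight_layout()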
In [83]:
def preprocess_text(text, to_lower=True, norm_num=False):
    # remove links (other HTML markup is assumed to be stripped already, e.g. by BeautifulSoup)
    text = re.sub(r"http(s)?://\S*", " ", text)
    if to_lower:
        text = text.lower()
    if norm_num:
        text = re.sub(r"[0-9]", "1", text)  # normalize numbers
    # clean out non-alphanumeric characters (keeping hyphens) and normalize whitespace
    text = re.sub(r"[^A-Za-z0-9-]+", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()
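A quick check that the preprocessing behaves as intended (the input string is made up):
In [ ]:
# links dropped, lowercased, digits normalized to 1, punctuation collapsed
preprocess_text("See https://example.com -- it COSTS $10!", norm_num=True)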
In [99]:
def scores2html(text, scores, fname='testfile', metainf='', pos_clr_name='Blues',
                neg_clr_name='Reds', highlight_oov=False):
    """
    Based on the original text and relevance scores, generate a html doc highlighting positive / negative words.
    Colormap reference: http://matplotlib.org/examples/color/colormaps_reference.html
    Inputs:
        - text: the raw text in which the words should be highlighted
        - scores: a dictionary with {word: score} or a list of tuples [(word, score)]
        - fname: the name (path) of the file
        - metainf: an optional string which will be added at the top of the file (e.g. true class of the document)
        - pos_clr_name / neg_clr_name: matplotlib colormap names for positive / negative scores
        - highlight_oov: if True, out-of-vocabulary words will be highlighted in yellow (default False)
    Saves the visualization in 'fname.html' (you probably want to make this a whole path to not clutter your main directory...)
    """
    # colormaps
    cmap_pos = get_cmap(pos_clr_name)
    cmap_neg = get_cmap(neg_clr_name)
    norm = mpl.colors.Normalize(0., 1.)
    # normalize scores by the absolute max value
    if isinstance(scores, dict):
        N = np.max(np.abs(list(scores.values())))
        scores_dict = {word: scores[word] / N for word in scores}
        # transform dict into word list with scores
        scores = []
        for word in re.findall(r'[\w-]+', text, re.UNICODE):
            word_pp = preprocess_text(word)
            if word_pp in scores_dict:
                scores.append((word, scores_dict[word_pp]))
            else:
                scores.append((word, None))
    else:
        N = np.max(np.abs([t[1] for t in scores if t[1] is not None]))
        scores = [(w, s / N) if s is not None else (w, None) for w, s in scores]
    htmlstr = u'<body><div style="white-space: pre-wrap; font-family: monospace;">'
    if metainf:
        htmlstr += '%s\n\n' % metainf
    resttext = text
    for word, score in scores:
        # was anything before the identified word? add it unchanged to the html
        htmlstr += resttext[:resttext.find(word)]
        # cut off the identified word
        resttext = resttext[resttext.find(word) + len(word):]
        # get the color code of the word
        rgbac = (1., 1., 0.)  # yellow for unknown (out-of-vocabulary) words
        alpha = 0.3 if highlight_oov else 0.
        if score is not None:
            if score < 0:
                rgbac = cmap_neg(norm(-score))
            else:
                rgbac = cmap_pos(norm(score))
            alpha = 0.5
        htmlstr += u'<span style="background-color: rgba(%i, %i, %i, %.1f)">%s</span>' \
            % (round(255 * rgbac[0]), round(255 * rgbac[1]), round(255 * rgbac[2]), alpha, word)
    # after the last word, add the rest of the text
    htmlstr += resttext
    htmlstr += u'</div></body>'
    with codecs.open('%s.html' % fname, 'w', encoding='utf8') as f:
        f.write(htmlstr)
In [100]:
scores2html(data_train.data[1], scores, highlight_oov=True)
In [101]:
print(data_train.data[1])
In [116]:
def show_in_notebook():
    from IPython.display import HTML
    # HTML(filename=...) renders the saved file; HTML('./testfile.html') would
    # treat the string itself as the markup to display
    return HTML(filename='./testfile.html')
show_in_notebook()
Out[116]:
In [145]:
## Build a classifier and see if you can visualize feature contribution ...
In [147]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(alpha=.0001, max_iter=50, penalty="elasticnet")  # n_iter in older scikit-learn
In [148]:
y = data_train.target
clf.fit(X_train, y)
Out[148]:
In [163]:
np.squeeze(clf.coef_[0])
Out[163]:
In [155]:
data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42,
                               remove=remove)
print('data loaded')
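The test split is fetched but never evaluated below; a minimal sketch of how that would look, assuming we reuse the fitted vectorizer:
In [ ]:
# sketch: transform the test documents with the already-fitted vectorizer
X_test = vectorizer.transform(data_test.data)
print("test accuracy: %.3f" % clf.score(X_test, data_test.target))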
In [171]:
print(len(feature_names))
clf.predict(X_train[0])
Out[171]:
In [180]:
# weights for class 0 (alt.atheism under the sorted target_names)
temp = [(a, b) for a, b in zip(feature_names, np.squeeze(clf.coef_[0]))]
clf_wts_df = pd.DataFrame(temp, columns=['feature', 'wts'])
clf_dict = clf_wts_df.set_index('feature').to_dict()['wts']
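Before visualizing, it is worth peeking at which features carry the largest weights for class 0:
In [ ]:
# largest positive and most negative classifier weights for class 0
print(clf_wts_df.sort_values('wts', ascending=False).head(10))
print(clf_wts_df.sort_values('wts').head(10))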
In [182]:
type(clf_dict)
Out[182]:
In [184]:
scores2html(data_train.data[1], clf_dict, highlight_oov=True)
show_in_notebook()
Out[184]:
In [ ]: