In [10]:
import IPython
from IPython.display import HTML
import numpy as np
import pandas as pd
from scipy import sparse
from tsa.science import numpy_ext as npx
from collections import Counter
import viz
from sklearn import metrics, cross_validation
from sklearn import linear_model, svm, naive_bayes
from sklearn import feature_selection
from tsa import stdout, stderr
from tsa.lib import tabular, datetime_extra, cache
from tsa.lib.timer import Timer
from tsa.models import Source, Document, create_session
from tsa.science import features, timeseries
from tsa.science.corpora import MulticlassCorpus
from tsa.science.plot import plt, figure_path, distinct_styles, ticker
from tsa.science.summarization import metrics_dict, metrics_summary
In [33]:
# Hot-reload the project's models module so edits to Bootstrap/SelectKBest
# are picked up without restarting the kernel (Python 2 builtin `reload`).
import tsa.science.models
reload(tsa.science.models)
from tsa.science.models import Bootstrap, SelectKBest
In [4]:
# Build the labeled 'sb5b' corpus: an explicit intercept feature plus
# 1-2gram counts (terms must occur in at least 2 documents).
full_corpus = MulticlassCorpus(Source.from_name('sb5b', labeled_only=True))
full_corpus.apply_labelfunc(lambda doc: doc.label)
full_corpus.extract_features(lambda doc: 1, features.intercept)
full_corpus.extract_features(lambda doc: doc.document, features.ngrams,
    ngram_max=2, min_df=2, max_df=1.0)
# Restrict to the two polar classes ('For' vs. 'Against') for binary analysis.
polar_classes = [full_corpus.class_lookup[label] for label in ['For', 'Against']]
polar_indices = np.in1d(full_corpus.y, polar_classes)
polar_corpus = full_corpus.subset(rows=polar_indices)
In [5]:
def extreme_features(feature_names, feature_values, margin=10):
    '''Yield three windows over the features after sorting by feature_values
    in descending order: the `margin` largest, `margin // 2` centered on the
    midpoint, and the `margin` smallest.

    Yields (slice_name, values_window, names_window) tuples, where the
    windows index into the descending-sorted value/name arrays.
    '''
    rank = np.argsort(-feature_values)  # positions of values, largest first
    sorted_values = feature_values[rank]
    sorted_names = feature_names[rank]
    total = feature_names.size
    mid_start = (total // 2) - (margin // 4)
    windows = (
        ('Largest', slice(0, margin)),
        ('Middle', slice(mid_start, mid_start + margin // 2)),
        ('Smallest', slice(total - margin, total)),
    )
    for label, window in windows:
        yield label, sorted_values[window], sorted_names[window]
In [39]:
# Candidate coefficient-producing models. All linear models are fit with
# fit_intercept=False because the corpus already carries an explicit
# intercept feature column.
models = [
    ('Anova', SelectKBest(score_func=feature_selection.f_classif, k='all')),
    ('Bootstrap', Bootstrap(linear_model.LogisticRegression, n_iter=1000, proportion=1.0,
        fit_intercept=False, penalty='l2', C=1.0)),
    ('Logistic Regression (L2)', linear_model.LogisticRegression(penalty='l2',
        fit_intercept=False)),
    ('Logistic Regression (L2) (C=100)', linear_model.LogisticRegression(penalty='l2', C=100.0,
        fit_intercept=False)),
    ('Logistic Regression (L1)', linear_model.LogisticRegression(penalty='l1',
        fit_intercept=False)),
    # ('randomized_logistic_regression', linear_model.RandomizedLogisticRegression()),
    ('Perceptron (L2)', linear_model.Perceptron(penalty='l2', fit_intercept=False)),
    ('Perceptron (L1)', linear_model.Perceptron(penalty='l1', fit_intercept=False)),
    ('Linear SVC (L2)', svm.LinearSVC(penalty='l2', fit_intercept=False)),
    ('Linear SVC (L1)', svm.LinearSVC(penalty='l1', dual=False, fit_intercept=False)),
    ('Naive Bayes', naive_bayes.MultinomialNB()),
]
# NOTE(review): this truncation keeps only the first model ('Anova') for the
# cells below; remove it to compare the whole list.
models = models[:1]
In [40]:
# Fit each model on the polar corpus and record the extreme coefficients in
# per-slice DataFrames: one column of values and one of matching feature
# names per model.
# pred_y = model.predict(test_corpus.X)
# ordering = np.argsort(-np.abs(coefs))
extremes = dict(Largest=pd.DataFrame(), Middle=pd.DataFrame(), Smallest=pd.DataFrame())
for model_name, model in models:
    model.fit(polar_corpus.X, polar_corpus.y)
    print model_name, 'coef_.shape:', model.coef_.shape
    for key, values, names in extreme_features(polar_corpus.feature_names, model.coef_.ravel(), margin=10):
        extremes[key][model_name] = values
        extremes[key][model_name + '-names'] = names
# printer = tabular.Printer(FS=' & ', RS='\\\\\n')
# printer.write(row_dict)
In [41]:
# Stack the three slices into one frame and render it as a LaTeX table.
extremes_df = pd.concat(extremes, axis=0)
print extremes_df.to_latex(float_format=lambda x: '%5.2f' % x)
In [46]:
# model = Bootstrap(linear_model.LogisticRegression, n_iter=1000, proportion=1.0, fit_intercept=False, penalty='l2', C=1.0)
# model.fit(polar_corpus.X, polar_corpus.y)
# NOTE(review): this cell relies on `model` left over from a previous cell
# (hidden kernel state) and needs a fitted Bootstrap (with a `coefs_`
# attribute) -- re-enable the two commented lines above for a clean re-run.
model_name = 'Bootstrap-Variance'
extremes = dict(Largest=pd.DataFrame(), Middle=pd.DataFrame(), Smallest=pd.DataFrame())
# Rank features by the variance of their bootstrapped coefficients.
feature_values = np.var(model.coefs_, axis=0)
for key, values, names in extreme_features(polar_corpus.feature_names, feature_values, margin=10):
    extremes[key][model_name] = values
    extremes[key][model_name + '-names'] = names
extremes_df = pd.concat(extremes, axis=0)
print extremes_df.to_latex(float_format=lambda x: '%5.2f' % x)
In [19]:
print extremes_df.to_latex(float_format=lambda x: '%5.2f' % x)
Out[19]:
In [ ]:
# NOTE(review): `corpus_top_k_features_subset`, `polar_model`,
# `corpus_mean_accuracy` and `regularization` are not defined anywhere in
# this notebook -- this cell raises NameError on a fresh kernel.
for k in [10, 20, 30, 40, 50, 100, 200, 250, 500, 750, 1000, 5000, 10000]:
    subvocab_corpus = corpus_top_k_features_subset(polar_corpus, polar_model, k)
    accuracy = corpus_mean_accuracy(subvocab_corpus, penalty=regularization, n_iter=10)
In [37]:
# Univariate ANOVA F-test feature scoring (project SelectKBest wrapper,
# which exposes the scores via `coef_`).
k_best_model = SelectKBest(score_func=feature_selection.f_classif, k=10)
k_best_model.fit(polar_corpus.X, polar_corpus.y)
plt.hist(k_best_model.coef_)
Out[37]:
In [26]:
# Raw per-feature ANOVA F-statistics and p-values; drop NaN entries before
# plotting.
f_values, p_values = feature_selection.f_classif(polar_corpus.X, polar_corpus.y)
f_values = f_values[~np.isnan(f_values)]
p_values = p_values[~np.isnan(p_values)]
In [42]:
_ = plt.hist(f_values, log=True, bins=50)
f_values.max()
Out[42]:
In [47]:
# NOTE(review): `anova_f_value` is not defined; presumably this meant the
# `f_values` array computed above -- confirm.
plt.hist(anova_f_value)
Looking more closely at the fitted model's coefficients.
In [49]:
# Fit a single L2 logistic regression; its coefficients are glossed below.
model = linear_model.LogisticRegression(penalty='l2', fit_intercept=False)
model.fit(polar_corpus.X, polar_corpus.y)
Out[49]:
In [115]:
# Flatten coefficients to a 1-d vector aligned with the feature columns.
coefs = np.array(model.coef_).ravel()
print coefs.shape, polar_corpus.X.shape
# transforms =
pred_proba = model.predict_proba(polar_corpus.X)
In [119]:
Out[119]:
In [125]:
import viz
def gloss_datum(corpus, index, coefs):
    '''Explain a single document's score under a linear model.

    Parameters:
        corpus: corpus-like object with .data (documents), .X (sparse
            document-feature matrix) and .feature_names (numpy array of
            strings) -- assumes rows of X align with corpus.data.
        index: row index of the document to gloss.
        coefs: 1-d array of per-feature coefficients, same length as a row
            of corpus.X.

    Returns a (label, fulltext, table) tuple: the document's gold label, its
    text with newlines flattened to spaces, and a 2-row DataFrame
    (tokens / values) listing each active feature's contribution plus a
    trailing 'SUM' entry holding the total projection.
    '''
    document = corpus.data[index]
    # Densify this document's feature row.
    x = corpus.X[index].toarray().ravel()
    nonzero_features = x > 0
    nonzero_feature_names = corpus.feature_names[nonzero_features]
    # Per-feature contribution to the decision value for this document.
    projection = x * coefs
    nonzero_values = projection[nonzero_features]
    fulltext = document.document.replace('\n', ' ')
    table = pd.DataFrame.from_dict(
        dict(tokens=np.concatenate((nonzero_feature_names, ['SUM'])),
             values=np.concatenate((nonzero_values, [projection.sum()])))).T
    return document.label, fulltext, table
In [ ]:
# Scratch cell: earlier print-based glossing, superseded by gloss_datum above.
# print
# print '--- %s ---' % test_corpus.labels[test_corpus.y[index]]
# print '%s (%s)' % ( document.label)
# print dict(zip(corpus.labels[model.classes_], prob))
# print viz.gloss.gloss([('', 'means')] + pairs + [('SUM', )])
In [126]:
# Sanity check: harmonic mean of predicted probabilities for an extreme row.
pred_proba, npx.hmean(np.array([[ 0.9999 , 0.0001 ]]), axis=1)
Out[126]:
In [144]:
values = npx.hmean(pred_proba, axis=1)
# NOTE(review): the line below immediately overwrites the hmean-based values
# with the raw decision values (X . coefs) -- keep only the ranking that is
# actually intended.
values = polar_corpus.X.dot(coefs)
# Gloss the 20 highest-scoring documents.
ordering = np.argsort(values)
selected_indices = ordering[-20:]
for selected_index in selected_indices:
    # print selected_index, values[selected_index]
    label, text, table = gloss_datum(polar_corpus, selected_index, coefs)
    print selected_index, label, text, values[selected_index]
In [145]:
# Gloss one specific document and render its contribution table as HTML.
selected_index = 9656
print int(selected_index)
label, text, table = gloss_datum(polar_corpus, selected_index, coefs)
print label, text
HTML(table.to_html(float_format=lambda x: '%.2f' % x))
Out[145]:
In [148]:
# Same table, rendered as LaTeX for the paper.
print table.T.to_latex(float_format=lambda x: '%.2f' % x)
In [96]:
# Scratch: verify np.concatenate works on a string feature-name array.
nonzero_feature_names = polar_corpus.feature_names[1:9]
np.concatenate((nonzero_feature_names, ['hi']))
In [97]:
nonzero_feature_names
Out[97]:
In [101]:
Out[101]:
In [ ]: