In [1]:
import IPython
from IPython.display import HTML
import numpy as np
import pandas as pd
import scipy.stats
from scipy import sparse
from tsa.science import numpy_ext as npx
from collections import Counter
import viz
from sklearn import metrics, cross_validation
from sklearn import linear_model, svm, naive_bayes
from sklearn import feature_selection
from tsa import stdout, stderr
from tsa.lib import tabular, datetime_extra, cache
from tsa.lib.timer import Timer
from tsa.models import Source, Document, create_session
from tsa.science import features, timeseries
from tsa.science.corpora import MulticlassCorpus
from tsa.science.plot import plt, figure_path, distinct_styles, ticker
from tsa.science.summarization import metrics_dict, metrics_summary
import tsa.science.models
from tsa.science.models import Bootstrap, SelectKBest

import logging
logger = logging.getLogger(__name__)  # logger is used in the analysis cells below
In [26]:
full_corpus = MulticlassCorpus(Source.from_name('sb5b', labeled_only=True))
full_corpus.apply_labelfunc(lambda doc: doc.label)
full_corpus.extract_features(lambda doc: 1, features.intercept)
full_corpus.extract_features(lambda doc: doc.document, features.ngrams,
                             ngram_max=2, min_df=2, max_df=1.0)
Out[26]:
In [27]:
polar_classes = [full_corpus.class_lookup[label] for label in ['For', 'Against']]
polar_indices = np.in1d(full_corpus.y, polar_classes)
polar_corpus = full_corpus.subset(rows=polar_indices)
In [28]:
balanced_indices = npx.balance(polar_corpus.y == full_corpus.class_lookup['For'],
                               polar_corpus.y == full_corpus.class_lookup['Against'])
balanced_corpus = polar_corpus.subset(rows=balanced_indices)
We want to see how much accuracy improves if we allow ourselves to throw out some of the data: keep only the test examples whose prediction confidence exceeds a threshold, and evaluate accuracy on that confident subset alone.
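The confidence score used below is one minus twice the harmonic mean of the two class probabilities, so a 50/50 prediction scores 0 and a near-certain prediction approaches 1. A quick sketch of that mapping (using scipy.stats.hmean, which npx.hmean presumably mirrors):
In [ ]:
for p in [0.5, 0.75, 0.9, 0.975, 0.99]:
    confidence = 1 - 2 * scipy.stats.hmean([1 - p, p])
    print 'top-class probability %.3f -> confidence %.3f' % (p, confidence)
# e.g. 0.500 -> 0.000 and 0.975 -> ~0.90, so the 0.90 confidence threshold used
# below keeps only predictions whose top-class probability is above roughly 0.975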
In [33]:
bootstrap_model = Bootstrap(linear_model.LogisticRegression, n_iter=100, proportion=1.0,
                            fit_intercept=False, penalty='l2', C=1.0)
logreg_model = linear_model.LogisticRegression(fit_intercept=False, penalty='l2')
In [43]:
corpus = polar_corpus
In [44]:
folds = cross_validation.StratifiedShuffleSplit(corpus.y, test_size=0.1, n_iter=10)
folds = list(folds)
In [47]:
for train_indices, test_indices in folds:
    train_corpus = corpus.subset(train_indices)
    test_corpus = corpus.subset(test_indices)
    logreg_model.fit(train_corpus.X, train_corpus.y)
    pred_proba = logreg_model.predict_proba(test_corpus.X)
    pred_y = logreg_model.classes_[np.argmax(pred_proba, axis=1)]
    print 'accuracy={:%} on all {:d} examples'.format(
        metrics.accuracy_score(test_corpus.y, pred_y),
        len(pred_y))
    # confidence = 1 - 2 * (harmonic mean of the class probabilities):
    # 0 for a 50/50 prediction, approaching 1 for a near-certain one
    pred_confidence = 1 - (npx.hmean(pred_proba, axis=1) * 2)
    high_confidence_indices = pred_confidence > 0.90
    print 'accuracy={:%} on {:d} confident examples'.format(
        metrics.accuracy_score(test_corpus.y[high_confidence_indices],
                               pred_y[high_confidence_indices]),
        high_confidence_indices.sum())
    print
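To see the accuracy/coverage trade-off more directly, here is a sketch that sweeps the confidence threshold, assuming pred_confidence, pred_y, and test_corpus are still bound to the last fold from the loop above:
In [ ]:
# sweep the confidence threshold on the last fold's predictions
for threshold in [0.0, 0.5, 0.75, 0.9, 0.95]:
    confident = pred_confidence > threshold
    if confident.sum() == 0:
        continue
    print 'threshold={:.2f}: accuracy={:.2%} on {:d} of {:d} examples'.format(
        threshold,
        metrics.accuracy_score(test_corpus.y[confident], pred_y[confident]),
        int(confident.sum()), len(pred_y))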
In [41]:
pred_y = logreg_model.predict(test_corpus.X)
pred_y
Out[41]:
In [20]:
# the right-hand side of this assignment was lost; presumably it recomputed the
# predictions via argmax over predict_proba, to be compared against .predict below
pred_y_2 = logreg_model.classes_[np.argmax(logreg_model.predict_proba(test_corpus.X), axis=1)]
In [22]:
(pred_y == pred_y_2).mean()
Out[22]:
In [3]:
def get_balanced_labels():
    # Subsample an equal number of 'For' and 'Against' examples and return the
    # corresponding slices of the corpus. (This scratch cell references an older
    # corpus interface: corpus.labels, corpus.tweets.)
    balanced_indices = npx.balance(corpus.y == corpus.labels['For'],
                                   corpus.y == corpus.labels['Against'])
    X = corpus.X[balanced_indices, :]
    y = corpus.y[balanced_indices]
    documents = corpus.documents[balanced_indices]
    tweets = [corpus.tweets[index] for index in balanced_indices]
    two_classes = np.array([corpus.labels['For'], corpus.labels['Against']])
    classes = corpus.classes
    labels = corpus.labels
    feature_names = corpus.feature_names
    return X, y, documents, tweets, two_classes, classes, labels, feature_names
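The next cell works with bare X, y, documents, and friends at the top level, so (assuming the corpus object exposes the attributes the function references) the values would be unpacked along these lines:
In [ ]:
X, y, documents, tweets, two_classes, classes, labels, feature_names = get_balanced_labels()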
In [ ]:
indices = npx.indices(y)
# bootstrap_coefs is assumed to be the (n_iter x n_features) matrix of
# per-iteration coefficients collected from the fitted Bootstrap model
# (the fit itself is not shown in this transcript)
coefs_means = np.mean(bootstrap_coefs, axis=0)
coefs_variances = np.var(bootstrap_coefs, axis=0)
# collapse to the mean coefficient vector
bootstrap_coefs = coefs_means
# bootstrap_coefs = coefs_means / (coefs_variances + 1)
# bootstrap_coefs = coefs_means / (coefs_variances + 1)**2
# bootstrap_transformed is just the dot product of each document's
# bag-of-words feature vector with the (mean) coefficient vector
bootstrap_transformed = X.dot(bootstrap_coefs)
# we also want the per-document variance of those per-token contributions,
# which may correlate with the model's overall confidence, or better, may
# explain some of the residual that a simple dot product misses
logger.info('Computing dot-variance')
projections = X.toarray() * bootstrap_coefs
# bootstrap_transformed == projections.sum(axis=1)
projections_variance = projections.var(axis=1)
class_1_probabilities = npx.logistic(bootstrap_transformed)
bootstrap_pred_probabilities = np.column_stack((
1 - class_1_probabilities,
class_1_probabilities))
bootstrap_max_column = np.argmax(bootstrap_pred_probabilities, axis=1)
# class_columns = np.repeat(-1, classes.size)
# class_columns[two_classes] = npx.indices(two_classes)
# order = np.argsort(bootstrap_transformed)
# pred_probs's first column is the probability of the label classes[pred_y[index]]
# so we want to map pred_y[index], which is something like 3 or 4, to 0 or 1
bootstrap_pred_y = two_classes[bootstrap_max_column]
# bootstrap_pred_probabilities holds the probability of each class; we take
# one minus twice their harmonic mean as a measure of how extreme (confident)
# the model's prediction is:
# npx.hmean([0.1, 0.9]) == 0.18, npx.hmean([0.5, 0.5]) == 0.5
bootstrap_pred_confidence = 1 - (npx.hmean(bootstrap_pred_probabilities, axis=1) * 2)
# bootstrap_pred_confidence is thus the adjusted confidence of the chosen class,
# i.e., the top-class probability rescaled from [0.5, 1] onto [0, 1]
# a more linear alternative, for k classes, rescales the top-class probability x
# from its minimum 1/k up to 1:
# 1 class:   min = 1,   nothing to rescale
# 2 classes: min = .5,  (x - 1/2)*(2/1) == 1 - (1 - x)*2
# 3 classes: min = .33, (x - 1/3)*(3/2)
# 4 classes: min = .25, (x - 1/4)*(4/3)
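# For illustration only (hypothetical helper, not used below; the harmonic-mean
# confidence above penalizes near-ties more sharply than this linear version):
def linear_confidence(x, k):
    return (x - 1.0 / k) * (k / (k - 1.0))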
# plt.plot(a, label='up')
# plt.plot(a, label='down')
# plt.plot(npx.hmean(np.column_stack((a, b, c)), axis=1), label='hmean')
# IPython.embed()
print 'Bootstrap overall accuracy: %.4f' % metrics.accuracy_score(y, bootstrap_pred_y)
# binned_accuracy(bootstrap_transformed, y, bootstrap_pred_y)
errors_mask = bootstrap_pred_y != y
logger.info('The model mis-predicted %d out of a total of %d', errors_mask.sum(), y.size)
bounds = npx.bounds(bootstrap_transformed)
# hist is a histogram helper (presumably from the tsa/npx utilities) that
# renders a histogram of the given values over the given bounds
print 'Transforms of mispredictions'
hist(bootstrap_transformed[errors_mask], bounds)
print 'Transforms of correct predictions'
hist(bootstrap_transformed[~errors_mask], bounds)
# hist((np.max(bootstrap_pred_probabilities[errors_mask], axis=1)-0.5)*2.0)
def render_predictions_histograms():
    plt.cla()
    plt.hist((np.max(bootstrap_pred_probabilities[errors_mask], axis=1) - 0.5) * 2.0, bins=25)
    plt.title('Mispredictions')
    plt.xlabel('Probability of assigned label')
    plt.ylabel('# of tweets')
    plt.gcf().set_size_inches(8, 5)
    plt.savefig(figure_path('hist-proba-incorrect.pdf'))
    # hist((np.max(bootstrap_pred_probabilities[~errors_mask], axis=1)-0.5)*2.0)
    plt.cla()
    plt.hist((np.max(bootstrap_pred_probabilities[~errors_mask], axis=1) - 0.5) * 2.0, bins=25)
    plt.title('Correct predictions')
    plt.xlabel('Probability of assigned label')
    plt.ylabel('# of tweets')
    plt.gcf().set_size_inches(8, 5)
    plt.savefig(figure_path('hist-proba-correct.pdf'))
# errors_indices = balanced_indices[errors_mask]
# pred_pairs = np.column_stack((bootstrap_pred_y, y))
# Counter(zip(bootstrap_pred_y, y))
# confusion_matrix = metrics.confusion_matrix(y, bootstrap_pred_y, range(len(classes)))
# rownames = ['(Correct) ' + name for name in classes]
# print pd.DataFrame(confusion_matrix, index=rownames, columns=classes)
'''
Negative, in this case, means "For":
e.g., bootstrap_transformed[84] == -16.36, which corresponds to the following SUPER "For"-SB5 tweet:
    RT @GOHPBlog: ICYMI: Say 'YES' to Ohio jobs. Say 'YES' to
    #Issue2 RT @BetterOhio: What Issue 2 means for Ohio Jobs: http://t.co/HJ8sL4l8 - #YesOn2
Positive means "Against"; here's bootstrap_transformed[2905] == 10.62:
    RT @ProgressOhio: Ohio Issue 2: 30 TV Stations Pull Misleading Anti-Union Ad [UPDATE]
    http://t.co/BUxxH3yz #p2 #1U #SB5 #Issue2 #WeAreOhio #StandUpOh #NoOn2
'''
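# As a quick check of that sign convention, the most extreme document on each
# side can be pulled up directly (argmin = most "For", argmax = most "Against"):
print 'Most "For":    ', documents[np.argmin(bootstrap_transformed)]
print 'Most "Against":', documents[np.argmax(bootstrap_transformed)]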
# pred_y is an array of labels, like [3, 4, 4, 3, 4 ... 3, 4, 4, 4, 4]
pred_y = bootstrap_pred_y
pred_confidence = bootstrap_pred_confidence
# def print_predictions(indices):
selected_indices = []
for index in selected_indices:
    print '\nModel predicted %r, annotated as %r' % (classes[pred_y[index]], classes[y[index]])
    print 'Confidence in prediction = %0.5f, variance of projection = %0.5f' % (
        pred_confidence[index], projections_variance[index])
    print ' ', documents[index]
    print
print
# correct_indices_all = npx.bool_mask_to_indices(~errors_mask)
# randomly pick up to 50
# correct_indices_50 = np.random.choice(correct_indices_all, size=50, replace=False)
ordering = pred_confidence.argsort()
# ordering is from least confident to most confident
incorrect_indices_ordered_by_confidence = indices[ordering][errors_mask[ordering]]
correct_indices_ordered_by_confidence = indices[ordering][~errors_mask[ordering]]
print '50 most confident correct predictions'
selected_indices = correct_indices_ordered_by_confidence[-50:]
print '50 least confident correct predictions'
selected_indices = correct_indices_ordered_by_confidence[:50]
print '50 most confident incorrect predictions'
selected_indices = incorrect_indices_ordered_by_confidence[-50:]
print '50 least confident incorrect predictions'
selected_indices = incorrect_indices_ordered_by_confidence[:50]
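# Each of the selections above was presumably followed by re-running the
# inspection loop over selected_indices (defined earlier in this cell) to
# print out the corresponding tweets.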
# print 'correct_indices_50.shape:', correct_indices_50.shape, 'y.shape:', y.shape
# print 'bootstrap_pred_y.shape:', bootstrap_pred_y.shape, 'documents.shape', documents.shape
# indices[]
# classes[3]
for_indices = indices[y == labels['For']]
against_indices = indices[y == labels['Against']]
plt.figure(111)
plt.hist(pred_confidence[for_indices], bins=25)
plt.figure(112)
plt.hist(pred_confidence[against_indices], bins=25)
# estBetaParams <- function(mu, var) {
def fit_beta(xs):
    # method-of-moments fit: http://en.wikipedia.org/wiki/Beta_distribution
    mean = np.mean(xs)
    variance = np.var(xs)
    bias = ((mean * (1 - mean)) / variance) - 1
    alpha = mean * bias
    beta = (1 - mean) * bias
    return alpha, beta
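# Sanity check of the moment matching: a Beta(2, 5) distribution has
# mean = 2/7 and variance = (2*5)/((2+5)**2 * (2+5+1)) = 10/392, so
# bias = ((2/7)*(5/7))/(10/392) - 1 = 8 - 1 = 7 = alpha + beta,
# recovering alpha = (2/7)*7 = 2 and beta = (5/7)*7 = 5.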
def plot_beta(alpha, beta):
    grid = np.arange(100) / 100.0
    # (could also sample with np.random.beta(alpha, beta); here we plot the fitted density)
    plt.plot(grid, scipy.stats.beta.pdf(grid, alpha, beta), color='r')
def data_with_beta(xs):
    alpha, beta = fit_beta(xs)
    plt.hist(xs)
    plot_beta(alpha, beta)
beta_support = np.arange(100) / 100.0
plt.cla()
xs = pred_confidence[against_indices]
plt.hist(xs, normed=True, bins=25, color='r', alpha=0.5)
alpha, beta = fit_beta(xs)
plt.plot(beta_support, scipy.stats.beta.pdf(beta_support, alpha, beta), color='r', label='Against')
xs = pred_confidence[for_indices]
plt.hist(xs, normed=True, bins=25, color='b', alpha=0.5)
alpha, beta = fit_beta(xs)
plt.plot(beta_support, scipy.stats.beta.pdf(beta_support, alpha, beta), color='b', label='For')
plt.title('Confidence by label')
plt.xlabel('Higher = more confident')
plt.ylabel('Density')
plt.gcf().set_size_inches(8, 5)
plt.savefig(figure_path('confidence-by-label.pdf'))
# correct_indices_50
IPython.embed(); exit()  # interactive breakpoint left in during exploration
# most_extreme_to_least_extreme = np.argsort(-np.abs(bootstrap_transformed))
# requires the correct_indices_50 sampling (commented out above) to be enabled
print_predictions(correct_indices_50, y, bootstrap_pred_y, bootstrap_pred_probabilities, documents)
# bootstrap_pred_probabilities
most_extreme_to_least_extreme = np.argsort(-np.abs(bootstrap_transformed))
# ordered_most_extreme_to_least_extreme
print 'Top 25 most extreme correct predictions'
# (as written this selects all correct predictions; presumably the
# most_extreme_to_least_extreme ordering above was meant to be applied
# and truncated to the top 25)
selected_indices = npx.bool_mask_to_indices(~errors_mask)
print_predictions(selected_indices, y, bootstrap_pred_y, bootstrap_pred_probabilities, documents)
IPython.embed() or exit()
def old_describe(X, error_index):
    # flt is assumed to be a float-formatting helper; defined here for completeness
    flt = lambda value: '%.2f' % value
    x = X[error_index].toarray().ravel()
    # could also just use the .indices of a CSR matrix
    nonzero = x > 0
    # total = coefs_means.dot(x) # = sum(values)
    x_names = feature_names[nonzero]
    x_means = x[nonzero] * coefs_means[nonzero]
    x_variances = x[nonzero] * coefs_variances[nonzero]
    reordering = np.argsort(x_means)
    # print a glossed table of per-token contributions: token, mean, variance
    print viz.gloss.gloss(
        [('', 'means ', 'vars')] + zip(
            x_names[reordering],
            map(flt, x_means[reordering]),
            map(flt, x_variances[reordering]),
        ) +
        [('', '%.2f' % sum(x_means), '%.2f' % sum(x_variances))]
    )
# what differentiates the hard-to-classify tweets near the middle (which the
# model gets wrong) from the tweets near the middle that it gets right?
biased_model = linear_model.LogisticRegression(fit_intercept=False)
biased_model.fit(X, y)
# biased_pred_y = biased_model.predict(X)
# print 'Biased overall accuracy: %.4f' % metrics.accuracy_score(y, biased_pred_y)