In [1]:
import IPython
from IPython.display import HTML

import numpy as np
import pandas as pd
from scipy import sparse
from tsa.science import numpy_ext as npx
from collections import Counter

import viz

from sklearn import metrics, cross_validation
from sklearn import linear_model, svm, naive_bayes
from sklearn import feature_selection

from tsa import stdout, stderr
from tsa.lib import tabular, datetime_extra, cache
from tsa.lib.timer import Timer
from tsa.models import Source, Document, create_session
from tsa.science import features, timeseries
from tsa.science.corpora import MulticlassCorpus
from tsa.science.plot import plt, figure_path, distinct_styles, ticker
from tsa.science.summarization import metrics_dict, metrics_summary

import tsa.science.models
from tsa.science.models import Bootstrap, SelectKBest

In [26]:
full_corpus = MulticlassCorpus(Source.from_name('sb5b', labeled_only=True))
full_corpus.apply_labelfunc(lambda doc: doc.label)
full_corpus.extract_features(lambda doc: 1, features.intercept)
full_corpus.extract_features(lambda doc: doc.document, features.ngrams,
    ngram_max=2, min_df=2, max_df=1.0)


Out[26]:
array([    1,     2,     3, ..., 48118, 48119, 48120])

In [27]:
polar_classes = [full_corpus.class_lookup[label] for label in ['For', 'Against']]
polar_indices = np.in1d(full_corpus.y, polar_classes)
polar_corpus = full_corpus.subset(rows=polar_indices)

In [28]:
balanced_indices = npx.balance(polar_corpus.y == full_corpus.class_lookup['For'],
                               polar_corpus.y == full_corpus.class_lookup['Against'])
balanced_corpus = polar_corpus.subset(rows=balanced_indices)
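
npx.balance presumably downsamples so that both boolean masks select the same number of rows; as a minimal sketch of that idea (an assumed stand-in, not the actual tsa implementation):

In [ ]:
# hypothetical stand-in for npx.balance: equalize two boolean masks by
# downsampling the larger group, returning the kept row indices
def balance_masks(mask_a, mask_b, seed=0):
    rng = np.random.RandomState(seed)
    indices_a = np.flatnonzero(mask_a)
    indices_b = np.flatnonzero(mask_b)
    n = min(indices_a.size, indices_b.size)
    keep = np.concatenate([rng.choice(indices_a, n, replace=False),
                           rng.choice(indices_b, n, replace=False)])
    return np.sort(keep)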

We want to see how much accuracy improves if we allow ourselves to throw out some of the data: we keep only the predictions whose confidence exceeds a threshold and evaluate accuracy on that subset.
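
As a rough, self-contained illustration of that tradeoff (synthetic data only; scipy.stats.hmean and the toy model below stand in for npx.hmean and the real corpus), raising the confidence threshold keeps fewer examples but tends to raise accuracy on the kept subset:

In [ ]:
# illustrative only: sweep a confidence threshold on synthetic data and report
# accuracy alongside the fraction of test examples retained
from scipy.stats import hmean
from sklearn import datasets

X_toy, y_toy = datasets.make_classification(n_samples=2000, n_informative=10, random_state=0)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X_toy, y_toy, test_size=0.5, random_state=0)
toy_model = linear_model.LogisticRegression().fit(X_train, y_train)
proba = np.clip(toy_model.predict_proba(X_test), 1e-12, 1.0)
pred = toy_model.classes_[proba.argmax(axis=1)]
confidence = 1 - 2 * hmean(proba, axis=1)  # 0 for a 50/50 prediction, near 1 when one class dominates
for threshold in [0.0, 0.5, 0.9]:
    keep = confidence > threshold
    if keep.any():
        print 'threshold=%.1f: accuracy=%.3f on %.0f%% of test examples' % (
            threshold, metrics.accuracy_score(y_test[keep], pred[keep]), 100.0 * keep.mean())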


In [33]:
bootstrap_model = Bootstrap(linear_model.LogisticRegression, n_iter=100, proportion=1.0,
          fit_intercept=False, penalty='l2', C=1.0)
logreg_model = linear_model.LogisticRegression(fit_intercept=False, penalty='l2')

In [43]:
corpus = polar_corpus

In [44]:
folds = cross_validation.StratifiedShuffleSplit(corpus.y, test_size=0.1, n_iter=10)
folds = list(folds)

In [47]:
for train_indices, test_indices in folds:
    train_corpus = corpus.subset(train_indices)
    test_corpus = corpus.subset(test_indices)
    logreg_model.fit(train_corpus.X, train_corpus.y)
    
    pred_proba = logreg_model.predict_proba(test_corpus.X)
    pred_y = logreg_model.classes_[np.argmax(pred_proba, axis=1)]
    print 'accuracy={:%} on all {:d} examples'.format(
        metrics.accuracy_score(test_corpus.y, pred_y),
        len(pred_y)
    )

    # 1 - 2*hmean(p) is 0 when the class probabilities are equal and approaches 1
    # as the predicted probability mass concentrates on one class
    pred_confidence = 1 - (npx.hmean(pred_proba, axis=1)*2)
    high_confidence_indices = pred_confidence > 0.90
    
    print 'accuracy={:%} on {:d} confident examples'.format(
        metrics.accuracy_score(test_corpus.y[high_confidence_indices], pred_y[high_confidence_indices]),
        high_confidence_indices.sum()
    )
    print


accuracy=96.404989% on all 1363 examples
accuracy=99.309665% on 1014 confident examples

accuracy=95.891416% on all 1363 examples
accuracy=99.219512% on 1025 confident examples

accuracy=96.258254% on all 1363 examples
accuracy=99.036609% on 1038 confident examples

accuracy=95.891416% on all 1363 examples
accuracy=99.319066% on 1028 confident examples

accuracy=96.258254% on all 1363 examples
accuracy=99.306931% on 1010 confident examples

accuracy=96.478357% on all 1363 examples
accuracy=99.216454% on 1021 confident examples

accuracy=96.625092% on all 1363 examples
accuracy=99.415774% on 1027 confident examples

accuracy=96.184886% on all 1363 examples
accuracy=99.224806% on 1032 confident examples

accuracy=96.625092% on all 1363 examples
accuracy=99.606686% on 1017 confident examples

accuracy=96.111519% on all 1363 examples
accuracy=99.496982% on 994 confident examples


In [41]:
pred_y = logreg_model.predict(test_corpus.X)
pred_y


Out[41]:
array([4, 0, 4, 4, 4, 4, 0, 4, 0, 0, 0, 0, 4, 4, 4, 4, 0, 4, 0, 0, 0, 0, 4,
       0, 4, 4, 0, 0, 4, 0, 0, 4, 4, 0, 0, 4, 0, 0, 0, 0, 4, 0, 4, 0, 0, 4,
       0, 4, 4, 0, 4, 4, 0, 0, 4, 4, 4, 4, 0, 4, 0, 4, 4, 4, 0, 0, 4, 0, 0,
       4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 4, 4, 4, 0, 4, 0, 0, 4, 0, 4, 4, 4,
       0, 0, 0, 4, 0, 0, 0, 0, 4, 0, 0, 4, 4, 4, 0, 4, 0, 4, 0, 0, 4, 4, 0,
       4, 4, 0, 0, 4, 4, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0,
       4, 0, 0, 0, 0, 0, 4, 4, 4, 0, 4, 4, 0, 4, 0, 4, 0, 0, 0, 4, 4, 0, 4,
       0, 4, 0, 0, 0, 4, 0, 0, 0, 0, 4, 0, 0, 4, 0, 4, 0, 0, 0, 4, 4, 4, 0,
       0, 4, 0, 4, 4, 4, 4, 4, 4, 0, 0, 4, 4, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4,
       0, 4, 4, 0, 0, 4, 0, 4, 0, 0, 0, 0, 0, 4, 4, 4, 4, 0, 4, 4, 0, 4, 0,
       4, 0, 0, 4, 4, 4, 0, 0, 4, 4, 0, 0, 4, 4, 0, 0, 0, 4, 0, 0, 4, 4, 4,
       4, 4, 4, 0, 0, 0, 4, 0, 0, 4, 4, 0, 4, 4, 4, 4, 4, 4, 0, 4, 0, 0, 4,
       4, 4, 4, 4, 0, 0, 0, 4, 4, 4, 0, 0, 4, 0, 0, 4, 4, 0, 0, 4, 4, 0, 0,
       4, 4, 0, 0, 0, 4, 4, 0, 0, 0, 0, 0, 4, 4, 4, 0, 4, 0, 4, 4, 4, 0, 0,
       0, 0, 0, 4, 4, 4, 4, 0, 0, 4, 0, 4, 0, 0, 4, 4, 4, 0, 4, 4, 4, 4, 4,
       0, 0, 4, 0, 4, 4, 4, 4, 4, 0, 0, 0, 4, 0, 0, 4, 4, 4, 0, 4, 0, 0, 4,
       0, 4, 0, 0, 4, 4, 4, 4, 4, 4, 4, 0, 4, 0, 4, 4, 4, 0, 4, 0, 0, 0, 0,
       0, 0, 0, 4, 4, 0, 0, 4, 4, 0, 0, 4, 4, 0, 0, 4, 4, 0, 4, 0, 0, 0, 0,
       0, 0, 4, 0, 0, 4, 4, 0, 4, 4, 4, 4, 4, 0, 4, 4, 0, 0, 4, 4, 0, 4, 0,
       0, 4, 0, 4, 0, 0, 4, 4, 0, 0, 4, 0, 4, 0, 0, 4, 4, 4, 0, 0, 4, 4, 0,
       0, 4, 0, 0, 4, 4, 4, 0, 4, 4, 0, 0, 4, 0, 0, 4, 4, 0, 4, 0, 0, 0, 0,
       0, 4, 4, 0, 0, 0, 0, 4, 0, 4, 0, 4, 0, 0, 0, 0, 0, 0, 4, 0, 4, 0, 4,
       0, 4, 4, 0, 4, 4, 4, 4, 0, 4, 0, 0, 0, 4, 4, 4, 4, 4, 0, 4, 4, 0, 0,
       4, 4, 0, 0, 0, 0, 0, 4, 4, 4, 0, 4, 4, 4, 4, 4, 0, 0, 0, 4, 4, 0, 4,
       4, 0, 0, 4])

In [20]:
# argmax over predict_proba should reproduce predict() exactly
pred_y_2 = logreg_model.classes_[np.argmax(logreg_model.predict_proba(test_corpus.X), axis=1)]

In [22]:
(pred_y == pred_y_2).mean()


Out[22]:
1.0

In [3]:
def get_balanced_labels():
    # downsample so that 'For' and 'Against' are equally represented, then pull out
    # the matching slices of the corpus, e.g.:
    # X, y, documents, tweets, two_classes, classes, labels, feature_names = get_balanced_labels()
    balanced_indices = npx.balance(corpus.y == corpus.labels['For'],
                                   corpus.y == corpus.labels['Against'])
    X = corpus.X[balanced_indices, :]
    y = corpus.y[balanced_indices]
    documents = corpus.documents[balanced_indices]
    tweets = [corpus.tweets[index] for index in balanced_indices]
    two_classes = np.array([corpus.labels['For'], corpus.labels['Against']])
    return X, y, documents, tweets, two_classes, corpus.classes, corpus.labels, corpus.feature_names

In [ ]:
# stdlib logging and scipy.stats are needed by the code below
import logging
import scipy.stats

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

indices = npx.indices(y)
coefs_means = np.mean(bootstrap_coefs, axis=0)
coefs_variances = np.var(bootstrap_coefs, axis=0)

bootstrap_coefs = coefs_means
# bootstrap_coefs = coefs_means / (coefs_variances + 1)
# bootstrap_coefs = coefs_means / (coefs_variances + 1)**2

# bootstrap_transformed is just a dot product with each token's coefficient:
# for each document, the sum of its bag-of-words counts weighted by the coefficients
bootstrap_transformed = X.dot(bootstrap_coefs)
# we also want the variance of the per-token contributions, which may correlate with
# the overall confidence of the model, or better, may explain some of the residual
# that a plain dot product throws away
logger.info('Computing dot-variance')
projections = X.toarray() * bootstrap_coefs
# bootstrap_transformed == projections.sum(axis=1)
projections_variance = projections.var(axis=1)
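# tiny worked example (illustrative numbers only): for a row x = [1, 0, 2] with
# coefs = [0.5, -1.0, 0.25], the elementwise projections are [0.5, 0.0, 0.5];
# their sum (1.0) equals x.dot(coefs), and their spread is what
# projections_variance records for that row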

class_1_probabilities = npx.logistic(bootstrap_transformed)
bootstrap_pred_probabilities = np.column_stack((
    1 - class_1_probabilities,
    class_1_probabilities))
bootstrap_max_column = np.argmax(bootstrap_pred_probabilities, axis=1)

# class_columns = np.repeat(-1, classes.size)
# class_columns[two_classes] = npx.indices(two_classes)
# order = np.argsort(bootstrap_transformed)

# the columns of bootstrap_pred_probabilities line up with two_classes, so map the
# argmax column (0 or 1) back to the original class ids (e.g., 3 or 4)
bootstrap_pred_y = two_classes[bootstrap_max_column]
# as a measure of how extreme the model's prediction is, take one minus twice the
# harmonic mean of the predicted class probabilities:
# npx.hmean([0.1, 0.9]) == .18, npx.hmean([0.5, 0.5]) == .5
bootstrap_pred_confidence = 1 - (npx.hmean(bootstrap_pred_probabilities, axis=1)*2)
# bootstrap_pred_confidence is the adjusted confidence of the chosen class, i.e.,
# its probability rescaled to lie between 0 and 1.
# a more linear alternative (no harmonic mean) rescales the top-class probability x
# by the number of classes:
# 1 class:   min = 1  , nothing to do
# 2 classes: min = .5 , (x - 1/2)*(2/1) == 1 - (1 - x)*2
# 3 classes: min = .33, (x - 1/3)*(3/2)
# 4 classes: min = .25, (x - 1/4)*(4/3)
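# worked check of the harmonic-mean confidence (plain arithmetic, no project code):
#   p = [0.1, 0.9]: hmean = 2*0.1*0.9/(0.1+0.9) = 0.18  ->  confidence = 1 - 2*0.18 = 0.64
#   p = [0.5, 0.5]: hmean = 0.5                         ->  confidence = 1 - 2*0.5  = 0.0
# so a 50/50 prediction scores 0 and the score approaches 1 as the probability
# mass concentrates on a single class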


# plt.plot(a, label='up')
# plt.plot(a, label='down')
# plt.plot(npx.hmean(np.column_stack((a, b, c)), axis=1), label='hmean')

# IPython.embed()

print 'Bootstrap overall accuracy: %.4f' % metrics.accuracy_score(y, bootstrap_pred_y)
# binned_accuracy(bootstrap_transformed, y, bootstrap_pred_y)

errors_mask = bootstrap_pred_y != y
logger.info('The model mis-predicted %d out of a total of %d', errors_mask.sum(), y.size)

bounds = npx.bounds(bootstrap_transformed)
print 'Transforms of mispredictions'
hist(bootstrap_transformed[errors_mask], bounds)
print 'Transforms of correct predictions'
hist(bootstrap_transformed[~errors_mask], bounds)

# hist((np.max(bootstrap_pred_probabilities[errors_mask], axis=1)-0.5)*2.0)
def render_predictions_histograms():
    plt.cla()
    plt.hist((np.max(bootstrap_pred_probabilities[errors_mask], axis=1)-0.5)*2.0, bins=25)
    plt.title('Mispredictions')
    plt.xlabel('Probability of assigned label')
    plt.ylabel('# of tweets')
    plt.gcf().set_size_inches(8, 5)
    plt.savefig(figure_path('hist-proba-incorrect.pdf'))

    # hist((np.max(bootstrap_pred_probabilities[~errors_mask], axis=1)-0.5)*2.0)
    plt.cla()
    plt.hist((np.max(bootstrap_pred_probabilities[~errors_mask], axis=1)-0.5)*2.0, bins=25)
    plt.title('Correct predictions')
    plt.xlabel('Probability of assigned label')
    plt.ylabel('# of tweets')
    plt.gcf().set_size_inches(8, 5)
    plt.savefig(figure_path('hist-proba-correct.pdf'))

# errors_indices = balanced_indices[errors_mask]
# pred_pairs = np.column_stack((bootstrap_pred_y, y))
# Counter(zip(bootstrap_pred_y, y))

# confusion_matrix = metrics.confusion_matrix(y, bootstrap_pred_y, range(len(classes)))
# rownames = ['(Correct) ' + name for name in classes]
# print pd.DataFrame(confusion_matrix, index=rownames, columns=classes)

'''
Negative, in this case, means "For";
e.g., bootstrap_transformed[84] = -16.36, which corresponds to the following SUPER "For"-SB5 tweet:
    RT @GOHPBlog: ICYMI: Say 'YES' to Ohio jobs. Say 'YES' to
    #Issue2 RT @BetterOhio: What Issue 2 means for Ohio Jobs: http://t.co/HJ8sL4l8 - #YesOn2
Positive means "Against"; here's bootstrap_transformed[2905] = 10.62:
    RT @ProgressOhio: Ohio Issue 2: 30 TV Stations Pull Misleading Anti-Union Ad [UPDATE]
    http://t.co/BUxxH3yz #p2 #1U #SB5 #Issue2 #WeAreOhio #StandUpOh #NoOn2
'''


# pred_y is an array of labels, like [3, 4, 4, 3, 4 ... 3, 4, 4, 4, 4]
pred_y = bootstrap_pred_y
pred_confidence = bootstrap_pred_confidence

# def print_predictions(indices):
selected_indices = []
for index in selected_indices:
    print '\nModel predicted %r, annotated as %r' % (classes[pred_y[index]], classes[y[index]])
    print 'Confidence in prediction = %0.5f, variance in = %0.5f' % (
        pred_confidence[index], projections_variance[index])
    print ' ', documents[index]
    print
    print

# correct_indices_all = npx.bool_mask_to_indices(~errors_mask)
# randomly pick up to 50
# correct_indices_50 = np.random.choice(correct_indices_all, size=50, replace=False)
ordering = pred_confidence.argsort()
# ordering is from least confident to most confident
incorrect_indices_ordered_by_confidence = indices[ordering][errors_mask[ordering]]
correct_indices_ordered_by_confidence = indices[ordering][~errors_mask[ordering]]

print '50 most confident correct predictions'
selected_indices = correct_indices_ordered_by_confidence[-50:]

print '50 least confident correct predictions'
selected_indices = correct_indices_ordered_by_confidence[:50]

print '50 most confident incorrect predictions'
selected_indices = incorrect_indices_ordered_by_confidence[-50:]

print '50 least confident incorrect predictions'
selected_indices = incorrect_indices_ordered_by_confidence[:50]

# print 'correct_indices_50.shape:', correct_indices_50.shape, 'y.shape:', y.shape
# print 'bootstrap_pred_y.shape:', bootstrap_pred_y.shape, 'documents.shape', documents.shape
# indices[]
# classes[3]
for_indices = indices[y == labels['For']]
against_indices = indices[y == labels['Against']]
plt.figure(111)
plt.hist(pred_confidence[for_indices], bins=25)
plt.figure(112)
plt.hist(pred_confidence[against_indices], bins=25)


# method-of-moments estimate of Beta distribution parameters
# (translation of R's estBetaParams(mu, var))
def fit_beta(xs):
    # http://en.wikipedia.org/wiki/Beta_distribution
    mean = np.mean(xs)
    variance = np.var(xs)
    # common factor: mean*(1 - mean)/variance - 1; alpha = mean*factor, beta = (1 - mean)*factor
    bias = ((mean * (1 - mean)) / variance) - 1
    alpha = mean * bias
    beta = (1 - mean) * bias
    return alpha, beta
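
# quick sanity check of the method-of-moments fit (illustrative; exact numbers vary
# with the random sample): samples drawn from a known Beta(2, 5) should recover
# parameters near alpha=2, beta=5, e.g.
#   fit_beta(np.random.beta(2, 5, size=10000))  # -> roughly (2.0, 5.0)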



def plot_beta(alpha, beta):
    grid = np.arange(100) / 100.0
    # dist = np.random.beta(alpha, beta)
    plt.plot(grid, scipy.stats.beta.pdf(grid, alpha, beta), color='r')

def data_with_beta(xs):
    alpha, beta = fit_beta(xs)
    plt.hist(xs)
    plot_beta(alpha, beta)


beta_support = np.arange(100) / 100.0



plt.cla()
xs = pred_confidence[against_indices]
plt.hist(xs, normed=True, bins=25, color='r', alpha=0.5)
alpha, beta = fit_beta(xs)
plt.plot(beta_support, scipy.stats.beta.pdf(beta_support, alpha, beta), color='r', label='Against')

xs = pred_confidence[for_indices]
plt.hist(xs, normed=True, bins=25, color='b', alpha=0.5)
alpha, beta = fit_beta(xs)
plt.plot(beta_support, scipy.stats.beta.pdf(beta_support, alpha, beta), color='b', label='For')

plt.title('Confidence by label')
plt.xlabel('Higher = more confident')
plt.ylabel('Density')
plt.gcf().set_size_inches(8, 5)
plt.savefig(figure_path('confidence-by-label.pdf'))



# correct_indices_50
# development breakpoint: drop into an interactive shell, then stop here
IPython.embed(); exit()

# most_extreme_to_least_extreme = np.argsort(-np.abs(bootstrap_transformed))
print_predictions(correct_indices_50, y, bootstrap_pred_y, bootstrap_pred_probabilities, documents)
# bootstrap_pred_probabilities





most_extreme_to_least_extreme = np.argsort(-np.abs(bootstrap_transformed))
print 'Top 25 most extreme correct predictions'
correct_mask = ~errors_mask
# order the correct predictions from most to least extreme transform, then take the top 25
selected_indices = most_extreme_to_least_extreme[correct_mask[most_extreme_to_least_extreme]][:25]
print_predictions(selected_indices, y, bootstrap_pred_y, bootstrap_pred_probabilities, documents)

IPython.embed() or exit()


def old_describe(X, error_index):
    x = X[error_index].toarray().ravel()
    # could also just use the .indices of a CSR matrix
    nonzero = x > 0
    # total = coefs_means.dot(x)  # = sum(values)
    x_names = feature_names[nonzero]
    x_means = x[nonzero] * coefs_means[nonzero]
    x_variances = x[nonzero] * coefs_variances[nonzero]
    reordering = np.argsort(x_means)
    # [('', '='), ('total', ])
    print viz.gloss.gloss(
        [('', 'means  ', 'vars')] + zip(
            x_names[reordering],
            map(flt, x_means[reordering]),
            map(flt, x_variances[reordering]),
        ) +
        [('', '%.2f' % sum(x_means), '%.2f' % sum(x_variances))]
    )

# what differentiates the hard-to-classify tweets near the middle (the ones the model
# gets wrong) from the tweets near the middle that it gets right?

biased_model = linear_model.LogisticRegression(fit_intercept=False)
biased_model.fit(X, y)
# biased_pred_y = biased_model.predict(X)
# print 'Biased overall accuracy: %.4f' % metrics.accuracy_score(y, biased_pred_y)