In [8]:
import IPython
import numpy as np
import pandas as pd
from tsa.science import numpy_ext as npx

from sklearn import metrics, cross_validation
from sklearn import linear_model

from tsa import stdout, stderr
from tsa.lib import tabular, datetime_extra
from tsa.lib.timer import Timer
from tsa.models import Source, Document, create_session
from tsa.science import features, models, timeseries
from tsa.science.corpora import MulticlassCorpus
from tsa.science.plot import plt, figure_path, distinct_styles, ticker
from tsa.science.summarization import metrics_dict, metrics_summary

from tsa.data.sb5b import notable_events

In [9]:
# Build the full corpus from every document of the 'sb5b' source.
documents = Source.from_name('sb5b')
full_corpus = MulticlassCorpus(documents)

# Documents whose label is falsy are grouped under the explicit 'Unlabeled' class.
full_corpus.apply_labelfunc(lambda doc: doc.label or 'Unlabeled')

# Two feature groups: a constant 1 per document (intercept), then n-gram
# features (up to ngram_max=2) extracted from doc.document.
full_corpus.extract_features(lambda doc: 1, features.intercept)
full_corpus.extract_features(
    lambda doc: doc.document,
    features.ngrams,
    ngram_max=2,
    min_df=2,
    max_df=1.0,
)

# Publication time of every document, stored at one-second resolution.
full_corpus.datetimes = np.array(
    [doc.published for doc in full_corpus.data]
).astype('datetime64[s]')

print(full_corpus)


<MulticlassCorpus X = (106702, 202137), y = (106702,)>

In [10]:
# Class ids of the two polar labels, then a boolean mask over the corpus rows
# whose class is one of them.
polar_classes = [
    full_corpus.class_lookup[label]
    for label in ('For', 'Against')
]
polar_indices = np.in1d(full_corpus.y, polar_classes)

# Corpus restricted to the rows labeled 'For' or 'Against'.
polar_corpus = full_corpus.subset(rows=polar_indices)
print(polar_corpus)
# unlabeled_corpus = full_corpus.subset(full_corpus.y == full_corpus.class_lookup['Unlabeled'])


<MulticlassCorpus X = (13627, 202137), y = (13627,)>

In [11]:
# notable_events_labels, notable_events_dates = zip(*notable_events)
# Timestamps of the rows selected by polar_indices, i.e. the documents whose
# class is 'For' or 'Against'.
labeled_times = full_corpus.datetimes[polar_indices]
# Result of npx.bounds over those timestamps, kept as an array; later cells
# unpack it into plt.axvspan to shade the labeled period.
# NOTE(review): presumably (earliest, latest) -- confirm against npx.bounds.
labeled_time_bounds = np.array(npx.bounds(labeled_times))

In [12]:
# Train an L1-penalized logistic regression on the polar ('For'/'Against') rows
# only. fit_intercept=False because a constant intercept feature was already
# extracted into the corpus feature matrix.
penalty = 'l1'
model = linear_model.LogisticRegression(
    penalty=penalty,
    fit_intercept=False,
).fit(polar_corpus.X, polar_corpus.y)

# Apply the fitted model to every document in the corpus, labeled or not:
# per-class probabilities and the predicted class id.
full_pred_proba = model.predict_proba(full_corpus.X)
full_pred_y = model.predict(full_corpus.X)
# full_pred_proba_max = full_pred_proba.max(axis=1)
# full_pred_proba_hmean = npx.hmean(full_pred_proba, axis=1)
# unlabeled_pred_y = logreg_model.predict(unlabeled_corpus.X)
full_pred_y


Out[12]:
array([3, 3, 3, ..., 3, 3, 3])

In [6]:
def time_hist(label, times, values, time_units_per_bin=2, time_unit='D', statistic='count', **style_args):
    """Plot a binned time series of `values` over `times` with pyplot.

    The binning itself is delegated to `timeseries.binned_timeseries`; the
    resulting bin edges and bin values are drawn with a single `plt.plot` call.

    Parameters:
        label: legend label given to the plotted line.
        times: timestamps forwarded to `binned_timeseries`.
        values: values forwarded to `binned_timeseries`.
        time_units_per_bin: forwarded as-is (default 2).
        time_unit: forwarded as-is (default 'D').
        statistic: forwarded as-is (default 'count').
        **style_args: extra keyword arguments passed through to `plt.plot`.

    Returns:
        None; the function only draws.
    """
    binning_options = dict(
        time_units_per_bin=time_units_per_bin,
        time_unit=time_unit,
        statistic=statistic,
    )
    edges, binned = timeseries.binned_timeseries(times, values, **binning_options)
    plt.plot(edges, binned, label=label, **style_args)

Histogram of predicted For/Against labels across the entire corpus period.


In [7]:
# One (label, line style) pair per polar class, in plotting order.
label_styles = [
    ('For', {'linewidth': 2, 'linestyle': '-', 'color': 'red'}),
    ('Against', {'linewidth': 2, 'linestyle': '--', 'color': 'blue'}),
]

# Per class: counts of documents *predicted* as that class, in 7-day bins.
for label, styles in label_styles:
    predicted_as_label = full_pred_y == full_corpus.class_lookup[label]
    datetimes = full_corpus.datetimes[predicted_as_label]
    time_hist(label, datetimes, datetimes.reshape(-1, 1),
        time_units_per_bin=7, time_unit='D', **styles)

plt.title('For / Against labels throughout corpus')
plt.xlabel('Date')
plt.ylabel('Frequency')
# plt.vlines(notable_dates.astype(float), *auto_ylim)
# Lightly shade the time span covered by the labeled documents.
plt.axvspan(*labeled_time_bounds.astype(float), edgecolor='none', facecolor='g', alpha=0.05)
plt.gcf().set_size_inches(8, 5)

axes = plt.gca()
axes.yaxis.set_ticks([])
axes.xaxis.grid(False)  # hide vertical gridlines
axes.xaxis.set_major_formatter(ticker.FuncFormatter(datetime_extra.datetime64_formatter))

plt.legend(loc='best')
# plt.axis('tight')
plt.margins(0.025, tight=False)

# Read the y-limits only after margins are applied; the event captions below
# are positioned relative to the top of the axis.
auto_ylim = plt.ylim()

# Mark each notable event with a vertical line and a caption; each caption
# sits 10% of the axis height lower than the previous one.
for i, (label, date) in enumerate(notable_events):
    # http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.axvline
    x = date.astype('datetime64[s]').astype(float)
    caption_y = auto_ylim[1]*(0.9 - i * 0.1)
    plt.axvline(x, color='k')
    plt.text(x, caption_y, '- ' + label)

# plt.savefig(figure_path('for-against-extrapolated.pdf'))



In [52]:
# plt.vlines(np.array(notable_events_dates).astype('datetime64[s]').astype(float),
#     *auto_ylim, colors='k')

In [ ]:
# Plot the model's average prediction certainty over time for the whole corpus
# and save the figure.
plt.cla()
styles = distinct_styles()

# Certainty of a prediction = probability of its most likely class. It is
# computed in this cell because no executed cell defines it, so the plot
# would otherwise fail with a NameError on a fresh kernel.
full_pred_proba_max = full_pred_proba.max(axis=1)

# Document timestamps are stored on the corpus as `full_corpus.datetimes`;
# they are row-aligned with the predictions made from `full_corpus.X`.
# next(styles) takes the first style from the iterator (works on Python 2.6+ and 3).
time_hist('', full_corpus.datetimes, full_pred_proba_max.reshape(-1, 1),
    statistic='mean', **next(styles))
# plt.legend(loc='best')
plt.title('Average certainty of prediction')
plt.xlabel('Date')
# Lightly shade the time span covered by the labeled documents.
plt.axvspan(*labeled_time_bounds.astype(float), edgecolor='none', facecolor='g', alpha=0.05)
plt.gcf().set_size_inches(8, 5)
axes = plt.gca()
axes.xaxis.set_major_formatter(ticker.FuncFormatter(datetime_extra.datetime64_formatter))
plt.savefig(figure_path('predict-proba-extrapolated.pdf'))

In [ ]: