In [8]:
import IPython
import numpy as np
import pandas as pd
from tsa.science import numpy_ext as npx
from sklearn import metrics, cross_validation
from sklearn import linear_model
from tsa import stdout, stderr
from tsa.lib import tabular, datetime_extra
from tsa.lib.timer import Timer
from tsa.models import Source, Document, create_session
from tsa.science import features, models, timeseries
from tsa.science.corpora import MulticlassCorpus
from tsa.science.plot import plt, figure_path, distinct_styles, ticker
from tsa.science.summarization import metrics_dict, metrics_summary
from tsa.data.sb5b import notable_events
In [9]:
# Build the full corpus from every document of the 'sb5b' source.
documents = Source.from_name('sb5b')
full_corpus = MulticlassCorpus(documents)
# Documents whose label is falsy (None, '') are bucketed under 'Unlabeled'.
full_corpus.apply_labelfunc(lambda doc: doc.label or 'Unlabeled')
# Constant-valued feature (every document maps to 1) handled by features.intercept.
full_corpus.extract_features(lambda doc: 1, features.intercept)
# N-gram features over the document text.
# NOTE(review): presumably ngram_max=2 means unigrams + bigrams and min_df=2 drops
# terms seen in fewer than two documents -- confirm against features.ngrams.
full_corpus.extract_features(lambda doc: doc.document,
                             features.ngrams, ngram_max=2, min_df=2, max_df=1.0)
# Publication timestamp of each document, at one-second resolution.
full_corpus.datetimes = np.array([doc.published for doc in full_corpus.data]).astype('datetime64[s]')
# Single-argument parenthesised print: identical output on Python 2, valid on Python 3.
print(full_corpus)
In [10]:
# Restrict the corpus to the documents labeled 'For' or 'Against'.
polar_classes = [full_corpus.class_lookup[label] for label in ['For', 'Against']]
# Despite its name this is a boolean row mask: np.in1d returns one bool per element of y.
polar_indices = np.in1d(full_corpus.y, polar_classes)
polar_corpus = full_corpus.subset(rows=polar_indices)
# Single-argument parenthesised print: identical output on Python 2, valid on Python 3.
print(polar_corpus)
# unlabeled_corpus = full_corpus.subset(full_corpus.y == full_corpus.class_lookup['Unlabeled'])
In [11]:
# notable_events_labels, notable_events_dates = zip(*notable_events)
# Timestamps of the rows selected by `polar_indices` (the For/Against documents).
labeled_times = full_corpus.datetimes[polar_indices]
# NOTE(review): npx.bounds presumably returns the (earliest, latest) pair -- confirm.
# The array form allows a later `.astype(float)` when shading the labeled period.
labeled_time_bounds = np.array(npx.bounds(labeled_times))
In [12]:
# Fit a logistic regression on the For/Against documents only, then apply it to
# every document in the full corpus.
penalty = 'l1'
# NOTE(review): fit_intercept=False is presumably because the corpus already carries
# an explicit intercept feature (extracted with features.intercept) -- confirm.
model = linear_model.LogisticRegression(fit_intercept=False, penalty=penalty)
model.fit(polar_corpus.X, polar_corpus.y)
# Hard class predictions and per-class probabilities for the whole corpus.
full_pred_y = model.predict(full_corpus.X)
full_pred_proba = model.predict_proba(full_corpus.X)
# Disabled experiments, kept for reference (not executed):
# full_pred_proba_max = full_pred_proba.max(axis=1)
# full_pred_proba_hmean = npx.hmean(full_pred_proba, axis=1)
# unlabeled_pred_y = logreg_model.predict(unlabeled_corpus.X)
# Last expression: show the predicted class ids as the cell output.
full_pred_y
Out[12]:
In [6]:
def time_hist(label, times, values, time_units_per_bin=2, time_unit='D', statistic='count', **style_args):
    """Bin a time series and draw the binned result as one line via `plt.plot`.

    `times` and `values` are handed unchanged to
    `timeseries.binned_timeseries` together with the binning options; the
    pair it returns is plotted as x and y.

    Args:
        label: legend label given to the plotted line.
        times: first positional argument of the binning helper.
        values: second positional argument of the binning helper.
        time_units_per_bin: forwarded to the binning helper (default 2).
        time_unit: forwarded to the binning helper (default 'D').
        statistic: forwarded to the binning helper (default 'count').
        **style_args: extra keyword arguments forwarded to `plt.plot`.

    Returns:
        None; the function only draws.
    """
    binning_options = dict(
        time_units_per_bin=time_units_per_bin,
        time_unit=time_unit,
        statistic=statistic,
    )
    edges, binned = timeseries.binned_timeseries(times, values, **binning_options)
    plt.plot(edges, binned, label=label, **style_args)
Histogram of For/Against predictions across the entire period
In [7]:
# Line style per predicted class; list order fixes plotting (and legend) order.
label_styles = [
    ('For', {'linewidth': 2, 'linestyle': '-', 'color': 'red'}),
    ('Against', {'linewidth': 2, 'linestyle': '--', 'color': 'blue'}),
]
for label, styles in label_styles:
    # Timestamps of the documents the model assigned to this class.
    datetimes = full_corpus.datetimes[full_pred_y == full_corpus.class_lookup[label]]
    # Count documents per 7-day bin and draw them as one line.
    time_hist(label, datetimes, datetimes.reshape(-1, 1),
              time_units_per_bin=7, time_unit='D', **styles)

plt.title('For / Against labels throughout corpus')
plt.ylabel('Frequency')
plt.xlabel('Date')
# Light green band over the period covered by the labeled documents.
plt.axvspan(*labeled_time_bounds.astype(float), edgecolor='none', facecolor='g', alpha=0.05)
plt.gcf().set_size_inches(8, 5)

axes = plt.gca()
axes.yaxis.set_ticks([])
axes.xaxis.grid(False)  # hide vertical gridlines
axes.xaxis.set_major_formatter(ticker.FuncFormatter(datetime_extra.datetime64_formatter))
plt.legend(loc='best')
plt.margins(0.025, tight=False)

# Read the y-limits only after margins are applied, so the event captions
# below are positioned relative to the final axis range.
auto_ylim = plt.ylim()
for i, (label, date) in enumerate(notable_events):
    # http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.axvline
    # The x axis is numeric, so the event date is converted to float seconds.
    x = date.astype('datetime64[s]').astype(float)
    plt.axvline(x, color='k')
    # Stagger captions: 90% of the upper y-limit, then 10% lower per event.
    plt.text(x, auto_ylim[1] * (0.9 - i * 0.1), '- ' + label)
# Saving is disabled; uncomment to write the figure to disk.
# plt.savefig(figure_path('for-against-extrapolated.pdf'))
In [52]:
# plt.vlines(np.array(notable_events_dates).astype('datetime64[s]').astype(float),
# *auto_ylim, colors='k')
In [ ]:
# Plot the classifier's certainty over time: the per-bin mean of the highest
# class probability assigned to each document.
plt.cla()  # start from cleared axes
styles = distinct_styles()
# Certainty of one prediction = probability of its most likely class.
# Computed here because no executed cell defines this name.
full_pred_proba_max = full_pred_proba.max(axis=1)
# The corpus timestamps are stored as `full_corpus.datetimes`; the built-in
# next() works on Python 2 and 3, unlike the Python 2-only `.next()` method.
time_hist('', full_corpus.datetimes, full_pred_proba_max.reshape(-1, 1),
          statistic='mean', **next(styles))
# plt.legend(loc='best')
plt.title('Average certainty of prediction')
plt.xlabel('Date')
# Light green band over the period covered by the labeled documents.
plt.axvspan(*labeled_time_bounds.astype(float), edgecolor='none', facecolor='g', alpha=0.05)
plt.gcf().set_size_inches(8, 5)
axes = plt.gca()
axes.xaxis.set_major_formatter(ticker.FuncFormatter(datetime_extra.datetime64_formatter))
plt.savefig(figure_path('predict-proba-extrapolated.pdf'))
In [ ]: