In [1]:
import IPython
import numpy as np
import pandas as pd
from tsa.science import numpy_ext as npx
from collections import Counter

import viz

from sklearn import metrics, cross_validation
from sklearn import linear_model

from tsa import stdout, stderr
from tsa.lib import tabular, datetime_extra
from tsa.lib.timer import Timer
from tsa.models import Source, Document, create_session
from tsa.science import features, models, timeseries
from tsa.science.corpora import MulticlassCorpus
from tsa.science.plot import plt, figure_path, distinct_styles, ticker
from tsa.science.summarization import metrics_dict, metrics_summary

In [2]:
# Look up the 'sb5b' source by name and wrap it in a multiclass corpus.
documents = Source.from_name('sb5b')
full_corpus = MulticlassCorpus(documents)
# Each document's own `label` attribute supplies its class label.
full_corpus.apply_labelfunc(lambda document: document.label)
print(full_corpus)


<MulticlassCorpus X = (1, 0), y = (106702,)>

In [3]:
# Restrict the corpus to rows whose class is 'For' or 'Against'.
polar_classes = [
    full_corpus.class_lookup[label]
    for label in ['For', 'Against']
]
# np.in1d gives one boolean per entry of y: True where the class is polar.
polar_indices = np.in1d(full_corpus.y, polar_classes)
polar_corpus = full_corpus.subset(rows=polar_indices)

# First a constant 1 per document via features.intercept, then ngram
# features (ngram_max=2) computed from each document's `document` attribute.
polar_corpus.extract_features(lambda document: 1, features.intercept)
polar_corpus.extract_features(
    lambda document: document.document,
    features.ngrams,
    ngram_max=2,
    min_df=2,
    max_df=1.0
)
print(polar_corpus)


<MulticlassCorpus X = (13627, 43385), y = (13627,)>

In [8]:
def test(corpus, n_iter=20, penalty='l2', test_size=0.1, random_state=None):
    """Yield one held-out accuracy score per stratified shuffle split.

    For every split a fresh LogisticRegression is fit on the training rows
    and scored on the test rows. The model is built with
    fit_intercept=False, so any bias term has to be a column of `corpus.X`
    already.

    This is a generator: no model is trained until the result is iterated.

    Parameters
    ----------
    corpus : object
        Must expose `.X`, `.y` and `.subset(indices)`; `subset` must return
        an object with `.X` and `.y` restricted to those indices.
    n_iter : int
        Number of random train/test splits (and therefore scores yielded).
    penalty : str
        Passed to LogisticRegression as `penalty`.
    test_size : float or int
        Passed to StratifiedShuffleSplit as `test_size`; the default 0.1
        matches the previously hard-coded value.
    random_state : int, RandomState instance or None
        Passed to StratifiedShuffleSplit to seed the shuffling. The default
        None keeps the previous unseeded behaviour; pass an int to make the
        sequence of splits (and so the reported accuracies) reproducible.

    Yields
    ------
    The value of metrics.accuracy_score for each split, in split order.
    """
    folds = cross_validation.StratifiedShuffleSplit(
        corpus.y, test_size=test_size, n_iter=n_iter, random_state=random_state)
    for train_indices, test_indices in folds:
        train_corpus = corpus.subset(train_indices)
        test_corpus = corpus.subset(test_indices)

        # A new model per split so no fitted state carries over between folds.
        model = linear_model.LogisticRegression(fit_intercept=False, penalty=penalty)
        model.fit(train_corpus.X, train_corpus.y)
        pred_y = model.predict(test_corpus.X)
        yield metrics.accuracy_score(test_corpus.y, pred_y)

In [10]:
# Collect the accuracy of every split from test() and report the average.
accuracies = list(test(polar_corpus))
mean_accuracy = np.mean(accuracies)
print('Mean accuracy: {:.1%}'.format(mean_accuracy))


Mean accuracy: 96.2%