In [89]:
import IPython
import numpy as np
import pandas as pd
from tsa.science import numpy_ext as npx

import viz
import viz.stats
import viz.format

from sklearn import metrics, cross_validation
from sklearn import linear_model
from sklearn import naive_bayes
from sklearn import svm
from sklearn import feature_selection

from tsa import stdout, stderr
from tsa.lib import tabular, datetime_extra
from tsa.lib.timer import Timer
from tsa.models import Source, Document, create_session
from tsa.science import features, models, timeseries
from tsa.science.corpora import MulticlassCorpus
from tsa.science.plot import plt, figure_path, distinct_styles, ticker
from tsa.science.summarization import metrics_dict, metrics_summary

In [21]:
# Build the corpus: load the 'sb5b' source, label every document, then keep
# only the two polar classes ('For' / 'Against').
documents = Source.from_name('sb5b')
full_corpus = MulticlassCorpus(documents)
# Each document's label attribute becomes its class.
full_corpus.apply_labelfunc(lambda doc: doc.label)
polar_classes = [full_corpus.class_lookup[label] for label in ['For', 'Against']]
# Boolean mask over full_corpus.y selecting only For/Against documents.
polar_indices = np.in1d(full_corpus.y, polar_classes)
polar_corpus = full_corpus.subset(polar_indices)
# Add a constant intercept column, then unigram+bigram count features
# (ngram_max=2) for terms appearing in at least 2 documents.
polar_corpus.extract_features(lambda doc: 1, features.intercept)
polar_corpus.extract_features(lambda doc: doc.document, features.ngrams,
    ngram_max=2, min_df=2, max_df=1.0)
    # ngram_max=2, min_df=0.001, max_df=0.95


Out[21]:
array([    1,     2,     3, ..., 43382, 43383, 43384])

First: find the top 10 features, limit the vocabulary to them, and re-test accuracy.


In [22]:
# Fit an L2-penalized logistic regression on the full polar corpus.
# fit_intercept=False -- presumably because an explicit intercept column was
# already added as a feature above; confirm before changing.
regularization = 'l2'

polar_model = linear_model.LogisticRegression(fit_intercept=False, penalty=regularization)
polar_model.fit(polar_corpus.X, polar_corpus.y)

# logreg_pred_y = polar_model.predict(polar_corpus.X)
# logreg_pred_proba = polar_model.predict_proba(polar_corpus.X)


Out[22]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [72]:
# Flatten the fitted coefficient matrix and display the coefficients sorted by
# absolute magnitude, smallest |coef| first and largest last.
coef = polar_model.coef_.ravel()
magnitude_order = np.abs(coef).argsort()
coef[magnitude_order]


Out[72]:
array([  1.71409430e-08,   1.71409430e-08,   1.71409430e-08, ...,
         2.39119361e+00,   2.42147155e+00,   4.81855113e+00])

In [74]:
# Summarize the coefficient distribution: printed quantiles plus a histogram
# of absolute values clipped to the (0, 2) range.
viz.format.quantiles(coef, qs=[50, 75, 90, 95, 98, 99], width=100)
_ = plt.hist(np.abs(coef), bins=50, range=(0, 2))


50% < -0.01195   75% < 0.002635   90% < 0.126295   95% < 0.212882   98% < 0.339948   99% < 0.434979

In [76]:
def corpus_top_k_features_subset(corpus, model, k):
    """Return `corpus` restricted to the k features whose fitted coefficients
    in `model` have the largest absolute values.

    corpus: object exposing subset(features=<index array>) (e.g. MulticlassCorpus)
    model: fitted linear model exposing a coef_ array
    k: number of top-|coefficient| features to keep
    """
    coef_abs_ordering = np.argsort(np.abs(model.coef_.ravel()))
    # Renamed from `features`: that name shadowed the tsa.science.features
    # module imported at the top of the notebook.
    top_k_indices = coef_abs_ordering[-k:]
    return corpus.subset(features=top_k_indices)

In [84]:
from tsa.science import summarization
# summarization.average_accuracy()

In [88]:
print 'regularization:', regularization
for k in [10, 20, 30, 40, 50, 100, 200, 250, 500, 750, 1000, 5000, 10000]:
    subvocab_corpus = corpus_top_k_features_subset(polar_corpus, polar_model, k)
    accuracy = summarization.average_accuracy(subvocab_corpus,
        linear_model.LogisticRegression(fit_intercept=False, penalty='l2'))
    print 'K = {:d}; accuracy = {:.2%}'.format(k, accuracy)


regularization: l2
K = 10; accuracy = 90.70%
K = 20; accuracy = 90.67%
K = 30; accuracy = 91.58%
K = 40; accuracy = 91.82%
K = 50; accuracy = 92.00%
K = 100; accuracy = 94.13%
K = 200; accuracy = 94.59%
K = 250; accuracy = 94.67%
K = 500; accuracy = 95.63%
K = 750; accuracy = 96.14%
K = 1000; accuracy = 96.77%
K = 5000; accuracy = 96.79%
K = 10000; accuracy = 96.69%

A more principled approach: use sklearn's feature_selection module for feature selection, rather than ranking features by fitted coefficient magnitude by hand.

These objects take as input a scoring function that returns univariate p-values:

    For regression: f_regression
    For classification: chi2 or f_classif

In [116]:
# Univariate feature selection: score every feature with an ANOVA F-test
# (f_classif) against the class labels and keep the 10 highest-scoring.
k_best_model = feature_selection.SelectKBest(score_func=feature_selection.f_classif, k=10)
k_best_model.fit(polar_corpus.X, polar_corpus.y)


Out[116]:
SelectKBest(k=10, score_func=<function f_classif at 0x10b444d70>)

In [117]:
features = k_best_model.get_support(indices=True)
print 'support', features, polar_corpus.feature_names[features]
print 'scores_', k_best_model.scores_.shape, k_best_model.scores_
print 'pvalues_', k_best_model.pvalues_.shape, k_best_model.pvalues_


support [ 1920 31726 32439 35833 40023 40365 42881 42901 42919 42945] [u'addthis' u'rt gohpblog' u'sb5' u'tcot' u'via addthis' u'vote yes' u'yes'
 u'yes on' u'yeson2' u'yeson2 issue2']
scores_ (43385,) [        nan  2.31342392  0.51376222 ...,  0.51376222  0.77071443
  0.77071443]
pvalues_ (43385,) [        nan  0.12828445  0.47352702 ...,  0.47352702  0.38001161
  0.38001161]

In [118]:
# Histogram of F-scores with log-scaled counts; NaN scores (the first feature,
# presumably the constant intercept column, scores NaN -- see output above)
# are excluded before plotting.
scores = k_best_model.scores_
_ = plt.hist(scores[~np.isnan(scores)], bins=50, log=True)



In [119]:
# Histogram of the univariate p-values, again dropping NaN entries.
pvalues = k_best_model.pvalues_
_ = plt.hist(pvalues[~np.isnan(pvalues)], bins=50)



In [120]:
# Project X down to just the 10 selected feature columns (sparse result).
k_best_model.transform(polar_corpus.X)


Out[120]:
<13627x10 sparse matrix of type '<type 'numpy.float64'>'
	with 13535 stored elements in Compressed Sparse Row format>

In [ ]:
# Repeat the K-sweep, but pick features by univariate score (SelectKBest)
# instead of by model coefficient magnitude.
print 'regularization:', regularization
# score_func = feature_selection.f_classif
# score_func = feature_selection.chi2
score_func = feature_selection.f_regression
# NOTE(review): in this sklearn version f_regression copies X densely
# (X.copy('F') -- see the traceback below), which is extremely slow for a
# large sparse matrix; that is why this cell was interrupted mid-run.
#polar_corpus.X = polar_corpus.X.toarray()  # required for f_regression
for k in [10, 20, 30, 40, 50, 100, 200, 250, 500, 750, 1000, 5000, 10000]:
    # maybe SelectFdr to tune Precision/Recall ?
    k_best_model = feature_selection.SelectKBest(score_func=score_func, k=k)
    k_best_model.fit(polar_corpus.X, polar_corpus.y)
    features = k_best_model.get_support(indices=True)
    # NOTE(review): corpus_features_subset and corpus_mean_accuracy are not
    # defined anywhere in the visible notebook -- hidden kernel state from a
    # deleted cell; this cell will fail under Restart & Run All.
    k_best_corpus = corpus_features_subset(polar_corpus, features)

    accuracy = corpus_mean_accuracy(k_best_corpus, penalty=regularization, n_iter=10)
    # print 'X ~ {:s}; features: {:s}'.format(subvocab_corpus.X.shape, subvocab_corpus.feature_names)
    print 'K = {:d}; accuracy = {:.2%}'.format(k, accuracy)


regularization: l2
K = 10; accuracy = 90.74%
K = 20; accuracy = 91.83%
K = 30; accuracy = 92.23%
K = 40; accuracy = 93.02%
K = 50; accuracy = 93.53%
K = 100; accuracy = 93.24%
K = 200; accuracy = 93.87%
K = 250; accuracy = 93.82%
K = 500; accuracy = 94.64%
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-127-f9bec6ff79db> in <module>()
      6 for k in [10, 20, 30, 40, 50, 100, 200, 250, 500, 750, 1000, 5000, 10000]:
      7     k_best_model = feature_selection.SelectKBest(score_func=score_func, k=k)
----> 8     k_best_model.fit(polar_corpus.X, polar_corpus.y)
      9     features = k_best_model.get_support(indices=True)
     10     k_best_corpus = corpus_features_subset(polar_corpus, features)

/Library/Python/2.7/site-packages/scikit_learn-0.14.1-py2.7-macosx-10.9-intel.egg/sklearn/feature_selection/univariate_selection.pyc in fit(self, X, y)
    313         Records and selects features according to their scores.
    314         """
--> 315         self.scores_, self.pvalues_ = self.score_func(X, y)
    316         self.scores_ = np.asarray(self.scores_)
    317         self.pvalues_ = np.asarray(self.pvalues_)

/Library/Python/2.7/site-packages/scikit_learn-0.14.1-py2.7-macosx-10.9-intel.egg/sklearn/feature_selection/univariate_selection.pyc in f_regression(X, y, center)
    249     if center:
    250         y = y - np.mean(y)
--> 251         X = X.copy('F')  # faster in fortran
    252         X -= X.mean(axis=0)
    253 

KeyboardInterrupt: 


In [134]:
# Recursive feature elimination
#   http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html
# Repeatedly refit the logistic regression, pruning a fraction (step=0.25) of
# the remaining features each round, until 1000 features are left.
print 'Using regularization:', regularization
rfe_estimator = linear_model.LogisticRegression(fit_intercept=False, penalty=regularization)
rfe_model = feature_selection.RFE(rfe_estimator, n_features_to_select=1000, step=0.25)
rfe_model.fit(polar_corpus.X, polar_corpus.y)


Using regularization: l2
Out[134]:
RFE(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001),
  estimator_params={}, n_features_to_select=1000, step=0.25, verbose=0)

In [142]:
# Peek at the first few indices of the RFE-selected features.
print rfe_model.get_support(indices=True)[:10], '...'


[  0   1   6  21  40  41 171 214 233 240] ...

In [138]:
features = rfe_model.get_support()
rfe_corpus = corpus_features_subset(polar_corpus, features)
print rfe_corpus.feature_names[:20], '...'
accuracy = corpus_mean_accuracy(rfe_corpus, penalty=regularization, n_iter=10)
print 'K = {:d}; accuracy = {:.2%}'.format(len(features), accuracy)


[u'#intercept#' u'0' u'1' u'1 http://t.co/gjmslohe' u'1,000,000'
 u'1,000,000 for' u'15' u'1983 law' u'1u' u'1u p2' u'2 does' u'2 early'
 u'2 sb5' u'2 vote' u'25' u'3' u'3 in' u'4' u'5 will' u'54'] ...
K = 43385; accuracy = 96.35%

In [ ]:
#polar_corpus.X = polar_corpus.X.toarray()
for k in [10, 20, 30, 40, 50, 100, 200, 250, 500, 750, 1000, 5000, 10000]:
    rfe_model = feature_selection.RFE(rfe_estimator, n_features_to_select=k)
    features = rfe_model.get_support()
    rfe_corpus = corpus_features_subset(polar_corpus, features)
    print rfe_corpus.feature_names
    accuracy = corpus_mean_accuracy(rfe_corpus, penalty=regularization, n_iter=10)
    # print 'X ~ {:s}; features: {:s}'.format(subvocab_corpus.X.shape, subvocab_corpus.feature_names)
    print 'K = {:d}; accuracy = {:.2%}'.format(k, accuracy)