In [89]:
import IPython
import numpy as np
import pandas as pd
from tsa.science import numpy_ext as npx
import viz
import viz.stats
import viz.format
from sklearn import metrics, cross_validation
from sklearn import linear_model
from sklearn import naive_bayes
from sklearn import svm
from sklearn import feature_selection
from tsa import stdout, stderr
from tsa.lib import tabular, datetime_extra
from tsa.lib.timer import Timer
from tsa.models import Source, Document, create_session
from tsa.science import features, models, timeseries
from tsa.science.corpora import MulticlassCorpus
from tsa.science.plot import plt, figure_path, distinct_styles, ticker
from tsa.science.summarization import metrics_dict, metrics_summary
In [21]:
documents = Source.from_name('sb5b')
full_corpus = MulticlassCorpus(documents)
full_corpus.apply_labelfunc(lambda doc: doc.label)
polar_classes = [full_corpus.class_lookup[label] for label in ['For', 'Against']]
polar_indices = np.in1d(full_corpus.y, polar_classes)
polar_corpus = full_corpus.subset(polar_indices)
polar_corpus.extract_features(lambda doc: 1, features.intercept)
polar_corpus.extract_features(lambda doc: doc.document, features.ngrams,
ngram_max=2, min_df=2, max_df=1.0)
# ngram_max=2, min_df=0.001, max_df=0.95
Out[21]:
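The class filtering above hinges on np.in1d, which builds a boolean mask marking which entries of full_corpus.y fall in the polar classes. A toy illustration of the same idiom (toy labels, not the sb5b data):

import numpy as np

y = np.array([0, 1, 2, 1, 3, 0])  # class indices for six documents
polar_classes = [1, 2]            # the classes to keep
mask = np.in1d(y, polar_classes)  # array([False, True, True, True, False, False])
print y[mask]                     # [1 2 1]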
In [22]:
regularization = 'l2'
polar_model = linear_model.LogisticRegression(fit_intercept=False, penalty=regularization)
polar_model.fit(polar_corpus.X, polar_corpus.y)
# logreg_pred_y = polar_model.predict(polar_corpus.X)
# logreg_pred_proba = polar_model.predict_proba(polar_corpus.X)
Out[22]:
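fit_intercept=False is safe here only because an explicit all-ones intercept column was added during feature extraction, so the intercept is estimated as an ordinary coefficient (and is therefore regularized along with the rest). A toy sketch of that setup:

import numpy as np
from sklearn import linear_model

X = np.column_stack([np.ones(4), [0., 1., 2., 3.]])  # column 0 is the explicit intercept
y = np.array([0, 0, 1, 1])
model = linear_model.LogisticRegression(fit_intercept=False, penalty='l2')
model.fit(X, y)
print model.coef_.ravel()[0]  # plays the role of the (regularized) intercept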
In [72]:
coef = polar_model.coef_.ravel()
coef_abs_ordering = np.argsort(np.abs(coef))
coef[coef_abs_ordering]  # coefficients ordered by |magnitude|, smallest to largest
Out[72]:
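np.argsort over the absolute values orders feature indices by coefficient magnitude regardless of sign, so the strongest n-grams for either class sit at the end of the ordering. For example:

import numpy as np

toy_coef = np.array([0.1, -2.0, 0.5, 1.5])
ordering = np.argsort(np.abs(toy_coef))  # [0 2 3 1]
print toy_coef[ordering]                 # [ 0.1  0.5  1.5 -2. ]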
In [74]:
viz.format.quantiles(coef, qs=[50, 75, 90, 95, 98, 99], width=100)
_ = plt.hist(np.abs(coef), bins=50, range=(0, 2))
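viz.format.quantiles is a project-local helper; roughly the same summary can be had from np.percentile with the same cut points (a sketch, reusing coef from above):

import numpy as np

qs = [50, 75, 90, 95, 98, 99]
for q, value in zip(qs, np.percentile(coef, qs)):
    print '{:d}%: {:+.4f}'.format(q, value)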
In [76]:
def corpus_top_k_features_subset(corpus, model, k):
    '''Subset `corpus` to the k features with the largest |coefficient| in `model`.'''
    coef_abs_ordering = np.argsort(np.abs(model.coef_.ravel()))
    top_k_features = coef_abs_ordering[-k:]
    # top_k_df = pd.DataFrame.from_dict(dict(values=model.coef_.ravel()[top_k_features], names=corpus.feature_names[top_k_features]))
    return corpus.subset(features=top_k_features)
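For example, to keep only the 100 highest-magnitude n-grams from the fitted polar model:

top_100_corpus = corpus_top_k_features_subset(polar_corpus, polar_model, 100)
print top_100_corpus.X.shape
print top_100_corpus.feature_names[:10], '...'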
In [84]:
from tsa.science import summarization
# summarization.average_accuracy()
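average_accuracy lives in the tsa package, so its implementation isn't shown here; judging from how it's called below, it presumably averages held-out accuracy over repeated random train/test splits. A minimal sketch of that idea (a hypothetical stand-in, not the tsa source):

import numpy as np
from sklearn import cross_validation, metrics

def average_accuracy_sketch(corpus, model, n_iter=10, test_size=0.1):
    # mean test-set accuracy over n_iter random train/test splits
    folds = cross_validation.ShuffleSplit(corpus.X.shape[0], n_iter=n_iter, test_size=test_size)
    accuracies = []
    for train_indices, test_indices in folds:
        model.fit(corpus.X[train_indices], corpus.y[train_indices])
        pred_y = model.predict(corpus.X[test_indices])
        accuracies.append(metrics.accuracy_score(corpus.y[test_indices], pred_y))
    return np.mean(accuracies)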
In [88]:
print 'regularization:', regularization
for k in [10, 20, 30, 40, 50, 100, 200, 250, 500, 750, 1000, 5000, 10000]:
    subvocab_corpus = corpus_top_k_features_subset(polar_corpus, polar_model, k)
    accuracy = summarization.average_accuracy(subvocab_corpus,
        linear_model.LogisticRegression(fit_intercept=False, penalty=regularization))
    print 'K = {:d}; accuracy = {:.2%}'.format(k, accuracy)
In [116]:
k_best_model = feature_selection.SelectKBest(score_func=feature_selection.f_classif, k=10)
k_best_model.fit(polar_corpus.X, polar_corpus.y)
Out[116]:
In [117]:
support = k_best_model.get_support(indices=True)  # named `support`, not `features`, to avoid shadowing the tsa.science.features import
print 'support', support, polar_corpus.feature_names[support]
print 'scores_', k_best_model.scores_.shape, k_best_model.scores_
print 'pvalues_', k_best_model.pvalues_.shape, k_best_model.pvalues_
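f_classif runs a one-way ANOVA F-test per feature against the class labels: scores_ holds the F statistics and pvalues_ their p-values, one per column of X. A tiny standalone example:

import numpy as np
from sklearn import feature_selection

X = np.array([[1., 10.], [2., 20.], [1., 30.], [2., 40.]])
y = np.array([0, 0, 1, 1])
F, pvalues = feature_selection.f_classif(X, y)
print F        # high F = feature separates the classes well
print pvalues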
In [118]:
scores = k_best_model.scores_
_ = plt.hist(scores[~np.isnan(scores)], bins=50, log=True)
In [119]:
pvalues = k_best_model.pvalues_
_ = plt.hist(pvalues[~np.isnan(pvalues)], bins=50)
In [120]:
k_best_model.transform(polar_corpus.X)
Out[120]:
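transform column-slices X down to the selected features, so the row count is preserved and the column count drops to k:

X_reduced = k_best_model.transform(polar_corpus.X)
print polar_corpus.X.shape, '->', X_reduced.shape  # (n_documents, |vocab|) -> (n_documents, 10)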
In [ ]:
print 'regularization:', regularization
# score_func = feature_selection.f_classif
# score_func = feature_selection.chi2
score_func = feature_selection.f_regression
# polar_corpus.X = polar_corpus.X.toarray()  # required for f_regression
for k in [10, 20, 30, 40, 50, 100, 200, 250, 500, 750, 1000, 5000, 10000]:
    # maybe SelectFdr to tune Precision/Recall?
    k_best_model = feature_selection.SelectKBest(score_func=score_func, k=k)
    k_best_model.fit(polar_corpus.X, polar_corpus.y)
    support = k_best_model.get_support(indices=True)
    k_best_corpus = polar_corpus.subset(features=support)
    accuracy = summarization.average_accuracy(k_best_corpus,
        linear_model.LogisticRegression(fit_intercept=False, penalty=regularization))
    # print 'X ~ {:s}; features: {:s}'.format(k_best_corpus.X.shape, k_best_corpus.feature_names)
    print 'K = {:d}; accuracy = {:.2%}'.format(k, accuracy)
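The score functions differ in their input requirements: chi2 needs non-negative features (fine for ngram counts) and accepts sparse input, while f_regression here wants a dense array, hence the commented-out toarray() above. Swapping in chi2 would avoid densifying, e.g.:

k_best_model = feature_selection.SelectKBest(score_func=feature_selection.chi2, k=1000)
k_best_model.fit(polar_corpus.X, polar_corpus.y)  # works directly on the sparse counts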
In [134]:
# Recursive feature elimination
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html
print 'Using regularization:', regularization
rfe_estimator = linear_model.LogisticRegression(fit_intercept=False, penalty=regularization)
rfe_model = feature_selection.RFE(rfe_estimator, n_features_to_select=1000, step=0.25)
rfe_model.fit(polar_corpus.X, polar_corpus.y)
Out[134]:
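RFE refits the estimator repeatedly, pruning the lowest-ranked features at each iteration (with step=0.25, 25% of the features per round) until n_features_to_select remain. A small self-contained example of the same call:

import numpy as np
from sklearn import feature_selection, linear_model

X = np.random.RandomState(0).randn(100, 20)
y = (X[:, 0] + X[:, 1] > 0).astype(int)  # only the first two columns carry signal
rfe = feature_selection.RFE(linear_model.LogisticRegression(), n_features_to_select=2, step=0.25)
rfe.fit(X, y)
print rfe.get_support(indices=True)  # should recover columns 0 and 1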
In [142]:
print rfe_model.get_support(indices=True)[:10], '...'
In [138]:
support = rfe_model.get_support(indices=True)
rfe_corpus = polar_corpus.subset(features=support)
print rfe_corpus.feature_names[:20], '...'
accuracy = summarization.average_accuracy(rfe_corpus,
    linear_model.LogisticRegression(fit_intercept=False, penalty=regularization))
print 'K = {:d}; accuracy = {:.2%}'.format(len(support), accuracy)
In [ ]:
# polar_corpus.X = polar_corpus.X.toarray()
for k in [10, 20, 30, 40, 50, 100, 200, 250, 500, 750, 1000, 5000, 10000]:
    rfe_model = feature_selection.RFE(rfe_estimator, n_features_to_select=k)
    rfe_model.fit(polar_corpus.X, polar_corpus.y)  # must refit before reading the support
    support = rfe_model.get_support(indices=True)
    rfe_corpus = polar_corpus.subset(features=support)
    print rfe_corpus.feature_names[:20], '...'
    accuracy = summarization.average_accuracy(rfe_corpus,
        linear_model.LogisticRegression(fit_intercept=False, penalty=regularization))
    # print 'X ~ {:s}; features: {:s}'.format(rfe_corpus.X.shape, rfe_corpus.feature_names)
    print 'K = {:d}; accuracy = {:.2%}'.format(k, accuracy)