In [10]:
import IPython
from IPython.display import HTML

import numpy as np
import pandas as pd
from scipy import sparse
from tsa.science import numpy_ext as npx
from collections import Counter

import viz

from sklearn import metrics, cross_validation
from sklearn import linear_model, svm, naive_bayes
from sklearn import feature_selection

from tsa import stdout, stderr
from tsa.lib import tabular, datetime_extra, cache
from tsa.lib.timer import Timer
from tsa.models import Source, Document, create_session
from tsa.science import features, timeseries
from tsa.science.corpora import MulticlassCorpus
from tsa.science.plot import plt, figure_path, distinct_styles, ticker
from tsa.science.summarization import metrics_dict, metrics_summary

In [33]:
import tsa.science.models
reload(tsa.science.models)
from tsa.science.models import Bootstrap, SelectKBest

In [4]:
full_corpus = MulticlassCorpus(Source.from_name('sb5b', labeled_only=True))
full_corpus.apply_labelfunc(lambda doc: doc.label)
full_corpus.extract_features(lambda doc: 1, features.intercept)
full_corpus.extract_features(lambda doc: doc.document, features.ngrams,
    ngram_max=2, min_df=2, max_df=1.0)

polar_classes = [full_corpus.class_lookup[label] for label in ['For', 'Against']]
polar_indices = np.in1d(full_corpus.y, polar_classes)
polar_corpus = full_corpus.subset(rows=polar_indices)
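
For reference, the bigram featurization above is roughly what sklearn's CountVectorizer would
produce with the same thresholds (a sketch under that assumption, not the tsa.science.features
implementation):

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=2, max_df=1.0)
# X_ngrams = vectorizer.fit_transform(texts)  # texts: the raw document strings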

In [5]:
def extreme_features(feature_names, feature_values, margin=10):
    # Yield (name, values, names) for the largest, middle, and smallest entries of
    # feature_values: `margin` items from each end, margin // 2 from the middle.
    # Sort descending by feature_values.
    ordering = np.argsort(-feature_values)
    ordered_feature_names = feature_names[ordering]
    ordered_feature_values = feature_values[ordering]
    
    n_features = feature_names.size
    slices = [
              ('Largest', 0, margin),
              ('Middle', (n_features // 2) - (margin // 4), margin // 2),
              ('Smallest', n_features - margin, margin),
             ]

    for name, start, length in slices:
        yield name, ordered_feature_values[start:start + length], ordered_feature_names[start:start + length]

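As a quick sanity check, here is the generator on a toy score vector (names and values made up
for illustration; margin=2 takes two items from each end and one from the middle):

toy_names = np.array(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
toy_values = np.array([8., 1., 5., 3., 7., 2., 6., 4.])
for name, values, names in extreme_features(toy_names, toy_values, margin=2):
    print name, values, names
# Largest [ 8.  7.] ['a' 'e']
# Middle [ 4.] ['h']
# Smallest [ 2.  1.] ['f' 'b']
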
In [39]:
models = [
    ('Anova', SelectKBest(score_func=feature_selection.f_classif, k='all')),      
    ('Bootstrap', Bootstrap(linear_model.LogisticRegression, n_iter=1000, proportion=1.0,
                            fit_intercept=False, penalty='l2', C=1.0)),
    ('Logistic Regression (L2)', linear_model.LogisticRegression(penalty='l2',
                                                                 fit_intercept=False)),
    ('Logistic Regression (L2) (C=100)', linear_model.LogisticRegression(penalty='l2', C=100.0, 
                                                                         fit_intercept=False)),
    ('Logistic Regression (L1)', linear_model.LogisticRegression(penalty='l1', 
                                                                 fit_intercept=False)),
    # ('randomized_logistic_regression', linear_model.RandomizedLogisticRegression()),
    ('Perceptron (L2)', linear_model.Perceptron(penalty='l2', fit_intercept=False)),
    ('Perceptron (L1)', linear_model.Perceptron(penalty='l1', fit_intercept=False)),
    ('Linear SVC (L2)', svm.LinearSVC(penalty='l2', fit_intercept=False)),
    ('Linear SVC (L1)', svm.LinearSVC(penalty='l1', dual=False, fit_intercept=False)),
    ('Naive Bayes', naive_bayes.MultinomialNB()),
]
models = models[:1]  # restrict to just the first (Anova) scorer for this run

In [40]:
# pred_y = model.predict(test_corpus.X)
# ordering = np.argsort(-np.abs(coefs))
extremes = dict(Largest=pd.DataFrame(), Middle=pd.DataFrame(), Smallest=pd.DataFrame())
for model_name, model in models:
    model.fit(polar_corpus.X, polar_corpus.y)
    print model_name, 'coef_.shape:', model.coef_.shape
    for key, values, names in extreme_features(polar_corpus.feature_names, model.coef_.ravel(), margin=10):
        extremes[key][model_name] = values
        extremes[key][model_name + '-names'] = names

    # printer = tabular.Printer(FS=' & ', RS='\\\\\n')
    # printer.write(row_dict)


Anova coef_.shape: (44457,)

In [41]:
extremes_df = pd.concat(extremes, axis=0)
print extremes_df.to_latex(float_format=lambda x: '%5.2f' % x)


\begin{tabular}{lrl}
\toprule
{} &   Anova &     Anova-names \\
\midrule
Largest  0 & 5404.26 &          yeson2 \\
         1 & 1650.25 &             yes \\
         2 & 1193.46 &             sb5 \\
         3 & 1151.19 &            tcot \\
         4 & 1105.84 &          yes on \\
         5 &  836.10 &        vote yes \\
         6 &  825.18 &   yeson2 issue2 \\
         7 &  759.76 &     rt gohpblog \\
         8 &  752.93 &         addthis \\
         9 &  752.93 &     via addthis \\
Middle   0 &    0.77 &      seems i've \\
         1 &    0.77 &    seems doomed \\
         2 &    0.77 &  seemingly just \\
         3 &    0.77 &     tonight sb5 \\
         4 &    0.77 &       seemingly \\
Smallest 0 &     nan &         in 2007 \\
         1 &     nan &      a computer \\
         2 &     nan &            gitu \\
         3 &     nan &       work into \\
         4 &     nan &    blog posting \\
         5 &     nan &    a bellwether \\
         6 &     nan &          blzzrd \\
         7 &     nan &           ii of \\
         8 &     nan &         outlook \\
         9 &     nan &     \#intercept\# \\
\bottomrule
\end{tabular}
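
The nan rows in the Smallest block are features whose F statistic is undefined, e.g. the constant
#intercept# column, for which f_classif's between-group and within-group variances are both zero;
a quick count (a sketch; f_all and p_all are names introduced here):

f_all, p_all = feature_selection.f_classif(polar_corpus.X, polar_corpus.y)
print np.isnan(f_all).sum(), 'of', f_all.size, 'features have an undefined F score'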


In [46]:
# `model` must be a fitted Bootstrap here; uncomment the next two lines to refit it:
# model = Bootstrap(linear_model.LogisticRegression, n_iter=1000, proportion=1.0, fit_intercept=False, penalty='l2', C=1.0)
# model.fit(polar_corpus.X, polar_corpus.y)
model_name = 'Bootstrap-Variance'
extremes = dict(Largest=pd.DataFrame(), Middle=pd.DataFrame(), Smallest=pd.DataFrame())
# per-feature variance of the coefficients across bootstrap iterations
feature_values = np.var(model.coefs_, axis=0)
for key, values, names in extreme_features(polar_corpus.feature_names, feature_values, margin=10):
    extremes[key][model_name] = values
    extremes[key][model_name + '-names'] = names
extremes_df = pd.concat(extremes, axis=0)
print extremes_df.to_latex(float_format=lambda x: '%5.2f' % x)


\begin{tabular}{lrl}
\toprule
{} &  Bootstrap-Variance &     Bootstrap-Variance-names \\
\midrule
Largest  0 &                0.33 &                rbschueler rt \\
         1 &                0.27 &                        2 sb5 \\
         2 &                0.27 &                   supports a \\
         3 &                0.26 &                   sb5 yeson2 \\
         4 &                0.24 &                 gohpblog via \\
         5 &                0.22 &                         tour \\
         6 &                0.21 &                           st \\
         7 &                0.21 &     http://t.co/zxuhbfff sb5 \\
         8 &                0.21 &         http://t.co/zxuhbfff \\
         9 &                0.21 &  issue2 http://t.co/zxuhbfff \\
Middle   0 &                0.00 &                          21k \\
         1 &                0.00 &          http://t.co/gv24doa \\
         2 &                0.00 &      http://t.co/gv24doa sb5 \\
         3 &                0.00 &                life services \\
         4 &                0.00 &                     facts of \\
Smallest 0 &                0.00 &                       kburdz \\
         1 &                0.00 &                kburdz hahhaa \\
         2 &                0.00 &               interview with \\
         3 &                0.00 &                      slippin \\
         4 &                0.00 &                           ke \\
         5 &                0.00 &                   slippin if \\
         6 &                0.00 &                        belah \\
         7 &                0.00 &                         dooh \\
         8 &                0.00 &                   simply the \\
         9 &                0.00 &                  posting x30 \\
\bottomrule
\end{tabular}
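
The tsa.science.models.Bootstrap wrapper isn't shown in this notebook; as an assumption about
what its coefs_ attribute holds, here is a minimal resampling sketch that would yield a
comparable (n_iter, n_features) matrix of coefficients:

n_rows, n_features = polar_corpus.X.shape
coefs_ = np.empty((100, n_features))
for i in range(100):
    sample = np.random.randint(n_rows, size=n_rows)  # resample rows with replacement
    resampled_model = linear_model.LogisticRegression(penalty='l2', C=1.0, fit_intercept=False)
    resampled_model.fit(polar_corpus.X[sample], polar_corpus.y[sample])
    coefs_[i] = resampled_model.coef_.ravel()
# np.var(coefs_, axis=0) then measures how stable each feature's weight is across resamples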


In [19]:
print extremes_df.to_latex(float_format=lambda x: '%5.2f' % x)


Out[19]:
                Bootstrap                Bootstrap-names
Largest  0       4.846574                         yeson2
         1       2.343425                           tcot
         2       2.168048                      mt notgvn
         3       2.118053                       new post
         4       1.767368                            yes
         5       1.725772                  rt rbschueler
         6       1.577330                     rbschueler
         7       1.478200                        addthis
         8       1.478200                    via addthis
         9       1.408466                           post
Middle   0      -0.009972                      short but
         1      -0.009972   friends http://t.co/bkj7cnzd
         2      -0.009974                 we_are_ohio 30
         3      -0.009984                        to hell
         4      -0.009986             bargaining remains
Smallest 0      -0.987966                     sb5 issue2
         1      -1.093064                      weareohio
         2      -1.108477                         repeal
         3      -1.126398                      standupoh
         4      -1.159269                           fans
         5      -1.313132                    #intercept#
         6      -1.345831                      rt notgvn
         7      -1.477294                             1u
         8      -1.558545                          no on
         9      -2.074094                          noon2

25 rows × 2 columns


In [ ]:
for k in [10, 20, 30, 40, 50, 100, 200, 250, 500, 750, 1000, 5000, 10000]:
    subvocab_corpus = corpus_top_k_features_subset(polar_corpus, polar_model, k)
    accuracy = corpus_mean_accuracy(subvocab_corpus, penalty=regularization, n_iter=10)
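
The helpers above (corpus_top_k_features_subset, corpus_mean_accuracy, polar_model,
regularization) aren't defined in this notebook; a concrete version of the same sweep can be
sketched with stock sklearn pieces (the pipeline and cross-validated accuracy below are my
assumptions, not the original helpers):

from sklearn.pipeline import Pipeline
for k in [10, 20, 30, 40, 50, 100, 200, 250, 500, 750, 1000, 5000, 10000]:
    pipeline = Pipeline([
        ('kbest', feature_selection.SelectKBest(feature_selection.f_classif, k=k)),
        ('logreg', linear_model.LogisticRegression(penalty='l2', C=1.0)),
    ])
    scores = cross_validation.cross_val_score(pipeline, polar_corpus.X, polar_corpus.y,
                                              cv=10, scoring='accuracy')
    print k, scores.mean()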

In [37]:
k_best_model = SelectKBest(score_func=feature_selection.f_classif, k=10)
k_best_model.fit(polar_corpus.X, polar_corpus.y)
plt.hist(k_best_model.coef_)


Out[37]:
array([37759,  7245,  9976, ...,  5406, 19131,     0])

In [26]:
f_values, p_values = feature_selection.f_classif(polar_corpus.X, polar_corpus.y)
# drop features whose F statistic is undefined (NaN), using one mask so f and p stay aligned
defined = ~np.isnan(f_values)
f_values = f_values[defined]
p_values = p_values[defined]

In [42]:
_ = plt.hist(f_values, log=True, bins=50)
f_values.max()


Out[42]:
5404.2595524861472

In [47]:
plt.hist(anova_f_value)


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-47-e45744d43c81> in <module>()
----> 1 plt.hist(anova_f_value)

/Library/Python/2.7/site-packages/matplotlib-override/matplotlib/pyplot.pyc in hist(x, bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, hold, **kwargs)
   2825                       histtype=histtype, align=align, orientation=orientation,
   2826                       rwidth=rwidth, log=log, color=color, label=label,
-> 2827                       stacked=stacked, **kwargs)
   2828         draw_if_interactive()
   2829     finally:

/Library/Python/2.7/site-packages/matplotlib-override/matplotlib/axes.pyc in hist(self, x, bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, **kwargs)
   8324             # this will automatically overwrite bins,
   8325             # so that each histogram uses the same bins
-> 8326             m, bins = np.histogram(x[i], bins, weights=w[i], **hist_kwargs)
   8327             m = m.astype(float) # causes problems later if it's an int
   8328             if mlast is None:

/Library/Python/2.7/site-packages/numpy-1.8.0-py2.7-macosx-10.9-intel.egg/numpy/lib/function_base.pyc in histogram(a, bins, range, normed, weights, density)
    163         if (mn > mx):
    164             raise AttributeError(
--> 165                 'max must be larger than min in range parameter.')
    166 
    167     if not iterable(bins):

AttributeError: max must be larger than min in range parameter.
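
The AttributeError above comes from NaNs in anova_f_value: matplotlib's automatic range
computation is poisoned by NaN, so np.histogram is handed a degenerate (min, max) range and
raises. Masking the NaNs out first, as was done for f_values above, avoids it (a sketch,
assuming anova_f_value is the raw f_classif output):

plt.hist(anova_f_value[~np.isnan(anova_f_value)], log=True, bins=50)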

Looking more closely at coefficients...


In [49]:
model = linear_model.LogisticRegression(penalty='l2', fit_intercept=False)
model.fit(polar_corpus.X, polar_corpus.y)


Out[49]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [115]:
coefs = np.array(model.coef_).ravel()
print coefs.shape, polar_corpus.X.shape
pred_proba = model.predict_proba(polar_corpus.X)


(44457,) (13627, 44457)

In [119]:
polar_corpus.X.dot(coefs)

Out[119]:
array([-2.94242193, -5.00651379, -2.04951597, ...,  3.30586113,
       -3.46296837, -2.72529591])

In [125]:
import viz
def gloss_datum(corpus, index, coefs):
    # Break a single document down into per-feature contributions to its decision score.
    document = corpus.data[index]
    x = corpus.X[index].toarray().ravel()
    # prob = pred_proba[index]  # (used by the commented-out gloss display in the next cell)
    nonzero_features = x > 0
    nonzero_feature_names = corpus.feature_names[nonzero_features]
    # each feature's contribution: count times coefficient
    projection = x * coefs
    nonzero_values = projection[nonzero_features]
    # (feature, contribution) pairs, formatted for the viz.gloss display in the next cell
    pairs = zip(nonzero_feature_names, ['%.2f' % value for value in nonzero_values])
    fulltext = document.document.replace('\n', ' ')
    return document.label, fulltext, pd.DataFrame.from_dict(
        dict(tokens=np.concatenate((nonzero_feature_names, ['SUM'])),
             values=np.concatenate((nonzero_values, [projection.sum()])))).T

In [ ]:
# print
# print '--- %s ---' % test_corpus.labels[test_corpus.y[index]]
# print '%s (%s)' % (document.label)
# print dict(zip(corpus.labels[model.classes_], prob))
# print viz.gloss.gloss([('', 'means')] + pairs + [('SUM', )])

In [126]:
pred_proba, npx.hmean(np.array([[ 0.9999 ,  0.0001 ]]), axis=1)


Out[126]:
(array([[ 0.9499041 ,  0.0500959 ],
       [ 0.99335031,  0.00664969],
       [ 0.8858987 ,  0.1141013 ],
       ..., 
       [ 0.03537066,  0.96462934],
       [ 0.96961554,  0.03038446],
       [ 0.9385029 ,  0.0614971 ]]),
 array([ 0.00019998]))
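
The harmonic mean of the two class probabilities acts as an ambiguity score: it is 0.5 for a
perfectly balanced prediction and collapses toward 0 when one class dominates, e.g.
hmean(0.9999, 0.0001) = 2 / (1/0.9999 + 1/0.0001) ≈ 0.0002, matching the second array above.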

In [144]:
# values = npx.hmean(pred_proba, axis=1)  # alternative ranking by prediction ambiguity (superseded below)
values = polar_corpus.X.dot(coefs)  # rank documents by their decision score
ordering = np.argsort(values)
selected_indices = ordering[-20:]

for selected_index in selected_indices:
#     print selected_index, values[selected_index]

    label, text, table = gloss_datum(polar_corpus, selected_index, coefs)
    print selected_index, label, text, values[selected_index]


10720 For YES on #SB5: What government unions don't want you to know - YouTube http://t.co/IUXxKIsi via @addthis #Issue2 #YesOn2 #Ohio #tcot 13.4984163064
10227 For YES on #SB5: What government unions don't want you to know - YouTube http://t.co/IUXxKIsi via @addthis #Issue2 #YesOn2 #Ohio #tcot 13.4984163064
9656 For RT @rbschueler: RT @laborunionrpt The Cost of Voting No on #Ohio #Issue2 http://t.co/sNDJ2WEL via @jasonahart #tcot #LUR #teaparty #SB5 #YesOn2 13.5451784441
7711 For Unions Hire Researcher With A Taste For Being Paid Off | GOHP Blog http://t.co/h3ujxKRm via @addthis / #YesOn2 #Issue2 #Ohio #sb5 #p2 #tcot 13.5453253553
11746 For Bold Colors: Why I Will Vote Yes on #Issue2 / SB 5 (Part 3) http://t.co/NIgVVUy8 via @addthis #YesOn2 #sb5 #ohtcot #tcot #bcot #sgp #ohio 13.5584117509
12863 For Union Thugs Want Your Children | Godfather Politics http://t.co/WJ9MmeRn via @addthis #education #tcot #YesOn2 #Issue2 #schoolchoice 13.65662581
7797 For PolitiFact | We Are Ohio's file http://t.co/QQUbZMf9 via @addthis / Get the #FACTS on #Issue2 #YesOn2 #Ohio #ohtcot #tcot #p2 #tlot #bcot 13.7652619833
9393 For Cincinnati Enquirer Endorses a YES on #Issue2 | Building a Better Ohio http://t.co/CVxfeWdx via @addthis #YesOn2 #Ohio #tcot #bcot #sgp 13.802435217
11422 For Hamilton Co. Chief Deputy Sheriff Supports #Issue2 - YouTube http://t.co/ltyweTiF via @addthis Vote #YESon2 #Ohio #tcot #sgp #bcot #tpp 13.8229150405
12070 For RT @cutiefool: #OurTP Yes on ISSUES 2 & 3 #YesOn2 #YesOn3 #OH #Issue2 #Issue3 http://t.co/1vB0tVoI @OurTPorg  #TCOT 13.8985366912
10850 For RT @FairTaxNancy: Support Ohio's farmers; vote yes on #Issue2 http://t.co/HCqe3ZWH via @addthis #sb5 #YesOn2 #ohio #tcot 14.0216947218
10851 For RT @FairTaxNancy: Support Ohio's farmers; vote yes on #Issue2 http://t.co/HCqe3ZWH via @addthis #sb5 #YesOn2 #ohio #tcot 14.0216947218
8544 For RT @rbschueler: RT @GOHPBlog Bottom Line: A YES vote on #Issue2 will save tax dollars, save jobs and hold government accountable. #YesOn2 #FF 14.1315284239
8605 For RT @rbschueler: RT @laborunionrpt Unions To Ohio Taxpayers: We Will Bury You #tcot #LUR #unions #Ohio #Yeson2 #SB5 #Issue2 #ohtcot http://t.co/AzuVMnEA 14.2888087841
8537 For RT @rbschueler: RT @FreedomWorks Ohio Union Lies, Half-truths, and RACISM! http://t.co/P29bYKiO via @JasonAHart Vote YES on #Issue2 ! #SB5 #YesOn2 14.3024516465
8868 For RT @SpielzOnWheels: If you support #schoolchoice and real education reform urge OH to vote yes on #issue2.  Vote Yes on 2. #yeson2 14.5528087418
10229 For RT @FairTaxNancy: YES on #SB5: What government unions don't want you to know - YouTube http://t.co/IUXxKIsi via @addthis #Issue2 #YesOn2 #Ohio #tcot 15.2966330775
9398 For RT @FairTaxNancy: Cincinnati Enquirer Endorses a YES on #Issue2 | Building a Better Ohio http://t.co/CVxfeWdx via @addthis #YesOn2 #Ohio #tcot #bcot #sgp 15.5958035235
9396 For RT @FairTaxNancy: Cincinnati Enquirer Endorses a YES on #Issue2 | Building a Better Ohio http://t.co/CVxfeWdx via @addthis #YesOn2 #Ohio #tcot #bcot #sgp 15.5958035235
11423 For RT @FairTaxNancy: Hamilton Co. Chief Deputy Sheriff Supports #Issue2 - YouTube http://t.co/ltyweTiF via @addthis Vote #YESon2 #Ohio #tcot #sgp #bcot #tpp 15.6162762168
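
As a consistency check (a sketch, not from the original notebook): for a binary logistic
regression fit without an intercept, the second column of predict_proba should be the logistic
function of the decision scores used for the ranking above.

scores = polar_corpus.X.dot(coefs)
probs = 1.0 / (1.0 + np.exp(-scores))
print np.allclose(probs, pred_proba[:, 1])  # column 1 corresponds to model.classes_[1]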

In [145]:
selected_index = 9656
print selected_index
label, text, table = gloss_datum(polar_corpus, selected_index, coefs)
print label, text
HTML(table.to_html(float_format=lambda x: '%.2f' % x))


9656
For RT @rbschueler: RT @laborunionrpt The Cost of Voting No on #Ohio #Issue2 http://t.co/sNDJ2WEL via @jasonahart #tcot #LUR #teaparty #SB5 #YesOn2
Out[145]:
(wide HTML table: one `tokens` row and one `values` row across 34 columns, listing each feature
in the tweet with its contribution and a final SUM of 13.55; the same data, transposed, appears
in the LaTeX table below)

In [148]:
print table.T.to_latex(float_format=lambda x: '%.2f' % x)


\begin{tabular}{lll}
\toprule
{} &            tokens & values \\
\midrule
0  &       \#intercept\# &  -1.30 \\
1  &              cost &   0.59 \\
2  &           cost of &   0.71 \\
3  &            issue2 &  -0.11 \\
4  &        jasonahart &   0.76 \\
5  &     laborunionrpt &   0.47 \\
6  &               lur &   0.88 \\
7  &                no &  -0.83 \\
8  &             no on &  -1.60 \\
9  &                of &  -0.16 \\
10 &         of voting &   0.66 \\
11 &              ohio &   0.05 \\
12 &       ohio issue2 &  -0.19 \\
13 &                on &  -0.03 \\
14 &           on ohio &   0.76 \\
15 &        rbschueler &   1.63 \\
16 &     rbschueler rt &  -0.65 \\
17 &                rt &   0.71 \\
18 &  rt laborunionrpt &   0.39 \\
19 &     rt rbschueler &   1.86 \\
20 &               sb5 &  -0.88 \\
21 &        sb5 yeson2 &  -0.36 \\
22 &              tcot &   2.39 \\
23 &          tcot lur &   0.00 \\
24 &          teaparty &   0.74 \\
25 &      teaparty sb5 &   0.01 \\
26 &               the &   0.00 \\
27 &          the cost &   0.67 \\
28 &               via &   0.18 \\
29 &    via jasonahart &   0.59 \\
30 &            voting &   0.34 \\
31 &         voting no &   0.42 \\
32 &            yeson2 &   4.82 \\
33 &               SUM &  13.55 \\
\bottomrule
\end{tabular}


In [96]:
nonzero_feature_names = polar_corpus.feature_names[1:9]
np.concatenate((nonzero_feature_names, ['hi']))

In [97]:
nonzero_feature_names


Out[97]:
array([u'0', u'0 1', u'0 issue2', u'00.00', u'00.00 the', u'03', u'04',
       u'08'], 
      dtype='<U61')

In [101]:
np.concatenate((nonzero_feature_names, ['hi']))

Out[101]:
array([u'0', u'0 1', u'0 issue2', u'00.00', u'00.00 the', u'03', u'04',
       u'08', u'hi'], 
      dtype='<U61')
