In [7]:
import IPython
import re
import numpy as np
import pandas as pd
from tsa.science import numpy_ext as npx
from collections import Counter

import viz

from sklearn import metrics, cross_validation
from sklearn import linear_model

from tsa import stdout, stderr
from tsa.lib import tabular, datetime_extra
from tsa.lib.timer import Timer
from tsa.models import Source, Document, create_session
from tsa.science import features, models, timeseries
from tsa.science.corpora import MulticlassCorpus
from tsa.science.plot import plt, figure_path, distinct_styles, ticker
from tsa.science.summarization import metrics_dict, metrics_summary

In [2]:
documents = Source.from_name('sb5b')
full_corpus = MulticlassCorpus(documents)
full_corpus.apply_labelfunc(lambda doc: doc.label)
# empty X will still have a shape of (1, 0)
intercept_features = full_corpus.extract_features(lambda doc: 1, features.intercept)
print full_corpus


<MulticlassCorpus X = (106702, 1), y = (106702,)>
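
As a sanity check, we can inspect the per-label document counts before subsetting. A minimal sketch, assuming class_lookup maps label names to the integer codes used in y (as it does elsewhere in this notebook):

In [ ]:
# Sketch: per-label document counts in the full corpus.
label_for_code = dict((code, label) for label, code in full_corpus.class_lookup.items())
for code, count in Counter(full_corpus.y).most_common():
    print '%s\t%d' % (label_for_code[code], count)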

In [9]:
polar_classes = [full_corpus.class_lookup[label] for label in ['For', 'Against']]
polar_indices = np.in1d(full_corpus.y, polar_classes)
polar_corpus = full_corpus.subset(rows=polar_indices)
print polar_corpus


<MulticlassCorpus X = (13627, 1), y = (13627,)>

In [10]:
# liwc_counts, liwc_categories = features.liwc([doc.document for doc in full_corpus.data])
liwc_features      = polar_corpus.extract_features(lambda doc: doc.document, features.liwc)
ngrams_features    = polar_corpus.extract_features(lambda doc: doc.document,
    features.ngrams, ngram_max=2, min_df=2, max_df=1.0)
# ngram_max=2, min_df=0.001, max_df=0.95
all_features = np.concatenate([intercept_features, liwc_features, ngrams_features])
print polar_corpus


<MulticlassCorpus X = (13627, 43449), y = (13627,)>
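
For reference, the ngram featurizer is presumably close to sklearn's CountVectorizer with the same parameters. A rough stand-in (the exact tokenization inside features.ngrams is an assumption):

In [ ]:
from sklearn.feature_extraction.text import CountVectorizer
# Approximate features.ngrams: unigrams + bigrams, dropping terms seen only once.
vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=2, max_df=1.0)
ngrams_X = vectorizer.fit_transform(doc.document for doc in polar_corpus.data)
print ngrams_X.shape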

In [14]:
posemo_features = npx.bool_mask_to_indices(polar_corpus.feature_names == 'posemo')
negemo_features = npx.bool_mask_to_indices(polar_corpus.feature_names == 'negemo')
emo_features    = npx.bool_mask_to_indices(
    np.in1d(polar_corpus.feature_names, ['posemo', 'negemo']))
# liwc_features, all_features
print posemo_features, negemo_features, emo_features


[43] [38] [38 43]
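
npx.bool_mask_to_indices presumably just converts a boolean mask into the integer positions of its True entries. A one-line NumPy equivalent (an assumption about the helper's behavior, consistent with the output above):

In [ ]:
# Assumed equivalent of npx.bool_mask_to_indices:
print np.flatnonzero(polar_corpus.feature_names == 'posemo')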

In [15]:
def corpus_mean_accuracy(corpus, penalty='l2', test_size=0.1, n_iter=10):
    """Mean test accuracy of a logistic regression over stratified shuffle-split folds."""
    folds = cross_validation.StratifiedShuffleSplit(corpus.y, test_size=test_size, n_iter=n_iter)
    accuracies = []
    for train_indices, test_indices in folds:
        train_corpus = corpus.subset(train_indices)
        test_corpus = corpus.subset(test_indices)

        model = linear_model.LogisticRegression(fit_intercept=False, penalty=penalty)
        model.fit(train_corpus.X, train_corpus.y)
        pred_y = model.predict(test_corpus.X)
        accuracy = metrics.accuracy_score(test_corpus.y, pred_y)
        accuracies.append(accuracy)
    return np.mean(accuracies)
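
If fold-to-fold variance is of interest, the same loop can report a spread as well. A hypothetical variant (not used for the table below):

In [ ]:
def corpus_accuracy_spread(corpus, penalty='l2', test_size=0.1, n_iter=10):
    # Hypothetical helper: mean and standard deviation of per-fold accuracy.
    folds = cross_validation.StratifiedShuffleSplit(corpus.y, test_size=test_size, n_iter=n_iter)
    accuracies = []
    for train_indices, test_indices in folds:
        train_corpus = corpus.subset(train_indices)
        test_corpus = corpus.subset(test_indices)
        model = linear_model.LogisticRegression(fit_intercept=False, penalty=penalty)
        model.fit(train_corpus.X, train_corpus.y)
        accuracies.append(metrics.accuracy_score(test_corpus.y, model.predict(test_corpus.X)))
    return np.mean(accuracies), np.std(accuracies)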

In [18]:
feature_sets = [
                ('Unigrams, bigrams, LIWC', all_features),
                ('Unigrams, bigrams', ngrams_features),
                ('LIWC (all)', liwc_features),
                ('LIWC (posemo, negemo)', emo_features),
                ('LIWC (posemo)', posemo_features),
                ('LIWC (negemo)', negemo_features),
                ('Baseline', intercept_features),
               ]

print '{:s} & {:s} & {:s} \\\\'.format('Features', 'Number of features', 'Accuracy')
for name, selected_features in feature_sets:
    subcorpus = polar_corpus.subset(features=selected_features)
    accuracy = corpus_mean_accuracy(subcorpus)    
    print '{:s} & {:d} & {:.2%} \\\\'.format(
        name, selected_features.size, accuracy).replace('%', '\\%')


Features & Number of features & Accuracy \\
Unigrams, bigrams, LIWC & 43449 & 96.10\% \\
Unigrams, bigrams & 43384 & 96.09\% \\
LIWC (all) & 64 & 81.67\% \\
LIWC (posemo, negemo) & 2 & 52.58\% \\
LIWC (posemo) & 1 & 41.75\% \\
LIWC (negemo) & 1 & 39.33\% \\
Baseline & 1 & 79.53\% \\

In [55]:
emo_corpus = polar_corpus.subset(features=emo_features)
# Counter(emo_corpus.y): the majority class has 10842 documents, the minority 2785
print 'majority_class = {:.2%}'.format(10842. / (10842. + 2785.))
# print emo_corpus.X.toarray()


majority_class = 79.56%
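
The hard-coded counts above can be derived directly from the label vector; a small sketch:

In [ ]:
# Majority-class proportion computed from the data rather than hard-coded.
counts = Counter(polar_corpus.y)
print 'majority_class = {:.2%}'.format(
    max(counts.values()) / float(sum(counts.values())))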

posemo / negemo correlation

We want to show that posemo counts do not correlate with the For / Against split, but track overall tweet volume much more closely (see the correlation sketch below).


In [14]:
times = np.array([doc.published for doc in full_corpus.data]).astype('datetime64[s]')

# Train on the polar (For/Against) subset, then predict stance over the full corpus.
model = linear_model.LogisticRegression(fit_intercept=False, penalty='l2')
model.fit(polar_corpus.X[:, ngrams_features], polar_corpus.y)
full_corpus_pred_y = model.predict(full_corpus.X[:, ngrams_features])
print full_corpus_pred_y


[4 4 4 ..., 4 0 0]

In [27]:
print times.size, full_corpus_pred_y.size
full_corpus_pred_labels = full_corpus.labels[full_corpus_pred_y]

# convert the label vector to a column matrix for binned_timeseries
values = full_corpus_pred_labels.reshape(-1, 1)

bin_edges, bin_values = timeseries.binned_timeseries(
    times, values,
    time_units_per_bin=7, time_unit='D', statistic='count')
print bin_edges
print bin_values.ravel()


14383 14383
['2011-07-28T19:00:00-0500' '2011-08-04T19:00:00-0500'
 '2011-08-11T19:00:00-0500' '2011-08-18T19:00:00-0500'
 '2011-08-25T19:00:00-0500' '2011-09-01T19:00:00-0500'
 '2011-09-08T19:00:00-0500' '2011-09-15T19:00:00-0500'
 '2011-09-22T19:00:00-0500' '2011-09-29T19:00:00-0500'
 '2011-10-06T19:00:00-0500' '2011-10-13T19:00:00-0500'
 '2011-10-20T19:00:00-0500' '2011-10-27T19:00:00-0500'
 '2011-11-03T19:00:00-0500'] [  350.   449.   640.   570.   578.   437.   439.   522.   443.   732.
   951.  1239.  1960.  1969.  3104.]

In [30]:
bins = dict(total=bin_values.ravel())
for label in ['For', 'Against']:
    indices = full_corpus_pred_y == full_corpus.class_lookup[label]
    # by week

    bin_edges, bin_values = timeseries.binned_timeseries(
        times[indices], values[indices],
        time_units_per_bin=7, time_unit='D', statistic='count')
    bin_values = bin_values.ravel()
    print bin_edges, bin_values
    # bin_values = npx.exponential_decay(bin_values.ravel(), window=14, alpha=0.75)
#     plt.plot(bin_edges, bin_values, label=label, **styles.next())

# datetime64_formatter = datetime_extra.datetime64_formatter
# axes = plt.gca()
# axes.xaxis.set_major_formatter(ticker.FuncFormatter(datetime_extra.datetime64_formatter))
# axes.grid(False)


['2011-07-28T19:00:00-0500' '2011-08-04T19:00:00-0500'
 '2011-08-11T19:00:00-0500' '2011-08-18T19:00:00-0500'
 '2011-08-25T19:00:00-0500' '2011-09-01T19:00:00-0500'
 '2011-09-08T19:00:00-0500' '2011-09-15T19:00:00-0500'
 '2011-09-22T19:00:00-0500' '2011-09-29T19:00:00-0500'
 '2011-10-06T19:00:00-0500' '2011-10-13T19:00:00-0500'
 '2011-10-20T19:00:00-0500' '2011-10-27T19:00:00-0500'
 '2011-11-03T19:00:00-0500'] [  99.   94.   91.  123.   94.   29.   34.   30.   56.   92.   78.  170.
  653.  812.  673.]
['2011-07-28T19:00:00-0500' '2011-08-04T19:00:00-0500'
 '2011-08-11T19:00:00-0500' '2011-08-18T19:00:00-0500'
 '2011-08-25T19:00:00-0500' '2011-09-01T19:00:00-0500'
 '2011-09-08T19:00:00-0500' '2011-09-15T19:00:00-0500'
 '2011-09-22T19:00:00-0500' '2011-09-29T19:00:00-0500'
 '2011-10-06T19:00:00-0500' '2011-10-13T19:00:00-0500'
 '2011-10-20T19:00:00-0500' '2011-10-27T19:00:00-0500'
 '2011-11-03T19:00:00-0500'] [  251.   355.   549.   447.   484.   408.   405.   492.   387.   640.
   873.  1069.  1307.  1157.  2431.]
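
To quantify the volume claim from the note above, we can correlate weekly posemo totals with the weekly totals just computed. A sketch, assuming liwc_counts / liwc_categories come from the features.liwc call that is commented out earlier:

In [ ]:
# Pearson correlation between weekly posemo totals and weekly tweet volume.
liwc_counts, liwc_categories = features.liwc([doc.document for doc in full_corpus.data])
posemo_column = liwc_counts[:, liwc_categories.index('posemo')].toarray()
_, posemo_weekly = timeseries.binned_timeseries(
    times, posemo_column, time_units_per_bin=7, time_unit='D', statistic='sum')
print np.corrcoef(posemo_weekly.ravel(), bins['total'])[0, 1]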

In [ ]:
# Requires liwc_counts / liwc_categories (see the sketch above) and a time_hist
# plotting helper (defined elsewhere) that bins a timeseries and plots it.
for liwc_category in ['posemo', 'negemo']:
    plt.cla()
    styles = distinct_styles()
    counts = liwc_counts[:, liwc_categories.index(liwc_category)].toarray()
    time_hist('Overall %s' % liwc_category, times, counts,
        statistic='sum', **styles.next())
    for label in ['For', 'Against']:
        indices = full_corpus_pred_y == full_corpus.class_lookup[label]
        time_hist('%s-class %s' % (label, liwc_category),
            times[indices], counts[indices], statistic='sum', **styles.next())
    plt.title('LIWC category: %s' % liwc_category)
    plt.ylabel('Frequency')
    plt.xlabel('Date')
    axes = plt.gca()
    axes.xaxis.set_major_formatter(ticker.FuncFormatter(datetime_extra.datetime64_formatter))
    axes.grid(False)
    plt.xlim(np.array(npx.bounds(times)).astype(float))
    plt.gcf().set_size_inches(8, 5)
    plt.legend(loc='best')
    plt.savefig(figure_path('liwc-%s-for-vs-against.pdf' % liwc_category))

LIWC visualization


In [1]:
from lexicons import Liwc

In [5]:
lexicon = Liwc()
document = """I do live in OH, & yes, it's awful. NO on #Issue2  RT @RachelAnneLevy: I don't live in Ohio but from what i can tell #sb5 sounds awful."""

In [16]:
counter = Counter(lexicon.read_document(document))
print counter


Counter({u'funct': 14, u'pronoun': 5, u'verb': 4, u'auxverb': 4, u'preps': 4, u'present': 4, u'ppron': 3, u'i': 3, u'space': 3, u'relativ': 3, u'cogmech': 2, u'ipron': 2, u'assent': 2, u'negemo': 2, u'negate': 2, u'affect': 2, u'hear': 1, u'percept': 1, u'conj': 1, u'social': 1, u'excl': 1})

In [18]:
for category, count in counter.most_common(100):
    print '%s\t%s' % (category, count)


funct	14
pronoun	5
verb	4
auxverb	4
preps	4
present	4
ppron	3
i	3
space	3
relativ	3
cogmech	2
ipron	2
assent	2
negemo	2
negate	2
affect	2
hear	1
percept	1
conj	1
social	1
excl	1

In [11]:
for match in re.finditer(r"[a-z]['a-z]*", document, re.I):
    token = match.group(0)
    categories = list(lexicon.read_token(token.lower()))
    print '%s & %s' % (token, ' & '.join(categories))


I & funct & pronoun & ppron & i
do & verb & funct & auxverb & present
live & 
in & funct & preps & space & relativ
OH & assent
yes & assent
it's & funct & pronoun & ipron & verb & auxverb & present
awful & affect & negemo
NO & funct & negate
on & funct & preps & space & relativ
Issue & cogmech
RT & 
RachelAnneLevy & 
I & funct & pronoun & ppron & i
don't & verb & funct & auxverb & present & negate
live & 
in & funct & preps & space & relativ
Ohio & 
but & funct & conj & cogmech & excl
from & funct & preps
what & funct & pronoun & ipron
i & funct & pronoun & ppron & i
can & verb & funct & auxverb & present
tell & social
sb & 
sounds & percept & hear
awful & affect & negemo
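
The per-token matches above suggest that read_document reduces to flattening read_token over these regex tokens (the token-level 'funct' hits sum to 14, matching the Counter earlier). A sketch of that assumed equivalence:

In [ ]:
# Assumed equivalent of lexicon.read_document: tokenize, then flatten per-token categories.
tokens = re.findall(r"[a-z]['a-z]*", document, re.I)
print Counter(category for token in tokens
              for category in lexicon.read_token(token.lower()))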
