In [7]:
import IPython
import re
import numpy as np
import pandas as pd
from tsa.science import numpy_ext as npx
from collections import Counter
import viz
from sklearn import metrics, cross_validation
from sklearn import linear_model
from tsa import stdout, stderr
from tsa.lib import tabular, datetime_extra
from tsa.lib.timer import Timer
from tsa.models import Source, Document, create_session
from tsa.science import features, models, timeseries
from tsa.science.corpora import MulticlassCorpus
from tsa.science.plot import plt, figure_path, distinct_styles, ticker
from tsa.science.summarization import metrics_dict, metrics_summary
In [2]:
documents = Source.from_name('sb5b')
full_corpus = MulticlassCorpus(documents)
full_corpus.apply_labelfunc(lambda doc: doc.label)
# empty X will still have a shape of (1, 0)
intercept_features = full_corpus.extract_features(lambda doc: 1, features.intercept)
print full_corpus
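In [ ]:
# Sanity check (illustrative sketch; X, y, and feature_names are assumed
# attributes of MulticlassCorpus, consistent with how they are used below):
print full_corpus.X.shape, full_corpus.y.shape
print full_corpus.feature_names[intercept_features]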
In [9]:
polar_classes = [full_corpus.class_lookup[label] for label in ['For', 'Against']]
polar_indices = np.in1d(full_corpus.y, polar_classes)
polar_corpus = full_corpus.subset(rows=polar_indices)
print polar_corpus
In [10]:
# liwc_counts, liwc_categories = features.liwc([doc.document for doc in full_corpus.data])
liwc_features = polar_corpus.extract_features(lambda doc: doc.document, features.liwc)
ngrams_features = polar_corpus.extract_features(lambda doc: doc.document,
    features.ngrams, ngram_max=2, min_df=2, max_df=1.0)
# ngram_max=2, min_df=0.001, max_df=0.95
# each *_features value is an array of column indices into the shared feature
# space, so feature sets can be combined by concatenation
all_features = np.concatenate([intercept_features, liwc_features, ngrams_features])
print polar_corpus
In [14]:
posemo_features = npx.bool_mask_to_indices(polar_corpus.feature_names == 'posemo')
negemo_features = npx.bool_mask_to_indices(polar_corpus.feature_names == 'negemo')
emo_features = npx.bool_mask_to_indices(
    np.in1d(polar_corpus.feature_names, ['posemo', 'negemo']))
# liwc_features, all_features
print posemo_features, negemo_features, emo_features
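In [ ]:
# bool_mask_to_indices presumably converts a boolean mask to integer indices,
# i.e. roughly np.nonzero(mask)[0]; a standalone illustration:
mask = np.in1d(np.array(['intercept', 'posemo', 'negemo']), ['posemo', 'negemo'])
print np.nonzero(mask)[0]  # -> [1 2]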
In [15]:
def corpus_mean_accuracy(corpus, penalty='l2', test_size=0.1, n_iter=10):
    '''Mean held-out accuracy of a logistic regression over n_iter stratified shuffle splits.'''
    folds = cross_validation.StratifiedShuffleSplit(corpus.y, test_size=test_size, n_iter=n_iter)
    accuracies = []
    for train_indices, test_indices in folds:
        train_corpus = corpus.subset(train_indices)
        test_corpus = corpus.subset(test_indices)
        model = linear_model.LogisticRegression(fit_intercept=False, penalty=penalty)
        model.fit(train_corpus.X, train_corpus.y)
        pred_y = model.predict(test_corpus.X)
        accuracies.append(metrics.accuracy_score(test_corpus.y, pred_y))
    return np.mean(accuracies)
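In [ ]:
# Illustration of the scheme used by corpus_mean_accuracy: with this (pre-0.18)
# sklearn API, StratifiedShuffleSplit takes y directly and yields random
# (train, test) index pairs that preserve the class proportions.
toy_y = np.array([0] * 10 + [1] * 10)
for train_indices, test_indices in cross_validation.StratifiedShuffleSplit(toy_y, n_iter=3, test_size=0.2):
    print np.bincount(toy_y[test_indices])  # -> [2 2] each time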
In [18]:
feature_sets = [
    ('Unigrams, bigrams, LIWC', all_features),
    ('Unigrams, bigrams', ngrams_features),
    ('LIWC (all)', liwc_features),
    ('LIWC (posemo, negemo)', emo_features),
    ('LIWC (posemo)', posemo_features),
    ('LIWC (negemo)', negemo_features),
    ('Baseline', intercept_features),
]
print '{:s} & {:s} & {:s} \\\\'.format('Features', 'Number of features', 'Accuracy')
for name, selected_features in feature_sets:
    subcorpus = polar_corpus.subset(features=selected_features)
    accuracy = corpus_mean_accuracy(subcorpus)
    print '{:s} & {:d} & {:.2%} \\\\'.format(
        name, selected_features.size, accuracy).replace('%', '\\%')
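In [ ]:
# The .replace('%', '\\%') above escapes percent signs for the LaTeX table:
print '{:.2%}'.format(0.795).replace('%', '\\%')  # -> 79.50\%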
In [55]:
emo_corpus = polar_corpus.subset(features=emo_features)
print Counter(emo_corpus.y)
# majority-class baseline accuracy, computed from the counts above
print 'majority_class = {:.2%}'.format(10842. / (10842. + 2785.))
# print emo_corpus.X.toarray()
In [14]:
times = np.array([doc.published for doc in full_corpus.data]).astype('datetime64[s]')
# train on the polar For/Against subset, then predict a label for every document
model = linear_model.LogisticRegression(fit_intercept=False, penalty='l2')
model.fit(polar_corpus.X[:, ngrams_features], polar_corpus.y)
full_corpus_pred_y = model.predict(full_corpus.X[:, ngrams_features])
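In [ ]:
# Quick look at how the ngram model distributes predictions over the full
# corpus, mapping class indices back to names via full_corpus.labels:
print Counter(full_corpus.labels[full_corpus_pred_y])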
In [27]:
print times.size, full_corpus_pred_y.size
full_corpus_pred_labels = full_corpus.labels[full_corpus_pred_y]
values = full_corpus_pred_labels.reshape(-1, 1)
bin_edges, bin_values = timeseries.binned_timeseries(
    times, values,
    time_units_per_bin=7, time_unit='D', statistic='count')
print bin_edges
print bin_values.ravel()
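In [ ]:
# For intuition, the weekly 'count' statistic is roughly a histogram over
# days-since-epoch; a plain numpy sketch (not the tsa.science.timeseries code):
days = times.astype('datetime64[D]').astype(int)
# pad the edges so the last partial week is still covered
edges = np.arange(days.min(), days.max() + 8, 7)
counts, _ = np.histogram(days, bins=edges)
print counts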
In [30]:
bins = dict(total=bin_values.ravel())
for label in ['For', 'Against']:
    indices = full_corpus_pred_y == full_corpus.class_lookup[label]
    # by week
    bin_edges, bin_values = timeseries.binned_timeseries(
        times[indices], values[indices],
        time_units_per_bin=7, time_unit='D', statistic='count')
    bin_values = bin_values.ravel()
    print bin_edges, bin_values
    # bin_values = npx.exponential_decay(bin_values.ravel(), window=14, alpha=0.75)
    # plt.plot(bin_edges, bin_values, label=label, **styles.next())
# datetime64_formatter = datetime_extra.datetime64_formatter
# axes = plt.gca()
# axes.xaxis.set_major_formatter(ticker.FuncFormatter(datetime_extra.datetime64_formatter))
# axes.grid(False)
In [ ]:
for liwc_category in ['posemo', 'negemo']:
    plt.cla()
    styles = distinct_styles()
    # liwc_counts / liwc_categories come from features.liwc (see the commented-out
    # call in In [10]); time_hist is a plotting helper assumed to be defined elsewhere
    counts = liwc_counts[:, liwc_categories.index(liwc_category)].toarray()
    time_hist('Overall %s' % liwc_category, times, counts,
              statistic='sum', **styles.next())
    for label in ['For', 'Against']:
        indices = full_corpus_pred_y == full_corpus.class_lookup[label]
        time_hist('%s-class %s' % (label, liwc_category),
                  times[indices], counts[indices], statistic='sum', **styles.next())
    plt.title('LIWC category: %s' % liwc_category)
    plt.ylabel('Frequency')
    plt.xlabel('Date')
    axes = plt.gca()
    axes.xaxis.set_major_formatter(ticker.FuncFormatter(datetime_extra.datetime64_formatter))
    axes.grid(False)
    plt.xlim(np.array(npx.bounds(times)).astype(float))
    plt.gcf().set_size_inches(8, 5)
    plt.legend(loc='best')
    plt.savefig(figure_path('liwc-%s-for-vs-against.pdf' % liwc_category))

IPython.embed()  # drop into an interactive shell for inspection
# convert vector to column matrix
values = full_corpus_pred_y.reshape((-1, 1))
plt.cla()
In [1]:
from lexicons import Liwc
In [5]:
lexicon = Liwc()
document = """I do live in OH, & yes, it's awful. NO on #Issue2 RT @RachelAnneLevy: I don't live in Ohio but from what i can tell #sb5 sounds awful."""
In [16]:
counter = Counter(lexicon.read_document(document))
print counter
In [18]:
for category, count in counter.most_common(100):
    print '%s\t%s' % (category, count)
In [11]:
for match in re.finditer(r"[a-z]['a-z]*", document, re.I):
    token = match.group(0)
    # read_token yields the LIWC categories that the token matches
    categories = list(lexicon.read_token(token.lower()))
    print '%s & %s' % (token, ' & '.join(categories))
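In [ ]:
# The tokenizer above matches words (with internal apostrophes) case-insensitively;
# a standalone example:
print re.findall(r"[a-z]['a-z]*", "it's awful, DON'T you think?", re.I)
# -> ["it's", 'awful', "DON'T", 'you', 'think']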