In [37]:
import os
from tsa.lib import tabular, html
from collections import Counter
import itertools
# from datetime import datetime
In [16]:
# Label columns of the spreadsheet, in priority order: the first one that is
# set on a row becomes that row's label.
label_keys = ['For', 'Against', 'Neutral', 'Broken Link', 'Not Applicable']


def read(filepath='%s/ohio/sb5-b.xlsx' % os.getenv('CORPORA', '.')):
    """Yield each non-header row of the spreadsheet, augmented with two keys.

    filepath: path handed to tabular.read_xlsx. The default points at
        ohio/sb5-b.xlsx below the directory named by the CORPORA environment
        variable (or '.' when unset); it is computed once, when the function
        is defined.

    Every yielded row is the object produced by tabular.read_xlsx, modified
    in place with:
        row['label']    -- the first entry of label_keys whose cell is truthy,
                           or 'NA' when none is.
        row['document'] -- row['Tweet'] passed through html.unescape.
    """
    header_columns = ('Tweet', 'Author', 'TweetID')
    for row in tabular.read_xlsx(filepath):
        # A row whose cells simply repeat the column names is a header line
        # embedded in the data; it carries no tweet, so it is skipped.
        if all(row[column] == column for column in header_columns):
            continue
        marked = [key for key in label_keys if row[key]]
        row['label'] = marked[0] if marked else 'NA'
        row['document'] = html.unescape(row['Tweet'])
        yield row
In [17]:
# Drain the read() generator once so the rows can be filtered repeatedly below.
original_table = list(read())
print('Original spreadsheet: N=%d' % len(original_table))
In [28]:
# Keep only the rows whose resolved label takes a side.
for_against = [
    row for row in original_table
    if row['label'] in ('For', 'Against')
]
print('Just For + Against: N=%d' % len(for_against))

# Tally the two labels and report which share of them is 'Against'.
for_against_counts = Counter([row['label'] for row in for_against])
print('Against / Total {:.2%}'.format(
    float(for_against_counts['Against']) / len(for_against)))

# Last expression of the cell: shown as the cell's output.
for_against_counts
Out[28]:
In [58]:
# Same breakdown as for all For/Against rows, restricted to the rows that are
# not flagged as inferred (Inferred != 1).
for_against_manual = [
    row for row in for_against
    if row['Inferred'] != 1
]
print('Just non-Inferred For + Against: N=%d' % len(for_against_manual))

for_against_manual_counts = Counter([row['label'] for row in for_against_manual])
print('Against / Total {:.4%}'.format(
    float(for_against_manual_counts['Against']) / len(for_against_manual)))

# Last expression of the cell: shown as the cell's output.
for_against_manual_counts
Out[58]:
In [22]:
# Distribution of the values found in the 'Inferred' column among the
# For/Against rows.
Counter(record['Inferred'] for record in for_against)
Out[22]:
In [36]:
# NOTE(review): these literals look like counts copied by hand from earlier
# cell outputs (presumably 'For' count / total for all rows and for the
# non-inferred rows) -- confirm, and consider deriving them from
# for_against_counts / for_against_manual_counts so a re-run stays in sync.
(1 - 3440 / 10842.0, 1 - 732 / 2785.0)
Out[36]:
In [35]:
# Rows flagged Inferred == 1. Elsewhere in this notebook the rows with
# Inferred != 1 are the ones called "manual" / "non-Inferred", so this
# selection is the *inferred* subset and is named accordingly.
inferred_table = [row for row in original_table if row['Inferred'] == 1]
# Backward-compatible alias for the original (misleading) name, in case other
# cells still refer to it.
manual_table = inferred_table
# Sanity check: the label tally of the inferred rows is printed so the reader
# can see which labels inferences were made for.
print('%s %s' % (
    'Inferences were only made for For / Against labels:',
    Counter([row['label'] for row in inferred_table])))
In [66]:
def author(row):
    """Return the row's 'Author' value; used as the sort and grouping key."""
    return row['Author']


# Assign each author a single stance: the label they used most often among
# their For/Against rows. Authors with no For/Against rows are left out.
users = []
# groupby only merges adjacent rows with equal keys, hence the sort by the
# same key first. The loop variable is deliberately NOT called `author`:
# reusing that name would rebind the key function above to a string, breaking
# any later call to author(row).
for author_name, rows in itertools.groupby(sorted(original_table, key=author), author):
    counts = Counter(row['label'] for row in rows if row['label'] in ('For', 'Against'))
    if counts:
        # most_common always returns a list of (label, count) tuples
        # NOTE(review): when an author has as many 'For' as 'Against' rows,
        # the winner is whichever entry Counter happens to list first --
        # confirm that this tie-breaking is acceptable for the analysis.
        users.append(counts.most_common(1)[0][0])
user_counts = Counter(users)
print(user_counts)
print('Against / Total {:.2%}'.format(
    user_counts['Against'] / float(user_counts['For'] + user_counts['Against'])))
In [67]:
# NOTE(review): presumably the two per-user counts printed by the previous
# cell, added by hand -- confirm, and consider summing user_counts instead.
993 + 195
Out[67]: