In [37]:
import os
from tsa.lib import tabular, html
from collections import Counter
import itertools
# from datetime import datetime

In [16]:
# Annotation columns of the spreadsheet, in priority order: the first one with
# a truthy cell becomes the row's label.
label_keys = ['For', 'Against', 'Neutral', 'Broken Link', 'Not Applicable']

def read(filepath='%s/ohio/sb5-b.xlsx' % os.getenv('CORPORA', '.')):
    """Yield the data rows of the spreadsheet at `filepath`, annotated in place.

    Rows whose 'Tweet', 'Author' and 'TweetID' cells merely repeat the column
    names are treated as header rows and skipped. Every other row gains:

    * 'label': the first entry of `label_keys` whose cell is truthy, or 'NA'
      when none of them is set;
    * 'document': the 'Tweet' cell passed through `html.unescape`.

    The default path is built from the CORPORA environment variable (falling
    back to '.') once, at the time this function is defined.
    """
    for row in tabular.read_xlsx(filepath):
        if row['Tweet'] == 'Tweet' and row['Author'] == 'Author' and row['TweetID'] == 'TweetID':
            # a header row embedded in the sheet, not data
            continue

        # Collect every annotation column that is set, keeping label_keys order.
        marked = []
        for key in label_keys:
            if row[key]:
                marked.append(key)

        row['label'] = marked[0] if marked else 'NA'
        row['document'] = html.unescape(row['Tweet'])
        yield row

In [17]:
# Materialise the generator so the rows can be filtered and counted repeatedly.
original_table = [row for row in read()]
print('Original spreadsheet: N=%d' % len(original_table))


106702

In [28]:
# Keep only the rows labelled 'For' or 'Against'.
for_against = []
for row in original_table:
    if row['label'] in ('For', 'Against'):
        for_against.append(row)
print('Just For + Against: N=%d' % len(for_against))

# Tally the two labels and report the fraction that is 'Against'.
for_against_counts = Counter(row['label'] for row in for_against)
against_share = for_against_counts['Against'] / float(len(for_against))
print('Against / Total {:.2%}'.format(against_share))
for_against_counts


Just For + Against: N=13627
Against / Total 79.56%
Out[28]:
Counter({'Against': 10842, 'For': 2785})

In [58]:
# Drop every For/Against row whose 'Inferred' flag equals 1; rows with any
# other value in that cell are kept.
for_against_manual = list(filter(lambda row: row['Inferred'] != 1, for_against))
print('Just non-Inferred For + Against: N=%d' % len(for_against_manual))

# Same tally and 'Against' fraction as for the full For/Against set.
for_against_manual_counts = Counter(row['label'] for row in for_against_manual)
manual_against_share = for_against_manual_counts['Against'] / float(len(for_against_manual))
print('Against / Total {:.4%}'.format(manual_against_share))
for_against_manual_counts


Just non-Inferred For + Against: N=4172
Against / Total 82.4545%
Out[58]:
Counter({'Against': 3440, 'For': 732})

In [22]:
# Distribution of the 'Inferred' cell across the For/Against rows.
inferred_flags = [row['Inferred'] for row in for_against]
Counter(inferred_flags)


Out[22]:
Counter({1.0: 9455, None: 3938, 0.0: 234})

In [36]:
# Per label (Against, then For): 1 - (non-inferred count / total count), i.e.
# the fraction of that label's rows that are NOT in the non-inferred subset.
# Derived from the counters rather than from numbers copied out of earlier
# cell output, so it stays correct when the spreadsheet changes.
(1 - for_against_manual_counts['Against'] / float(for_against_counts['Against']),
 1 - for_against_manual_counts['For'] / float(for_against_counts['For']))


Out[36]:
(0.6827153661686036, 0.7371633752244164)

In [35]:
# Rows whose 'Inferred' flag is 1. The original name `manual_table` was a
# misnomer: in `for_against_manual` "manual" means Inferred != 1, the opposite
# selection. The list is therefore built under an accurate name, and the old
# name is kept purely as an alias for any cell that still refers to it.
inferred_table = [row for row in original_table if row['Inferred'] == 1]
manual_table = inferred_table

# Single formatted string so the output is the same under the print statement
# and the print function.
print('{0} {1}'.format('Inferences were only made for For / Against labels:',
                       Counter(row['label'] for row in inferred_table)))


Inferences were only made for For / Against labels: Counter({'Against': 7402, 'For': 2053})

In [66]:
def author(row):
    """Return the row's 'Author' cell; used as the sort and grouping key."""
    return row['Author']

# One entry per author who has at least one For/Against row: the label that
# author used most often.
users = []
# groupby only merges adjacent rows, hence the sort on the same key first.
# The loop variable is deliberately not named `author`: that would rebind the
# key function above to a plain string as soon as the loop runs.
for author_name, rows in itertools.groupby(sorted(original_table, key=author), author):
    counts = Counter(row['label'] for row in rows if row['label'] in ('For', 'Against'))
    # most_common always returns a list of (label, count) tuples; when an
    # author has equally many For and Against rows, which one comes first is
    # decided by Counter.most_common's own ordering.
    if counts:
        users.append(counts.most_common(1)[0][0])
user_counts = Counter(users)
print(user_counts)
print('Against / Total {:.2%}'.format(user_counts['Against'] / float(user_counts['For'] + user_counts['Against'])))


Counter({'Against': 993, 'For': 195})
Against / Total 83.59%

In [67]:
# Number of authors counted above, taken from the counter itself instead of
# retyping the figures from the previous cell's output.
sum(user_counts.values())


Out[67]:
1188