In [36]:
import itertools
import json

In [1]:
# Location of the LIWC 2007 dictionary export read by this notebook.
LIWC_PATH = '/usr/local/data/liwc_2007.csv'
liwc_fd = open(LIWC_PATH)

In [2]:
# Read the whole dictionary into memory (one string per line, line terminators
# kept), then release the file handle so it does not stay open for the rest of
# the kernel session.
try:
    liwc_lines = liwc_fd.readlines()
finally:
    liwc_fd.close()

In [28]:
# Peek at the last three raw lines to check the format; the output shows each
# entry as '<pattern>\t<comma-separated categories>\n'.
liwc_lines[-3:]


Out[28]:
['bottom*\tspace,relativ\n',
 'revenge*\tanger,negemo,affect\n',
 'tast*\tingest,percept,bio\n']

In [30]:
def iter_matches(category, lines=None):
    """Yield the match pattern of every LIWC line tagged with `category`.

    Each line is expected to look like ``pattern<TAB>cat1,cat2,...`` followed
    by a line terminator, as produced by ``readlines()``.

    Args:
        category: Category name to look for (compared exactly).
        lines: Optional iterable of raw lines to scan. Defaults to the
            notebook-level ``liwc_lines``.

    Yields:
        The text before the first tab of each line whose category list
        contains `category`.

    Raises:
        ValueError: If a non-blank line contains no tab separator.
    """
    if lines is None:
        lines = liwc_lines
    for line in lines:
        # readlines() keeps the line terminator; without stripping it the last
        # category on every line would be e.g. 'affect\n' and never match.
        line = line.rstrip('\r\n')
        if not line:
            continue  # tolerate blank lines (e.g. a trailing empty line)
        match, categories_csv = line.split('\t', 1)
        if category in categories_csv.split(','):
            yield match

In [38]:
categories = ['posemo', 'negemo']
# Report how many dictionary patterns are tagged with each category of interest.
for category in categories:
    print('%d %s matches' % (sum(1 for _ in iter_matches(category)), category))


406 posemo matches
495 negemo matches

Results

The JSON dump below is commented out so that the LIWC dictionary contents do not get pushed up to GitHub.


In [43]:
# Map each category of interest to its alphabetically sorted list of patterns.
liwc_json = dict((name, sorted(iter_matches(name))) for name in categories)
# print json.dumps(liwc_json)