Results
Here we reproduce Figures 1 & 3 and Table 3 from the AAAI 2015 paper entitled "Using Matched Samples to Estimate the Effects of Exercise on Mental Health from Twitter".
This notebook reads in the final mood classifications for users in three different groups: the exercise group, a matched control group, and a random control group.
We compare the differences in aggregate mood classifications between groups using a Wilcoxon signed-rank test.
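For orientation, here is a minimal sketch of that test on synthetic paired data (the numbers below are made up purely for illustration; the actual test on the classifier output appears in a later cell):

# Illustrative only: a Wilcoxon signed-rank test on synthetic paired proportions.
import numpy as np
from scipy.stats import wilcoxon

np.random.seed(42)
match_prop = np.random.beta(2, 10, size=50)                  # hypothetical matched-group proportions
exercise_prop = np.clip(match_prop - 0.02 +
                        np.random.normal(0, 0.01, size=50), 0, 1)  # paired, slightly lower
stat, pval = wilcoxon(match_prop, exercise_prop)
print('W = %.1f, p = %.3g' % (stat, pval))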
Note: Since our annotated data is somewhat sensitive (e.g., linking Twitter accounts to mood), we have elected not to share a public link to the data. Please contact the authors to discuss possible data sharing agreements.
In [1]:
# Download and extract data (see note above).
import tarfile
import urllib
DATA_URL_PATH = 'http://tapi.cs.iit.edu/data/aaai-2015-matching/'
DATA_FILE = 'aaai-2015-matching-data.tgz'
DATA_URL = DATA_URL_PATH + DATA_FILE
print 'downloading %s' % (DATA_URL)
urllib.urlretrieve(DATA_URL, DATA_FILE)
print 'extracting %s' % (DATA_FILE)
tar = tarfile.open(DATA_FILE)
tar.extractall()
tar.close()
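The cell above uses Python 2 syntax (print statements, urllib.urlretrieve). If you are running Python 3, a roughly equivalent sketch, with the same URL and archive name, would be:

# Python 3 sketch of the download/extract step above (same URL and archive name).
import tarfile
from urllib.request import urlretrieve

DATA_URL = 'http://tapi.cs.iit.edu/data/aaai-2015-matching/aaai-2015-matching-data.tgz'
DATA_FILE = 'aaai-2015-matching-data.tgz'
print('downloading %s' % DATA_URL)
urlretrieve(DATA_URL, DATA_FILE)
print('extracting %s' % DATA_FILE)
with tarfile.open(DATA_FILE) as tar:
    tar.extractall()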
The main file contains classifier output for each user, their matched pair, and a random pair for each of the three mood classes.
In [2]:
!head -2 classifications.csv
The columns of this file are as follows:
u
: User id (exercise group)
u_AH
: Proportion of tweets predicted as Anger/Hostility
u_DD
: Proportion of tweets predicted as Depression/Dejection
u_TA
: Proportion of tweets predicted as Tension/Anxiety
u_avg
: Average proportion of AH/DD/TA
m
: User id (matched group)
m_AH
: Proportion of tweets predicted as Anger/Hostility
m_DD
: Proportion of tweets predicted as Depression/Dejection
m_TA
: Proportion of tweets predicted as Tension/Anxiety
m_avg
: Average proportion of AH/DD/TA
diff_AH
: (exercise - matched) proportion for AH
diff_DD
: (exercise - matched) proportion for DD
diff_TA
: (exercise - matched) proportion for TA
diff_avg
: (exercise - matched) proportion for avg
r
: User id (random control)
r_AH
: Proportion of tweets predicted as Anger/Hostility
r_DD
: Proportion of tweets predicted as Depression/Dejection
r_TA
: Proportion of tweets predicted as Tension/Anxiety
r_avg
: Average proportion of AH/DD/TA
A similar file was generated using a classifier trained on half of the training data. The columns are the same, but there are no columns for a random control.
In [3]:
!head -2 half_classifications.csv
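As a quick sanity check (a sketch, assuming the archive from the first cell has been extracted into the working directory), one can verify that the diff_* columns in classifications.csv equal the exercise-minus-matched differences described above:

# Sanity check: diff_* should equal u_* - m_* (up to rounding in the file).
import csv

with open('classifications.csv', 'rt') as f:
    rows = list(csv.DictReader(f))
print('%d rows' % len(rows))
for col in ['AH', 'DD', 'TA', 'avg']:
    max_err = max(abs(float(r['diff_' + col]) -
                      (float(r['u_' + col]) - float(r['m_' + col])))
                  for r in rows)
    print('diff_%s vs. u_%s - m_%s: max deviation %g' % (col, col, col, max_err))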
Finally, there are three files in stats/ containing profile information for each user in the exercise group (sport_users_stats), matched control group (nosport_users_stats), and random control group (random_users_stats).
In [4]:
!head -2 stats/sport_users_stats
The columns for these files are:
id
: user id
gender
: estimated gender (based on first name and Census data)
city
: estimated city of origin, from the location field
state
: estimated state of origin, from the location field
statuses_count
: number of tweets
followers_count
: number of followers
friends_count
: number of friends
With this data, we will perform hypothesis tests to measure the differences in estimated mood across groups.
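Before running the tests, here is a small sketch (assuming the stats/ files were extracted by the first cell) of how one of these profile files can be summarized; the 'f' gender code matches what the plotting cell below uses for % Female:

# Sketch: summarize one profile file (assumes stats/sport_users_stats exists locally).
import csv
from collections import Counter

with open('stats/sport_users_stats', 'rt') as f:
    sport = list(csv.DictReader(f))
print('%d exercise users' % len(sport))
genders = Counter(r['gender'] for r in sport)
print('%% female: %.1f' % (100.0 * genders['f'] / sum(genders.values())))
statuses = sorted(float(r['statuses_count']) for r in sport)
print('median statuses_count: %.0f' % statuses[len(statuses) // 2])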
In [5]:
# Read classifications.csv and half_classifications.csv
# Note that half_classifications includes rows without a random match, thus the counts differ.
from numpy import array as npa
def read_results(fname):
    header = None
    results = []
    for line in open(fname, 'rt'):
        parts = line.strip().split(',')
        if not header:
            header = parts
        else:
            results.append(npa([float(x) for x in parts]))
    return header, npa(results)
fields_all, results_all = read_results('classifications.csv')
fields_half, results_half = read_results('half_classifications.csv')
print 'read %d results from classifications.csv' % (len(results_all))
print 'read %d results from half_classifications.csv' % (len(results_half))
In [9]:
# Generate boxplots of the mood predictions from classifications.csv (Figure 3 in the paper).
import matplotlib.pyplot as plt
import numpy as np
def get_labels(fields, random_users=False):
    labels = []
    for i, (label, pretty_label) in enumerate([('AH', 'Hostility'), ('DD', 'Dejection'),
                                               ('TA', 'Anxiety')]):  # , ('avg', 'Average')]):
        sporty_idx = fields.index('u_' + label)
        nonsporty_idx = fields.index('m_' + label)
        labels_tuple = [label, nonsporty_idx, sporty_idx, pretty_label]
        if random_users:
            random_idx = fields.index('r_' + label)
            labels_tuple.append(random_idx)  # (label, nonsporty_idx, sporty_idx, pretty_label, random_idx)
        labels.append(labels_tuple)
    return labels

def plot_results(labels, results, random_users=False):
    f, axes = plt.subplots(1, 3, sharex=True, sharey=True, figsize=(5, 3))
    xticklabels = ['match', 'exercise']
    if random_users:
        xticklabels.append('random')
    for i, label in enumerate(labels):
        boxplots = [results[:, label[1]], results[:, label[2]]]
        if random_users:
            boxplots.append(results[:, label[4]])
        axes[i].boxplot(boxplots, showfliers=False, widths=.7)
        axes[i].set_ylabel('P(' + label[3] + ')', size=10)
        axes[i].yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
                           alpha=0.5)
        axes[i].set_xticklabels(xticklabels, rotation=90)
    f.tight_layout()
    f.show()
    plt.savefig('classifications.pdf', bbox_inches='tight')
# classifier trained on all tweets
labels_all = get_labels(fields_all, random_users=False)
plot_results(labels_all, results_all, random_users=False)
In [10]:
# Perform Wilcoxon signed-rank test of significance (Table 3 from the paper).
import numpy as np
from scipy.stats import wilcoxon
def pct_reduction(before, after):
    return 100. * (after - before) / before

def test_significance(labels, results, idx1=1, idx2=2, diff_legend='% Change (vs. match)'):  # 1: nosport, 2: sport, 4: random
    print '%10s\t%15s\t%10s' % ('Category', diff_legend, 'p-value')
    for i, label in enumerate(labels):
        match = results[:, label[idx1]]
        exercise = results[:, label[idx2]]
        wil = wilcoxon(match, exercise)
        print '%10s\t%2.1f\t%10.2g' % (label[3],
                                       pct_reduction(np.mean(match), np.mean(exercise)),
                                       wil[1])
print 'all the labeled tweets'
labels_all = get_labels(fields_all, random_users=True)
test_significance(labels_all, results_all)
test_significance(labels_all, results_all, idx1=4, diff_legend='% Change (vs. random)')
print 'half of the labeled tweets'
labels_half = get_labels(fields_half)
test_significance(labels_half, results_half)
In [11]:
# Plot distribution of samples (Figure 1 from paper).
import csv
from collections import Counter
import random
random.seed(1234567)
def read_stats(fname):
    with open(fname, 'rb') as csvfile:
        reader = csv.DictReader(csvfile, delimiter=',')
        return [r for r in reader]

def get_col(stats, label, value):
    if value:
        counts = Counter([x[label] for x in stats])
        return 1. * counts[value] / sum(counts.values())
    else:
        return [float(x[label]) for x in stats]

# Plot boxplots for each stat.
def plot_stats(nosport, sport, rnd):
    f, axes = plt.subplots(2, 3, sharex=True, figsize=(9, 5))
    labels = [('statuses_count', '# Statuses', None),
              ('followers_count', '# Followers', None),
              ('friends_count', '# Friends', None),
              ('gender', '% Female', 'f'),
              ('state', '% from California', 'California'),
              ]
    for i, (label, pretty_label, value) in enumerate(labels):
        if i < 3:
            j = 0
        else:
            j = 1
        i = i % 3
        data = [get_col(nosport, label, value),
                get_col(sport, label, value),
                get_col(rnd, label, value)]
        if not value:
            axes[j, i].boxplot(data, showfliers=False, widths=.7)
        else:
            print data
            axes[j, i].bar(1 + np.arange(3), data, align='center', width=.7)
        axes[j, i].set_ylabel(pretty_label, size=10)
        axes[j, i].yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
                              alpha=0.5)
        xticklabels = ['match', 'exercise', 'random']
        axes[j, i].set_xticklabels(xticklabels, rotation=90)
    axes[1, 2].axis('off')
    f.tight_layout()
    plt.savefig('matches.pdf', bbox_inches='tight')
random_stats = read_stats('stats/random_users_stats')
nosport_stats = read_stats('stats/nosport_users_stats')
sport_stats = read_stats('stats/sport_users_stats')
plot_stats(nosport_stats, sport_stats, random_stats)