Reproduce the results of our AAAI paper.
This notebook assumes the data is in place. You can get the data either by running data_collection.ipynb or by running the next cell, which downloads it.
In [24]:
# Download Twitter data from the server if not already present.
import os
import urllib

for fname in ['username2brand.pkl', 'brand2counts.pkl', 'id2brand.pkl', 'brands.json']:
    if not os.path.isfile('../data/' + fname):
        url = 'http://tapi.cs.iit.edu/data/aaai-2015-demographics/originals/' + fname
        print 'downloading %s to %s' % (url, '../data/' + fname)
        urllib.urlretrieve(url, '../data/' + fname)
    else:
        print fname, 'already exists.'
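Optionally, a quick check that all four files are now in place:
In [ ]:
# Optional sanity check: confirm the files landed in ../data/.
import os
for fname in ['username2brand.pkl', 'brand2counts.pkl', 'id2brand.pkl', 'brands.json']:
    path = '../data/' + fname
    print path, os.path.getsize(path) if os.path.isfile(path) else 'MISSING'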
In [8]:
# Unpickle everything.
import pickle

# id2brand: brand Twitter ID -> brand metadata, including a 'demo' dict of
#   demographic label -> percentage string (used below).
# brand2counts: brand Twitter ID -> {friend Twitter ID: count}.
# username2brand: brand screen name -> brand metadata.
id2brand = pickle.load(open('../data/id2brand.pkl', 'rb'))
brand2counts = pickle.load(open('../data/brand2counts.pkl', 'rb'))
username2brand = pickle.load(open('../data/username2brand.pkl', 'rb'))
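A quick look at what we just loaded (the structure shown here is inferred from how these objects are used in the cells below):
In [ ]:
# Sanity check; a sketch, with field structure inferred from later cells.
print '%d brands' % len(brand2counts)
bid = brand2counts.keys()[0]
print 'example friend counts:', dict(brand2counts[bid].items()[:3])
print 'demographics for this brand:', id2brand[bid]['demo']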
In [9]:
# Plot descriptive stats of the data.
import numpy as np
import matplotlib.pyplot as plt

def plot_data_figs():
    figure, axes = plt.subplots(2, 1, sharex=True)
    unique_friends = sorted([len(d.keys()) for d in brand2counts.values()], reverse=True)
    axes[0].plot(unique_friends)
    axes[0].set_xscale('log')
    axes[0].set_yscale('log')
    axes[0].set_title('number of unique neighbors', size=16)
    brcounts = sorted([sum(d.values()) for d in brand2counts.values()], reverse=True)
    print 'total friend links:', sum(brcounts)
    axes[1].plot(brcounts)
    axes[1].set_xscale('log')
    axes[1].set_yscale('log')
    axes[1].set_title('number of neighbor links', size=16)
    axes[1].set_xlim((0, 1500))
    axes[1].set_xlabel('rank', size=14)
    axes[1].set_ylabel(' ' * 30 + 'count', size=14)
    figure.tight_layout()
    plt.savefig('data.pdf', bbox_inches='tight')

plot_data_figs()
In [25]:
# Create a sparse matrix of friend counts (one row per brand).
import numpy as np
from numpy import array as npa
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import scale  # used later when training on scaled outputs

brand_ids = npa(brand2counts.keys())
vec = DictVectorizer()
X = vec.fit_transform(brand2counts.itervalues())
print 'The feature vector for one brand looks like this:\n', X[0]
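To see what DictVectorizer is doing, here is a minimal sketch on made-up friend IDs:
In [ ]:
# A minimal sketch of DictVectorizer on toy data; the friend IDs are made up.
toy = [{111: 2, 222: 1},   # brand A: counts for two friend accounts
       {222: 3}]           # brand B: a count for one friend account
toy_vec = DictVectorizer()
toy_X = toy_vec.fit_transform(toy)
print toy_vec.get_feature_names()  # [111, 222]
print toy_X.toarray()              # [[ 2.  1.] [ 0.  3.]]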
In [26]:
# Normalize each row by its sum, so each brand's vector is a distribution
# over friend accounts.
from sklearn.preprocessing import normalize

print '%d total friend links' % X.sum()
X = normalize(X, norm='l1', axis=1)
print 'The normalized feature vector for one brand looks like this:\n', X[0]
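For illustration, L1 normalization on a single toy row:
In [ ]:
# What L1 row normalization does, on a toy row (values made up).
print normalize(npa([[2., 1., 1.]]), norm='l1', axis=1)  # [[ 0.5   0.25  0.25]]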
In [12]:
# Do k-fold cross-validation for each demographic category.
%pylab inline --no-import-all
from scipy.stats import pearsonr
from sklearn.cross_validation import KFold
from sklearn.linear_model import ElasticNet, MultiTaskElasticNet, Ridge
from sklearn.metrics import mean_squared_error

feats = npa(vec.get_feature_names())

def plot_scatter(preds, truths, ylabels):
    for yi, ylabel in enumerate(ylabels):
        pr = [p[yi] for p in preds]
        tr = [t[yi] for t in truths]
        plt.figure()
        plt.scatter(tr, pr)
        plt.xlabel('truth')
        plt.ylabel('pred')
        corr = pearsonr(pr, tr)
        plt.title('%s r=%.2f (%.2g)' % (ylabel, corr[0], corr[1]))
        plt.show()

def print_top_feats(m, feature_names, labels, n=10):
    for yi, ylabel in enumerate(labels):
        print 'Top Coefficients for', ylabel
        coef = m.coef_[yi]
        srted = np.argsort(coef)
        topi = srted[::-1][:n]
        boti = srted[:n]
        print 'pos: ' + ' '.join('%s (%.2g)' % (f, c) for f, c in zip(feature_names[topi], coef[topi]))
        print 'neg: ' + ' '.join('%s (%.2g)' % (f, c) for f, c in zip(feature_names[boti], coef[boti]))

def get_yvalues(ylabels, demo):
    # Demographic values are strings like '42%'; strip the trailing '%'.
    return npa([float(demo[yl][:-1]) for yl in ylabels])

def get_correlations(preds, truths, ylabels):
    results = []
    for i, y in enumerate(ylabels):
        pr = [p[i] for p in preds]
        tr = [t[i] for t in truths]
        results.append(pearsonr(pr, tr)[0])
    return results

correlations = []
category_results = {}
outputs = {'Education': ['No College', 'College', 'Grad School'],
           'Children': ['No Kids', 'Has Kids'],
           'Income': ['$0-50k', '$50-100k', '$100-150k', '$150k+'],
           'Gender': ['Male', 'Female'],
           'Age': ['18-24', '25-34', '35-44', '45-54', '55-64', '65+'],
           'Ethnicity': ['Caucasian', 'Hispanic', 'African American', 'Asian']}

def get_model():
    # return Ridge(.1)
    # return ElasticNet(alpha=1e-5, l1_ratio=0.5)
    return MultiTaskElasticNet(alpha=1e-5, l1_ratio=0.5)

# Labels are grouped per category for use by MultiTaskElasticNet.
for category, ylabels in outputs.items():
    # Keep only brands that have values for every label in this category.
    indices = [i for i, bid in enumerate(brand_ids)
               if len(set(ylabels) & set(id2brand[bid]['demo'].keys())) == len(ylabels)]
    print 'predicting', ylabels, 'for', len(indices), 'brands'
    y = npa([get_yvalues(ylabels, id2brand[brand_ids[i]]['demo']) for i in indices])
    thisX = X[indices].toarray()
    cv = KFold(len(y), 5, shuffle=True, random_state=123456)
    preds = []
    truths = []
    for train, test in cv:
        m = get_model()
        m.fit(thisX[train], y[train])
        pred = m.predict(thisX[test])
        preds.extend(pred)
        truths.extend(y[test])
    # Refit on all data to report coefficients.
    m = get_model()
    m.fit(thisX, y)
    category_results[category] = {'preds': preds, 'truths': truths, 'model': m}
    plot_scatter(preds, truths, ylabels)
    print_top_feats(m, feats, ylabels)
    correlations.append(np.mean(get_correlations(preds, truths, ylabels)))

print 'average correlation=', np.mean(correlations)
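As a minimal sketch (not one of the paper's experiments), a trained category model can be applied to a new account by vectorizing its friend counts the same way; the friend IDs below are arbitrary entries from the fitted vocabulary:
In [ ]:
# A sketch of applying a trained model to a new account: build a friend-count
# dict keyed by IDs the vectorizer knows, vectorize, L1-normalize, predict.
new_friends = {feats[0]: 2, feats[1]: 1}  # two arbitrary known friend IDs
x_new = normalize(vec.transform([new_friends]), norm='l1', axis=1)
m_age = category_results['Age']['model']
print zip(outputs['Age'], m_age.predict(x_new.toarray())[0])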
In [13]:
# Plot scatters of predicted vs. true values for all categories.
import math

def nrmsd(truths, preds):
    """Normalized root mean squared deviation."""
    return rmsd(truths, preds) / (max(truths) - min(truths))

def rmsd(truths, preds):
    """Root mean squared deviation."""
    return math.sqrt(mean_squared_error(preds, truths))

def plot_scatter_subfig(axis, category, yidx):
    results = category_results[category]
    name = outputs[category][yidx]
    preds = [p[yidx] for p in results['preds']]
    truths = [t[yidx] for t in results['truths']]
    # Overlay a least-squares fit line on the scatter.
    fit = np.polyfit(truths, preds, 1)
    fit_fn = np.poly1d(fit)
    axis.plot(truths, preds, 'o', truths, fit_fn(truths), 'k', linewidth=1.5,
              ms=2, markerfacecolor='None', markeredgecolor='b')
    axis.set_title('%s\n$r=%.2f$' % (name, pearsonr(preds, truths)[0]), size=14)
    axis.locator_params(nbins=4, tight=True)

def make_scatters_fig():
    figure, axes = plt.subplots(3, 7, figsize=(15, 8))
    # Row 1: Education and Income.
    plot_scatter_subfig(axes[0][0], 'Education', 0)
    plot_scatter_subfig(axes[0][1], 'Education', 1)
    plot_scatter_subfig(axes[0][2], 'Education', 2)
    plot_scatter_subfig(axes[0][3], 'Income', 0)
    plot_scatter_subfig(axes[0][4], 'Income', 1)
    plot_scatter_subfig(axes[0][5], 'Income', 2)
    plot_scatter_subfig(axes[0][6], 'Income', 3)
    # Row 2: Age.
    for i in range(6):
        plot_scatter_subfig(axes[1][i], 'Age', i)
    # Row 3: Ethnicity, Gender, Children.
    for i in range(4):
        plot_scatter_subfig(axes[2][i], 'Ethnicity', i)
    plot_scatter_subfig(axes[2][4], 'Gender', 0)
    plot_scatter_subfig(axes[2][5], 'Children', 0)
    # Hide unused axes and add category titles.
    axes[1, 6].axis('off')
    axes[2, 6].axis('off')
    axes[0, 1].text(.5, 1.35, 'Education',
                    verticalalignment='bottom', horizontalalignment='center',
                    color='black', fontsize=18, weight='bold', transform=axes[0, 1].transAxes)
    axes[0, 4].text(1.1, 1.35, 'Income',
                    verticalalignment='bottom', horizontalalignment='center',
                    color='black', fontsize=18, weight='bold', transform=axes[0, 4].transAxes)
    axes[1, 2].text(1.1, 1.3, 'Age',
                    verticalalignment='bottom', horizontalalignment='center',
                    color='black', fontsize=18, weight='bold', transform=axes[1, 2].transAxes)
    axes[2, 1].text(1.1, 1.32, 'Ethnicity',
                    verticalalignment='bottom', horizontalalignment='center',
                    color='black', fontsize=18, weight='bold', transform=axes[2, 1].transAxes)
    axes[2, 4].text(.5, 1.32, 'Gender',
                    verticalalignment='bottom', horizontalalignment='center',
                    color='black', fontsize=18, weight='bold', transform=axes[2, 4].transAxes)
    axes[2, 5].text(.5, 1.32, 'Family',
                    verticalalignment='bottom', horizontalalignment='center',
                    color='black', fontsize=18, weight='bold', transform=axes[2, 5].transAxes)
    axes[1][0].set_ylabel('Predicted Value (%)', size=18)
    plt.subplots_adjust(hspace=.7)
    plt.figtext(0.5, .08, 'True Value (%)', fontdict={'fontsize': 18},
                verticalalignment='top', horizontalalignment='center')
    plt.savefig('scatters.pdf', bbox_inches='tight')

make_scatters_fig()
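A quick check of the error metrics defined above on toy values:
In [ ]:
# The error metrics above on made-up values.
print rmsd([1., 2., 3.], [1.5, 2., 2.5])   # ~0.41
print nrmsd([1., 2., 3.], [1.5, 2., 2.5])  # ~0.20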
In [15]:
# Print the top features (friend accounts) per demographic value.
from collections import defaultdict
from twutil import collect

def get_top_user_ids():
    id_list = []
    top_user_ids = defaultdict(lambda: defaultdict(list))
    for category in category_results:
        results = category_results[category]
        coef = results['model'].coef_
        for yi, ylabel in enumerate(outputs[category]):
            topi = np.argsort(coef[yi])[::-1][:5]
            print category, ylabel, ' '.join('%d' % x for x in feats[topi])
            id_list.extend(feats[topi])
            top_user_ids[category][ylabel] = feats[topi]
    return top_user_ids, id_list

def get_top_user_names():
    # Map the top feature IDs back to Twitter screen names.
    top_user_ids, id_list = get_top_user_ids()
    user_names = collect.lookup_handles(id_list)
    id2user = dict([(int(x[1]), x[0]) for x in user_names])
    for category in top_user_ids:
        for label in top_user_ids[category]:
            top_user_ids[category][label] = [id2user[x] for x in top_user_ids[category][label]
                                             if x in id2user]
    return top_user_ids

top_users = get_top_user_names()
In [16]:
# Write a LaTeX table of the top accounts per demographic value.
import re

def list2row(mylist, fmt='%s'):
    return ' & '.join([fmt % i for i in mylist])

def verb(s, delim=';'):
    return '\\verb' + delim + s + delim

def clean(s):
    # Escape characters that are special in LaTeX.
    return re.sub('_', '\\_', re.sub('\$', '\\$', s))

def make_user_table(top_users):
    outf = open('users.tex', 'wt')
    outf.write('\\begin{table*}[t]\n\\centering\n\\begin{tabular}{|c|c|l|}\n\\hline\n')
    outf.write(list2row(['{\\bf Category}', '{\\bf Value}', '{\\bf Top Accounts}']) +
               '\\\\\n\\hline\n')
    for ci, category in enumerate(outputs):
        for li, label in enumerate(outputs[category]):
            row = [''] * 3
            row[0] = category if li == 0 else ''
            row[1] = clean(label)
            row[2] = ', '.join(clean(x) for x in top_users[category][label])
            outf.write(list2row(row) + '\\\\\n')
        outf.write('\\hline\n')
    outf.write('\\end{tabular}\\caption{Accounts with the highest estimated coefficients '
               'for each category.\\label{tab.users}}\n\\end{table*}\n')
    outf.close()

make_user_table(top_users)
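A quick check of the LaTeX helpers on toy strings:
In [ ]:
# The LaTeX helpers above on made-up inputs.
print clean('$50-100k_test')  # \$50-100k\_test
print list2row(['a', 'b'])    # a & b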
In [17]:
!cat users.tex
Comparison with supervised learning (logistic regression)
We manually labeled individual Twitter users by race and gender to measure the accuracy of the model trained above. For comparison, we also train a supervised logistic regression classifier on the same feature vectors.
Because the labeled data contains personally identifiable information, we have elected not to share it publicly. Please contact the authors to discuss possible data sharing agreements.
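For reference, the expected file format (inferred from read_labeled_data below) is one user per line: a username, a label, then the user's friend IDs; the example line here is entirely fabricated.
In [ ]:
# Expected format of ../data/race.txt and ../data/gender.txt, inferred from
# read_labeled_data below. This line is fabricated: a username, a label, and
# at least nine friend IDs.
print 'some_user white 11 22 33 44 55 66 77 88 99'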
In [20]:
# Compute accuracy on users manually labeled by race.
from collections import Counter
import random
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.utils.extmath import safe_sparse_dot

def train_demo_model(ylabels):
    indices = [i for i, bid in enumerate(brand_ids)
               if len(set(ylabels) & set(id2brand[bid]['demo'].keys())) == len(ylabels)]
    print 'training model on', len(indices), 'brands'
    y = npa([get_yvalues(ylabels, id2brand[brand_ids[i]]['demo']) for i in indices])
    thisX = X[indices].toarray()
    m = get_model()
    m.fit(thisX, scale(y))
    # Keep at most the first three coefficient rows (a no-op for three or
    # fewer labels, as used here).
    m.coef_ = m.coef_[0:3]
    return m

def map_race_label(label):
    return ['white', 'latin', 'black', 'asian'].index(label)

def read_labeled_data(fname, label_map_f):
    # Each line: username, label, then the user's friend IDs.
    users = []
    labels = []
    friends = []
    for line in open(fname):
        parts = line.strip().split()
        if len(parts) > 10:  # require at least nine friend IDs
            users.append(parts[0])
            labels.append(label_map_f(parts[1]))
            friends.append(Counter([int(x) for x in parts[2:]]))
    X_race = vec.transform(friends)
    return users, npa(labels), X_race

def label_by_reg(X_race, m):
    """ Scale coefficients per class to make them comparable;
    then keep only positive coefficients. """
    coef = scale(m.coef_, axis=0)  # scale each feature across class labels (copies)
    coef[coef < 0] = 0.            # keep only positive coefficients
    pred = safe_sparse_dot(coef, X_race.T, dense_output=True).T
    return np.argmax(pred, axis=1)

def label_by_clf(X_race, y_race, pct):
    # Supervised baseline: logistic regression with 3-fold CV, trained on a
    # random fraction pct of each training fold.
    clf = LogisticRegression()
    cv = KFold(len(y_race), 3, shuffle=True, random_state=123456)
    preds = np.zeros(len(y_race), int)
    for train, test in cv:
        train = random.sample(train, int(len(train) * pct))
        clf.fit(X_race[train], y_race[train])
        preds[test] = clf.predict(X_race[test])
    return preds

def eval_labeled(truth, pred, labels):
    label_idx = np.arange(len(labels))
    acc, f1 = (accuracy_score(truth, pred),
               f1_score(truth, pred, labels=label_idx,
                        average='macro', pos_label=None))
    print 'acc=', acc, 'f1=', f1
    print confusion_matrix(truth, pred)
    return f1

def do_race_expt():
    labels = ['Caucasian', 'Hispanic', 'African American', 'Asian']
    users_race, y_race, X_race = read_labeled_data('../data/race.txt', map_race_label)
    # Drop the Asian class before evaluation.
    X_race = X_race[np.where(y_race != 3)]
    y_race = y_race[np.where(y_race != 3)]
    print 'X_race shape=', str(X_race.get_shape()), 'total matches=', X_race.sum()
    labels = labels[0:3]
    reg = train_demo_model(labels)
    pred_reg = label_by_reg(X_race, reg)
    reg_f1 = eval_labeled(y_race, pred_reg, labels)
    clf_f1s = []
    for pct in [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1.]:
        pred_clf = label_by_clf(X_race, y_race, pct)
        clf_f1s.append(eval_labeled(y_race, pred_clf, labels))
    return reg_f1, clf_f1s

race_results = do_race_expt()
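The intuition behind label_by_reg, on a made-up toy example: after zeroing negative coefficients, each user is assigned the class whose remaining coefficients overlap most with the user's friend vector.
In [ ]:
# Toy illustration of the label_by_reg idea; all values are made up.
toy_coef = npa([[1.2, 0., 0.],    # class 0: feature 0 matters
                [0., 0.8, 0.4]])  # class 1: features 1 and 2 matter
toy_x = npa([[0., 1., 1.]])       # a user who follows accounts 1 and 2
print np.argmax(np.dot(toy_x, toy_coef.T), axis=1)  # [1]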
In [21]:
# Compute accuracy on users manually labeled by gender.
def map_gender_label(label):
    return ['Male', 'Female'].index(label)

def do_gender_expt():
    labels = ['Male', 'Female']
    users_gender, y_gender, X_gender = read_labeled_data('../data/gender.txt', map_gender_label)
    print 'X_gender shape=', str(X_gender.get_shape()), 'total matches=', X_gender.sum()
    reg = train_demo_model(labels)
    pred_reg = label_by_reg(X_gender, reg)
    reg_f1 = eval_labeled(y_gender, pred_reg, labels)
    clf_f1s = []
    for pct in [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1.]:
        pred_clf = label_by_clf(X_gender, y_gender, pct)
        clf_f1s.append(eval_labeled(y_gender, pred_clf, labels))
    return reg_f1, clf_f1s

gender_results = do_gender_expt()
In [22]:
# Plot regression vs. supervised classification F1.
def plot_labeled_results(reg_results, clf_results, xticks, axis, title):
    axis.plot(xticks, [reg_results] * len(clf_results), 'g--', label='regression', lw=3)
    axis.plot(xticks, clf_results, 'bo-', label='classification')
    axis.set_title(title, size=16)

def make_labeled_plot(gender_results, race_results):
    xticks = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    figure, axes = plt.subplots(2, 1, sharex=True)
    plot_labeled_results(gender_results[0], gender_results[1], xticks, axes[0], 'Gender')
    plot_labeled_results(race_results[0], race_results[1], xticks, axes[1], 'Ethnicity')
    axes[1].set_ylabel((' ' * 25) + 'Macro F1', size=16)
    axes[1].set_xlabel('% of labeled training data', size=16)
    axes[1].legend(loc='lower right')
    plt.savefig('labeled.pdf', bbox_inches='tight')

make_labeled_plot(gender_results, race_results)
In [23]:
# Plot F1 as the number of friends per user increases.
import random

def sample_friends(X, n):
    # Randomly keep at most n nonzero entries (friends) per row.
    X_sample = X.copy()
    for i, xi in enumerate(X_sample):
        nnz = xi.getnnz()
        if n < nnz:
            nzcols = xi.nonzero()[1]
            indices = random.sample(range(nnz), nnz - n)
            X_sample[i, nzcols[indices]] = 0.
    X_sample.eliminate_zeros()
    return X_sample

def _do_nfriends_expt(XX, y, m, labels):
    ys = []
    stderrs = []
    xs = [1, 2, 3, 4, 5, 10, 20, 30, 40, 50]
    for nfriends in xs:
        f1s = []
        for sample in range(5):
            X_sample = sample_friends(XX, nfriends)
            pred_reg = label_by_reg(X_sample, m)
            reg_f1 = eval_labeled(y, pred_reg, labels)
            f1s.append(reg_f1)
        ys.append(np.mean(f1s))
        stderrs.append(np.std(f1s) / math.sqrt(len(f1s)))
    return npa(xs), npa(ys), npa(stderrs)

def do_nfriends_expt():
    random.seed(1234)
    labels = ['Caucasian', 'Hispanic', 'African American', 'Asian']
    users_race, y_race, X_race = read_labeled_data('../data/race.txt', map_race_label)
    labels = labels[:3]
    m = train_demo_model(labels)
    xs_r, ys_r, stderrs_r = _do_nfriends_expt(X_race, y_race, m, labels)
    labels = ['Male', 'Female']
    users_gender, y_gender, X_gender = read_labeled_data('../data/gender.txt', map_gender_label)
    m = train_demo_model(labels)
    xs_g, ys_g, stderrs_g = _do_nfriends_expt(X_gender, y_gender, m, labels)
    figure, axes = plt.subplots(2, 1, sharex=True)
    axes[0].plot(xs_g, ys_g, 'bo-', ms=3)
    axes[0].fill_between(xs_g, ys_g - stderrs_g, ys_g + stderrs_g, alpha=0.4, facecolor='b')
    axes[0].set_title('Gender', size=16)
    axes[1].plot(xs_r, ys_r, 'bo-', ms=3)
    axes[1].fill_between(xs_r, ys_r - stderrs_r, ys_r + stderrs_r, alpha=0.4, facecolor='b')
    axes[1].set_title('Ethnicity', size=16)
    axes[1].set_xlabel('# of friends per user', size=16)
    axes[1].set_ylabel((' ' * 25) + 'Macro F1', size=16)
    plt.savefig('friends.pdf', bbox_inches='tight')

do_nfriends_expt()
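A toy check of sample_friends: each row of a small sparse matrix keeps at most two nonzero entries after sampling.
In [ ]:
# Toy check of sample_friends on made-up data.
from scipy.sparse import csr_matrix
random.seed(0)
toy = csr_matrix(npa([[1., 2., 3., 4.],
                      [5., 0., 0., 6.]]))
print sample_friends(toy, 2).toarray()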