Reproduce the results of our AAAI paper.
This notebook assumes the data is in place. You can get the data either by running data_collection.ipynb or by running the next cell, which downloads it.
In [24]:
# Download Twitter data from the server if not already present.
import os
import urllib

for fname in ['username2brand.pkl', 'brand2counts.pkl', 'id2brand.pkl', 'brands.json']:
    if not os.path.isfile('../data/' + fname):
        url = 'http://tapi.cs.iit.edu/data/aaai-2015-demographics/originals/' + fname
        print 'downloading %s to %s' % (url, '../data/' + fname)
        urllib.urlretrieve(url, '../data/' + fname)
    else:
        print fname, 'already exists.'
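Optionally, a quick check that all four files are now in place:
In [ ]:
# Optional sanity check: confirm the files landed in ../data/.
import os
for fname in ['username2brand.pkl', 'brand2counts.pkl', 'id2brand.pkl', 'brands.json']:
    path = '../data/' + fname
    print path, os.path.getsize(path) if os.path.isfile(path) else 'MISSING'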
In [8]:
# Unpickle everything.
import pickle

# id2brand: brand Twitter ID -> brand metadata, including a 'demo' dict of
#   demographic label -> percentage string (used below).
# brand2counts: brand Twitter ID -> {friend Twitter ID: count}.
# username2brand: brand screen name -> brand metadata.
id2brand = pickle.load(open('../data/id2brand.pkl', 'rb'))
brand2counts = pickle.load(open('../data/brand2counts.pkl', 'rb'))
username2brand = pickle.load(open('../data/username2brand.pkl', 'rb'))
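A quick look at what we just loaded (the structure shown here is inferred from how these objects are used in the cells below):
In [ ]:
# Sanity check; a sketch, with field structure inferred from later cells.
print '%d brands' % len(brand2counts)
bid = brand2counts.keys()[0]
print 'example friend counts:', dict(brand2counts[bid].items()[:3])
print 'demographics for this brand:', id2brand[bid]['demo']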
In [9]:
# Plot descriptive stats of the data.
import numpy as np
import matplotlib.pyplot as plt

def plot_data_figs():
    figure, axes = plt.subplots(2, 1, sharex=True)
    unique_friends = sorted([len(d.keys()) for d in brand2counts.values()], reverse=True)
    axes[0].plot(unique_friends)
    axes[0].set_xscale('log')
    axes[0].set_yscale('log')
    axes[0].set_title('number of unique neighbors', size=16)
    brcounts = sorted([sum(d.values()) for d in brand2counts.values()], reverse=True)
    print 'total friend links:', sum(brcounts)
    axes[1].plot(brcounts)
    axes[1].set_xscale('log')
    axes[1].set_yscale('log')
    axes[1].set_title('number of neighbor links', size=16)
    axes[1].set_xlim((0, 1500))
    axes[1].set_xlabel('rank', size=14)
    axes[1].set_ylabel(' ' * 30 + 'count', size=14)
    figure.tight_layout()
    plt.savefig('data.pdf', bbox_inches='tight')

plot_data_figs()
In [25]:
# Create a sparse matrix of friend counts (one row per brand).
import numpy as np
from numpy import array as npa
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import scale  # used later when training on scaled outputs

brand_ids = npa(brand2counts.keys())
vec = DictVectorizer()
X = vec.fit_transform(brand2counts.itervalues())
print 'The feature vector for one brand looks like this:\n', X[0]
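To see what DictVectorizer is doing, here is a minimal sketch on made-up friend IDs:
In [ ]:
# A minimal sketch of DictVectorizer on toy data; the friend IDs are made up.
toy = [{111: 2, 222: 1},   # brand A: counts for two friend accounts
       {222: 3}]           # brand B: a count for one friend account
toy_vec = DictVectorizer()
toy_X = toy_vec.fit_transform(toy)
print toy_vec.get_feature_names()  # [111, 222]
print toy_X.toarray()              # [[ 2.  1.] [ 0.  3.]]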
In [26]:
# Normalize each row by its sum, so each brand's vector is a distribution
# over friend accounts.
from sklearn.preprocessing import normalize

print '%d total friend links' % X.sum()
X = normalize(X, norm='l1', axis=1)
print 'The normalized feature vector for one brand looks like this:\n', X[0]
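For illustration, L1 normalization on a single toy row:
In [ ]:
# What L1 row normalization does, on a toy row (values made up).
print normalize(npa([[2., 1., 1.]]), norm='l1', axis=1)  # [[ 0.5   0.25  0.25]]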
In [12]:
# Do k-fold cross-validation for each demographic category.
%pylab inline --no-import-all
from scipy.stats import pearsonr
from sklearn.cross_validation import KFold
from sklearn.linear_model import ElasticNet, MultiTaskElasticNet, Ridge
from sklearn.metrics import mean_squared_error

feats = npa(vec.get_feature_names())

def plot_scatter(preds, truths, ylabels):
    for yi, ylabel in enumerate(ylabels):
        pr = [p[yi] for p in preds]
        tr = [t[yi] for t in truths]
        plt.figure()
        plt.scatter(tr, pr)
        plt.xlabel('truth')
        plt.ylabel('pred')
        corr = pearsonr(pr, tr)
        plt.title('%s r=%.2f (%.2g)' % (ylabel, corr[0], corr[1]))
        plt.show()

def print_top_feats(m, feature_names, labels, n=10):
    for yi, ylabel in enumerate(labels):
        print 'Top Coefficients for', ylabel
        coef = m.coef_[yi]
        srted = np.argsort(coef)
        topi = srted[::-1][:n]
        boti = srted[:n]
        print 'pos: ' + ' '.join('%s (%.2g)' % (f, c) for f, c in zip(feature_names[topi], coef[topi]))
        print 'neg: ' + ' '.join('%s (%.2g)' % (f, c) for f, c in zip(feature_names[boti], coef[boti]))

def get_yvalues(ylabels, demo):
    # Demographic values are strings like '42%'; strip the trailing '%'.
    return npa([float(demo[yl][:-1]) for yl in ylabels])

def get_correlations(preds, truths, ylabels):
    results = []
    for i, y in enumerate(ylabels):
        pr = [p[i] for p in preds]
        tr = [t[i] for t in truths]
        results.append(pearsonr(pr, tr)[0])
    return results

correlations = []
category_results = {}
outputs = {'Education': ['No College', 'College', 'Grad School'],
           'Children': ['No Kids', 'Has Kids'],
           'Income': ['$0-50k', '$50-100k', '$100-150k', '$150k+'],
           'Gender': ['Male', 'Female'],
           'Age': ['18-24', '25-34', '35-44', '45-54', '55-64', '65+'],
           'Ethnicity': ['Caucasian', 'Hispanic', 'African American', 'Asian']}

def get_model():
    # return Ridge(.1)
    # return ElasticNet(alpha=1e-5, l1_ratio=0.5)
    return MultiTaskElasticNet(alpha=1e-5, l1_ratio=0.5)

# Labels are grouped per category for use by MultiTaskElasticNet.
for category, ylabels in outputs.items():
    # Keep only brands that have values for every label in this category.
    indices = [i for i, bid in enumerate(brand_ids)
               if len(set(ylabels) & set(id2brand[bid]['demo'].keys())) == len(ylabels)]
    print 'predicting', ylabels, 'for', len(indices), 'brands'
    y = npa([get_yvalues(ylabels, id2brand[brand_ids[i]]['demo']) for i in indices])
    thisX = X[indices].toarray()
    cv = KFold(len(y), 5, shuffle=True, random_state=123456)
    preds = []
    truths = []
    for train, test in cv:
        m = get_model()
        m.fit(thisX[train], y[train])
        pred = m.predict(thisX[test])
        preds.extend(pred)
        truths.extend(y[test])
    # Refit on all data to report coefficients.
    m = get_model()
    m.fit(thisX, y)
    category_results[category] = {'preds': preds, 'truths': truths, 'model': m}
    plot_scatter(preds, truths, ylabels)
    print_top_feats(m, feats, ylabels)
    correlations.append(np.mean(get_correlations(preds, truths, ylabels)))

print 'average correlation=', np.mean(correlations)
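As a minimal sketch (not one of the paper's experiments), a trained category model can be applied to a new account by vectorizing its friend counts the same way; the friend IDs below are arbitrary entries from the fitted vocabulary:
In [ ]:
# A sketch of applying a trained model to a new account: build a friend-count
# dict keyed by IDs the vectorizer knows, vectorize, L1-normalize, predict.
new_friends = {feats[0]: 2, feats[1]: 1}  # two arbitrary known friend IDs
x_new = normalize(vec.transform([new_friends]), norm='l1', axis=1)
m_age = category_results['Age']['model']
print zip(outputs['Age'], m_age.predict(x_new.toarray())[0])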
In [13]:
# Plot scatters of predicted vs. true values for all categories.
import math

def nrmsd(truths, preds):
    """Normalized root mean squared deviation."""
    return rmsd(truths, preds) / (max(truths) - min(truths))

def rmsd(truths, preds):
    """Root mean squared deviation."""
    return math.sqrt(mean_squared_error(preds, truths))

def plot_scatter_subfig(axis, category, yidx):
    results = category_results[category]
    name = outputs[category][yidx]
    preds = [p[yidx] for p in results['preds']]
    truths = [t[yidx] for t in results['truths']]
    # Overlay a least-squares fit line on the scatter.
    fit = np.polyfit(truths, preds, 1)
    fit_fn = np.poly1d(fit)
    axis.plot(truths, preds, 'o', truths, fit_fn(truths), 'k', linewidth=1.5,
              ms=2, markerfacecolor='None', markeredgecolor='b')
    axis.set_title('%s\n$r=%.2f$' % (name, pearsonr(preds, truths)[0]), size=14)
    axis.locator_params(nbins=4, tight=True)

def make_scatters_fig():
    figure, axes = plt.subplots(3, 7, figsize=(15, 8))
    # Row 1: Education and Income.
    plot_scatter_subfig(axes[0][0], 'Education', 0)
    plot_scatter_subfig(axes[0][1], 'Education', 1)
    plot_scatter_subfig(axes[0][2], 'Education', 2)
    plot_scatter_subfig(axes[0][3], 'Income', 0)
    plot_scatter_subfig(axes[0][4], 'Income', 1)
    plot_scatter_subfig(axes[0][5], 'Income', 2)
    plot_scatter_subfig(axes[0][6], 'Income', 3)
    # Row 2: Age.
    for i in range(6):
        plot_scatter_subfig(axes[1][i], 'Age', i)
    # Row 3: Ethnicity, Gender, Children.
    for i in range(4):
        plot_scatter_subfig(axes[2][i], 'Ethnicity', i)
    plot_scatter_subfig(axes[2][4], 'Gender', 0)
    plot_scatter_subfig(axes[2][5], 'Children', 0)
    # Hide unused axes and add category titles.
    axes[1, 6].axis('off')
    axes[2, 6].axis('off')
    axes[0, 1].text(.5, 1.35, 'Education',
                    verticalalignment='bottom', horizontalalignment='center',
                    color='black', fontsize=18, weight='bold', transform=axes[0, 1].transAxes)
    axes[0, 4].text(1.1, 1.35, 'Income',
                    verticalalignment='bottom', horizontalalignment='center',
                    color='black', fontsize=18, weight='bold', transform=axes[0, 4].transAxes)
    axes[1, 2].text(1.1, 1.3, 'Age',
                    verticalalignment='bottom', horizontalalignment='center',
                    color='black', fontsize=18, weight='bold', transform=axes[1, 2].transAxes)
    axes[2, 1].text(1.1, 1.32, 'Ethnicity',
                    verticalalignment='bottom', horizontalalignment='center',
                    color='black', fontsize=18, weight='bold', transform=axes[2, 1].transAxes)
    axes[2, 4].text(.5, 1.32, 'Gender',
                    verticalalignment='bottom', horizontalalignment='center',
                    color='black', fontsize=18, weight='bold', transform=axes[2, 4].transAxes)
    axes[2, 5].text(.5, 1.32, 'Family',
                    verticalalignment='bottom', horizontalalignment='center',
                    color='black', fontsize=18, weight='bold', transform=axes[2, 5].transAxes)
    axes[1][0].set_ylabel('Predicted Value (%)', size=18)
    plt.subplots_adjust(hspace=.7)
    plt.figtext(0.5, .08, 'True Value (%)', fontdict={'fontsize': 18},
                verticalalignment='top', horizontalalignment='center')
    plt.savefig('scatters.pdf', bbox_inches='tight')

make_scatters_fig()
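A quick check of the error metrics defined above on toy values:
In [ ]:
# The error metrics above on made-up values.
print rmsd([1., 2., 3.], [1.5, 2., 2.5])   # ~0.41
print nrmsd([1., 2., 3.], [1.5, 2., 2.5])  # ~0.20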
In [15]:
# Print the top features (friend accounts) per demographic value.
from collections import defaultdict
from twutil import collect

def get_top_user_ids():
    id_list = []
    top_user_ids = defaultdict(lambda: defaultdict(list))
    for category in category_results:
        results = category_results[category]
        coef = results['model'].coef_
        for yi, ylabel in enumerate(outputs[category]):
            topi = np.argsort(coef[yi])[::-1][:5]
            print category, ylabel, ' '.join('%d' % x for x in feats[topi])
            id_list.extend(feats[topi])
            top_user_ids[category][ylabel] = feats[topi]
    return top_user_ids, id_list

def get_top_user_names():
    # Map the top feature IDs back to Twitter screen names.
    top_user_ids, id_list = get_top_user_ids()
    user_names = collect.lookup_handles(id_list)
    id2user = dict([(int(x[1]), x[0]) for x in user_names])
    for category in top_user_ids:
        for label in top_user_ids[category]:
            top_user_ids[category][label] = [id2user[x] for x in top_user_ids[category][label]
                                             if x in id2user]
    return top_user_ids

top_users = get_top_user_names()
In [16]:
# Write a LaTeX table of the top accounts per demographic value.
import re

def list2row(mylist, fmt='%s'):
    return ' & '.join([fmt % i for i in mylist])

def verb(s, delim=';'):
    return '\\verb' + delim + s + delim

def clean(s):
    # Escape characters that are special in LaTeX.
    return re.sub('_', '\\_', re.sub('\$', '\\$', s))

def make_user_table(top_users):
    outf = open('users.tex', 'wt')
    outf.write('\\begin{table*}[t]\n\\centering\n\\begin{tabular}{|c|c|l|}\n\\hline\n')
    outf.write(list2row(['{\\bf Category}', '{\\bf Value}', '{\\bf Top Accounts}']) +
               '\\\\\n\\hline\n')
    for ci, category in enumerate(outputs):
        for li, label in enumerate(outputs[category]):
            row = [''] * 3
            row[0] = category if li == 0 else ''
            row[1] = clean(label)
            row[2] = ', '.join(clean(x) for x in top_users[category][label])
            outf.write(list2row(row) + '\\\\\n')
        outf.write('\\hline\n')
    outf.write('\\end{tabular}\\caption{Accounts with the highest estimated coefficients '
               'for each category.\\label{tab.users}}\n\\end{table*}\n')
    outf.close()

make_user_table(top_users)
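A quick check of the LaTeX helpers on toy strings:
In [ ]:
# The LaTeX helpers above on made-up inputs.
print clean('$50-100k_test')  # \$50-100k\_test
print list2row(['a', 'b'])    # a & b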
In [17]:
!cat users.tex
Comparison with supervised learning (logistic regression)
We manually labeled individual Twitter users by race and gender to measure the accuracy of the model trained above. For comparison, we also train a supervised logistic regression classifier on the same feature vectors.
Because the labeled data contains personally identifiable information, we have elected not to share it publicly. Please contact the authors to discuss possible data sharing agreements.
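For reference, the expected file format (inferred from read_labeled_data below) is one user per line: a username, a label, then the user's friend IDs; the example line here is entirely fabricated.
In [ ]:
# Expected format of ../data/race.txt and ../data/gender.txt, inferred from
# read_labeled_data below. This line is fabricated: a username, a label, and
# at least nine friend IDs.
print 'some_user white 11 22 33 44 55 66 77 88 99'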
In [20]:
# Compute accuracy on users manually labeled by race.
from collections import Counter
import random
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.utils.extmath import safe_sparse_dot

def train_demo_model(ylabels):
    indices = [i for i, bid in enumerate(brand_ids)
               if len(set(ylabels) & set(id2brand[bid]['demo'].keys())) == len(ylabels)]
    print 'training model on', len(indices), 'brands'
    y = npa([get_yvalues(ylabels, id2brand[brand_ids[i]]['demo']) for i in indices])
    thisX = X[indices].toarray()
    m = get_model()
    m.fit(thisX, scale(y))
    # Keep at most the first three coefficient rows (a no-op for three or
    # fewer labels, as used here).
    m.coef_ = m.coef_[0:3]
    return m

def map_race_label(label):
    return ['white', 'latin', 'black', 'asian'].index(label)

def read_labeled_data(fname, label_map_f):
    # Each line: username, label, then the user's friend IDs.
    users = []
    labels = []
    friends = []
    for line in open(fname):
        parts = line.strip().split()
        if len(parts) > 10:  # require at least nine friend IDs
            users.append(parts[0])
            labels.append(label_map_f(parts[1]))
            friends.append(Counter([int(x) for x in parts[2:]]))
    X_race = vec.transform(friends)
    return users, npa(labels), X_race

def label_by_reg(X_race, m):
    """ Scale coefficients per class to make them comparable;
    then keep only positive coefficients. """
    coef = scale(m.coef_, axis=0)  # scale each feature across class labels (copies)
    coef[coef < 0] = 0.            # keep only positive coefficients
    pred = safe_sparse_dot(coef, X_race.T, dense_output=True).T
    return np.argmax(pred, axis=1)

def label_by_clf(X_race, y_race, pct):
    # Supervised baseline: logistic regression with 3-fold CV, trained on a
    # random fraction pct of each training fold.
    clf = LogisticRegression()
    cv = KFold(len(y_race), 3, shuffle=True, random_state=123456)
    preds = np.zeros(len(y_race), int)
    for train, test in cv:
        train = random.sample(train, int(len(train) * pct))
        clf.fit(X_race[train], y_race[train])
        preds[test] = clf.predict(X_race[test])
    return preds

def eval_labeled(truth, pred, labels):
    label_idx = np.arange(len(labels))
    acc, f1 = (accuracy_score(truth, pred),
               f1_score(truth, pred, labels=label_idx,
                        average='macro', pos_label=None))
    print 'acc=', acc, 'f1=', f1
    print confusion_matrix(truth, pred)
    return f1

def do_race_expt():
    labels = ['Caucasian', 'Hispanic', 'African American', 'Asian']
    users_race, y_race, X_race = read_labeled_data('../data/race.txt', map_race_label)
    # Drop the Asian class before evaluation.
    X_race = X_race[np.where(y_race != 3)]
    y_race = y_race[np.where(y_race != 3)]
    print 'X_race shape=', str(X_race.get_shape()), 'total matches=', X_race.sum()
    labels = labels[0:3]
    reg = train_demo_model(labels)
    pred_reg = label_by_reg(X_race, reg)
    reg_f1 = eval_labeled(y_race, pred_reg, labels)
    clf_f1s = []
    for pct in [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1.]:
        pred_clf = label_by_clf(X_race, y_race, pct)
        clf_f1s.append(eval_labeled(y_race, pred_clf, labels))
    return reg_f1, clf_f1s

race_results = do_race_expt()
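The intuition behind label_by_reg, on a made-up toy example: after zeroing negative coefficients, each user is assigned the class whose remaining coefficients overlap most with the user's friend vector.
In [ ]:
# Toy illustration of the label_by_reg idea; all values are made up.
toy_coef = npa([[1.2, 0., 0.],    # class 0: feature 0 matters
                [0., 0.8, 0.4]])  # class 1: features 1 and 2 matter
toy_x = npa([[0., 1., 1.]])       # a user who follows accounts 1 and 2
print np.argmax(np.dot(toy_x, toy_coef.T), axis=1)  # [1]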
In [21]:
# Compute accuracy on users manually labeled by gender.
def map_gender_label(label):
    return ['Male', 'Female'].index(label)

def do_gender_expt():
    labels = ['Male', 'Female']
    users_gender, y_gender, X_gender = read_labeled_data('../data/gender.txt', map_gender_label)
    print 'X_gender shape=', str(X_gender.get_shape()), 'total matches=', X_gender.sum()
    reg = train_demo_model(labels)
    pred_reg = label_by_reg(X_gender, reg)
    reg_f1 = eval_labeled(y_gender, pred_reg, labels)
    clf_f1s = []
    for pct in [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1.]:
        pred_clf = label_by_clf(X_gender, y_gender, pct)
        clf_f1s.append(eval_labeled(y_gender, pred_clf, labels))
    return reg_f1, clf_f1s

gender_results = do_gender_expt()
In [22]:
# Plot regression vs. supervised classification F1.
def plot_labeled_results(reg_results, clf_results, xticks, axis, title):
    axis.plot(xticks, [reg_results] * len(clf_results), 'g--', label='regression', lw=3)
    axis.plot(xticks, clf_results, 'bo-', label='classification')
    axis.set_title(title, size=16)

def make_labeled_plot(gender_results, race_results):
    xticks = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    figure, axes = plt.subplots(2, 1, sharex=True)
    plot_labeled_results(gender_results[0], gender_results[1], xticks, axes[0], 'Gender')
    plot_labeled_results(race_results[0], race_results[1], xticks, axes[1], 'Ethnicity')
    axes[1].set_ylabel((' ' * 25) + 'Macro F1', size=16)
    axes[1].set_xlabel('% of labeled training data', size=16)
    axes[1].legend(loc='lower right')
    plt.savefig('labeled.pdf', bbox_inches='tight')

make_labeled_plot(gender_results, race_results)
In [23]:
# Plot F1 as the number of friends per user increases.
import random

def sample_friends(X, n):
    # Randomly keep at most n nonzero entries (friends) per row.
    X_sample = X.copy()
    for i, xi in enumerate(X_sample):
        nnz = xi.getnnz()
        if n < nnz:
            nzcols = xi.nonzero()[1]
            indices = random.sample(range(nnz), nnz - n)
            X_sample[i, nzcols[indices]] = 0.
    X_sample.eliminate_zeros()
    return X_sample

def _do_nfriends_expt(XX, y, m, labels):
    ys = []
    stderrs = []
    xs = [1, 2, 3, 4, 5, 10, 20, 30, 40, 50]
    for nfriends in xs:
        f1s = []
        for sample in range(5):
            X_sample = sample_friends(XX, nfriends)
            pred_reg = label_by_reg(X_sample, m)
            reg_f1 = eval_labeled(y, pred_reg, labels)
            f1s.append(reg_f1)
        ys.append(np.mean(f1s))
        stderrs.append(np.std(f1s) / math.sqrt(len(f1s)))
    return npa(xs), npa(ys), npa(stderrs)

def do_nfriends_expt():
    random.seed(1234)
    labels = ['Caucasian', 'Hispanic', 'African American', 'Asian']
    users_race, y_race, X_race = read_labeled_data('../data/race.txt', map_race_label)
    labels = labels[:3]
    m = train_demo_model(labels)
    xs_r, ys_r, stderrs_r = _do_nfriends_expt(X_race, y_race, m, labels)
    labels = ['Male', 'Female']
    users_gender, y_gender, X_gender = read_labeled_data('../data/gender.txt', map_gender_label)
    m = train_demo_model(labels)
    xs_g, ys_g, stderrs_g = _do_nfriends_expt(X_gender, y_gender, m, labels)
    figure, axes = plt.subplots(2, 1, sharex=True)
    axes[0].plot(xs_g, ys_g, 'bo-', ms=3)
    axes[0].fill_between(xs_g, ys_g - stderrs_g, ys_g + stderrs_g, alpha=0.4, facecolor='b')
    axes[0].set_title('Gender', size=16)
    axes[1].plot(xs_r, ys_r, 'bo-', ms=3)
    axes[1].fill_between(xs_r, ys_r - stderrs_r, ys_r + stderrs_r, alpha=0.4, facecolor='b')
    axes[1].set_title('Ethnicity', size=16)
    axes[1].set_xlabel('# of friends per user', size=16)
    axes[1].set_ylabel((' ' * 25) + 'Macro F1', size=16)
    plt.savefig('friends.pdf', bbox_inches='tight')

do_nfriends_expt()
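A toy check of sample_friends: each row of a small sparse matrix keeps at most two nonzero entries after sampling.
In [ ]:
# Toy check of sample_friends on made-up data.
from scipy.sparse import csr_matrix
random.seed(0)
toy = csr_matrix(npa([[1., 2., 3., 4.],
                      [5., 0., 0., 6.]]))
print sample_friends(toy, 2).toarray()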