Expert Voting

This notebook explores the results of the expert user study and generates the figures used for that section of the Brut paper.


In [1]:
%matplotlib inline

import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

from scipy import stats
from matplotlib.legend_handler import HandlerPathCollection
from matplotlib.collections import PathCollection
from matplotlib import rc_context
from bubbly.util import roc_curve

import matplotlib
matplotlib.rcParams['font.size'] = 10

def white_bg():
    return rc_context({'axes.facecolor': 'white'})

figures = {}

Load the data and aggregate the votes for each unique field.
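For reference, pd.get_dummies expands the 1/2/3 vote column into indicator columns; a toy illustration on a made-up series (not part of the original analysis):

In [ ]:
toy = pd.Series([1, 2, 3, 1], name='vote')
print pd.get_dummies(toy, prefix='v')
# one indicator column per category: v_1, v_2, v_3, with a single
# nonzero entry per row marking the voted category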


In [2]:
votes = pd.DataFrame(json.load(open('../data/expert_votes.json')))
votes = votes.join(pd.get_dummies(votes.vote, prefix='v'))

def mode(x):
    # most common vote for an image
    return stats.mode(x.values)[0][0]

def plurality(x):
    # fraction of votes agreeing with the most common vote
    return stats.mode(x.values)[1][0] / x.size

def zscore(x):
    # standardize to zero mean and unit variance
    return (x - x.mean()) / x.std()

# summary statistics of the expert votes for each unique image
meanvote = votes.groupby('image_id').vote.agg({'mean_vote': np.mean,
                                               'std_vote': np.std,
                                               'mode': mode,
                                               'plurality': plurality})
# fraction of votes in each category (1=non-bubble, 2=ambiguous, 3=bubble)
fractions = votes.groupby('image_id')[['v_1', 'v_2', 'v_3']].mean()

meanvote = pd.merge(meanvote, fractions, left_index=True, right_index=True)
meanvote = pd.merge(meanvote, votes[['hitrate', 'lon', 'lat', 'mlconf', 'image_id']],
                    right_on='image_id', left_index=True, how='left')
meanvote = meanvote.drop_duplicates(cols='image_id')

# the two scores have very different scales, so z-score each before summing
meanvote['joint'] = zscore(meanvote.mlconf) + zscore(meanvote.hitrate)

# objects without an MWP hit rate belong to the Brut (uniform) sample
brut = meanvote[meanvote.hitrate.isnull()]
mwp = meanvote[~meanvote.hitrate.isnull()]

In [3]:
print mwp.hitrate.mean(), mwp.hitrate.std()
print mwp.mlconf.mean(), mwp.mlconf.std()


0.214333333333 0.113798665913
0.100444444444 0.589302189634
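The two metrics have very different spreads (std ≈ 0.11 for the hit rate vs. ≈ 0.59 for the Brut score), which is why the joint metric z-scores each term first. A quick sanity check (a sketch, not in the original notebook):

In [ ]:
# after z-scoring, both metrics have mean 0 and unit variance,
# so neither dominates the joint sum
print zscore(mwp.hitrate).std(), zscore(mwp.mlconf).std()  # both ~1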

MWP Sample


In [4]:
def rank(x):
    ind = np.argsort(x)
    result = np.empty(x.size)
    result[ind] = np.arange(x.size)
    return result
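# (Added note: rank maps each value to its ordinal position, e.g.
#  rank(np.array([0.3, 0.1, 0.2])) -> array([ 2.,  0.,  1.]))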

def colorplot(df, x, y, hide_right=False, **kwargs):
    colors = {1: '#E31A1C', 2: '#FF7F00', 3: '#1F78B4'}
    labels = {1: "Non-bubble", 2: "Ambiguous/\nIrregular", 3:"Bubble"}
    
    # scatter points for each consensus category
    for i in [3, 2, 1]:
        sub = df[df['mode'] == i]  # bracket access avoids clashing with DataFrame.mode
        kwargs.setdefault('s', 70)
        plt.scatter(sub[x], sub[y], facecolors='none', lw=2, edgecolors=colors[i], label=labels[i], **kwargs)

    # essentially unregularized logistic fit of P(bubble) against the x-axis score
    clf = LogisticRegression(C=1e12)
    x = df[x].values.reshape(-1, 1)
    y = (df['mode'] == 3).values

    clf.fit(x, y)

    # total log-likelihood of the labels under the fit
    prob = clf.predict_log_proba(x)
    like = prob[y == 0, 0].sum() + prob[y == 1, 1].sum()
    
    xlo, xhi = x.min(), x.max()
    x = np.linspace(x.min() - .5 * x.ptp(), x.max() + .5 * x.ptp(), 100).reshape(-1, 1)
    y = clf.predict_proba(x)[:, 1]

    #plot
    ax1 = plt.gca()
    ax2 = ax1.twinx()
    plt.sca(ax2)
    plt.yticks([0, .25, .5, .75, 1])

    if hide_right:
        ax2.set_yticklabels([' ', ' ', ' ', ' ', ' '])
    else:
        ax2.set_ylabel("P(Bubble | Score)")

    plt.ylim(-.05, 1.05)
    plt.grid(False)

       
    plt.plot(x, y, color='#333333', lw=.8, label="P(Bubble | Score)")
    plt.title("Log-Likelihood: %0.1f" % like)            
    plt.sca(ax1)
    plt.xlim(xlo - .1 * (xhi - xlo), xhi + .1 * (xhi - xlo))
    return clf
        
def bubble_legend(**kwargs):
    
    # collect unique legend handles/labels across all axes in the figure
    h, l = [], []
    for ax in plt.gcf().axes:
        for hl in zip(*ax.get_legend_handles_labels()):
            if len(hl) == 2 and hl[1] not in l:
                h.append(hl[0])
                l.append(hl[1])
    
    kwargs.setdefault('bbox_to_anchor', (1, 0, 1, 1))
    hmap = {PathCollection: HandlerPathCollection(numpoints=1)}
    with white_bg():
        return plt.legend(h, l, handler_map=hmap, loc='upper left', **kwargs)

In [5]:
def stepplot(binx, bini, values, **kwargs):
    assert binx.size == values.size + 1
    ind = np.argsort(bini)
    x = np.hstack((binx[bini[ind]], binx.max()))
    y = np.hstack((values[ind], values[ind][-1]))
    plt.plot(x, y, drawstyle='steps-post', **kwargs)

Comparison of Hit Rate vs Brut Score for the MWP sample.

Both perform similarly well at separating true positives from false positives in the MWP catalog.
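To put a number on "similarly well", one could compare the areas under the ROC curves directly; a sketch using sklearn's roc_auc_score (not part of the original notebook):

In [ ]:
from sklearn.metrics import roc_auc_score

bubble = (mwp['mode'] == 3).values
for col in ['hitrate', 'mlconf', 'joint']:
    print col, roc_auc_score(bubble, mwp[col].values)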


In [6]:
fig = plt.figure(tight_layout=True, figsize=(7, 4.5))

axs = plt.subplot(231), plt.subplot(232), plt.subplot(234), plt.subplot(235)

plt.sca(axs[0])
colorplot(mwp, 'hitrate', 'plurality', hide_right=True, s=30)
plt.xlabel("Hit Rate")
plt.ylabel("Expert Consensus")
plt.annotate('a', (.5, .4), fontsize=20)
plt.yticks([.4, .7, 1])

plt.sca(axs[1])
colorplot(mwp, 'mlconf', 'plurality', s=30)
plt.xlabel("Brut Score")

plt.yticks([.4, .7, 1])
labels = ['' for _ in axs[1].get_yticklabels()]
axs[1].set_yticklabels(labels)
plt.annotate('b', (.8, .4), fontsize=20)


plt.sca(axs[2])
clf = colorplot(mwp, 'joint', 'plurality', hide_right=True, s=30)
plt.xlabel("Joint")
plt.ylabel("Expert Consensus")
plt.yticks([.4, .7, 1])
plt.annotate('c', (4, .4), fontsize=20)
print "Joint regression coef: %f\t intercept: %f" % (clf.coef_, clf.intercept_)

plt.sca(axs[3])
# idealized case: random labels with a perfectly separating score
dummy = mwp.copy()
dummy['mode'] = np.random.randint(0, 2, dummy.shape[0]) * 2 + 1
dummy['test'] = ((dummy['mode'] == 3).astype(np.int) - 0.5) + np.random.uniform(-.3, .3, dummy.shape[0])
dummy['plurality'] = 1

colorplot(dummy, 'test', 'plurality', s=30)
plt.ylim(.3, 1.1)
plt.yticks([.4, .7, 1])
labels = ['' for _ in axs[1].get_yticklabels()]
axs[3].set_yticklabels(labels)

plt.xlabel("Ideal")
plt.annotate('d', (.6, .4), fontsize=20)

#plt.sca(axs[3])
l = bubble_legend(title="MWP Sample", 
                  bbox_to_anchor=(1.4, 0, 1, 1), fontsize=11, frameon=False)

figures['mwp_score'] = fig


Joint regression coef: 1.539796	 intercept: 1.868260

Note that the two scores are somewhat independent, so combining them improves the ROC curve by 5-10%.
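The claim of partial independence can be checked with the rank correlation between the two scores; an illustrative check (not from the original notebook):

In [ ]:
from scipy.stats import spearmanr

rho, p = spearmanr(mwp.hitrate, mwp.mlconf)
print "Spearman rho = %0.2f (p = %0.3g)" % (rho, p)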


In [7]:
is_bubble = mwp['mode'] == 3

fig = plt.figure(tight_layout=True)

weight = mwp.mlconf.std() / mwp.hitrate.std()

roc_curve(is_bubble, mwp.hitrate, lw=4, label='Hit Rate')
roc_curve(is_bubble, mwp.mlconf, lw=4, label='Brut Score')
roc_curve(is_bubble, mwp.joint, lw=4, label='Joint')
plt.ylim(.4, 1.05)
with white_bg():
    plt.legend(title="MWP Sample ROC", loc='lower right')

figures['roc'] = fig


Brut Sample

The Brut sample uniformly samples the full range of Brut scores. At a fixed Brut score, the samples have the same statistical distribution as in the blind search. Thus, this sample can be used to estimate the posterior probability that an object is a bubble, given its Brut score.
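Concretely, the posterior P(Bubble | Brut score) is the logistic curve fit inside colorplot below. Evaluated standalone, the fit would look like this (a sketch duplicating colorplot's internal fit, not a separate cell in the original):

In [ ]:
clf = LogisticRegression(C=1e12)  # C large => essentially unregularized
clf.fit(brut.mlconf.values.reshape(-1, 1), (brut['mode'] == 3).values)
for score in [-1., 0., 1.]:
    print score, clf.predict_proba([[score]])[0, 1]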


In [8]:
fig = plt.figure(figsize=(7, 4))

plt.subplot(121)
colorplot(brut, 'mlconf', 'plurality')

plt.ylabel("Expert Consensus")
plt.yticks([.4, .7, 1])
plt.xlabel("Brut Score")
plt.ylim(.3, 1.05)

bubble_legend(title='Uniform Sample', bbox_to_anchor=(1.25, 0, .75, 1), frameon=False)

figures['uniform_score'] = fig


Is the logistic fit calibrated?

The check below draws 3000 simulated label sets from the fitted model and compares the model's accuracy on held-out real labels (vertical line) against its accuracy distribution on the simulated labels. Both fits seem reasonably well calibrated.


In [9]:
from sklearn.cross_validation import train_test_split

f = plt.figure()
def calibration_plot(df):
    clf = LogisticRegression(C=1e12)
    x = (df.mlconf).values.reshape(-1, 1)
    y = (df['mode'] == 3).values.astype(np.int)

    xr, xs, yr, ys = train_test_split(x, y)
    clf.fit(xr, yr)

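    # simulate draws Bernoulli labels from the model's predicted
    # probabilities, giving one mock label set per call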
    def simulate(clf, x):
        yp = clf.predict_proba(x)[:, 1]
        return (np.random.uniform(size=yp.size) < yp).astype(np.int)

    scores = [clf.score(xs, simulate(clf, xs)) for _ in range(3000)]
    plt.hist(scores, bins=np.linspace(.3, 1, 10), label="Realizations of \nLogistic Model")
    plt.axvline(clf.score(xs, ys), label="Data", color='k')    
    with white_bg():
        plt.legend(loc='upper left')
        
calibration_plot(mwp)
plt.title("Milky Way Sample")
plt.xlabel("Classification Score")
plt.show()

calibration_plot(brut)
plt.title("Uniform Sample")
plt.xlabel("Classification Score")
plt.show()


In [10]:
from sklearn.cross_validation import train_test_split
from math import factorial
from scipy.stats import binom

def confidence_bound(ntotal, nbubble):
    """
    50% confidence interval on the actual bubble rate, given that
    nbubble were observed out of ntotal objects.

    Assumes samples are generated from a binomial distribution,
    with uniform prior on the rate.
    """
    if not np.isfinite(nbubble):
        return [0, 0, 0]

    rates = np.linspace(0, 1, 100)
    likelihood = np.array([binom(ntotal, r).pmf(nbubble) for r in rates])
    prior = np.ones(rates.size)  # uniform prior
    posterior = prior * likelihood
    posterior /= posterior.sum()
    cp = np.cumsum(posterior)
    return rates[np.searchsorted(cp, [.25, .5, .75])]

def calibration_plot_2(clf, xtest, ytest):
    prob = clf.predict_proba(xtest)[:, 1]
    outcome = ytest
    data = pd.DataFrame(dict(prob=prob, outcome=outcome))

    # group outcomes into bins of similar probability
    # bins = np.sort(prob)[np.linspace(0, prob.size - 1, 10).astype(np.int)]
    bins = np.linspace(0, 1, 7)
    cuts = pd.cut(prob, bins)
    binwidth = bins[1] - bins[0]

    # empirical ratio and number of examples in each bin
    cal = data.groupby(cuts).outcome.agg(['mean', 'count', 'sum'])
    cal['pmid'] = (bins[:-1] + bins[1:]) / 2

    errors = [confidence_bound(ntot, nbub)
              for ntot, nbub in cal[['count', 'sum']].values]
    errors = np.column_stack(errors)
    errors[0] = errors[1] - errors[0]
    errors[2] -= errors[1]

    # the calibration plot
    ax = plt.gca()
    p = plt.errorbar(cal.pmid, errors[1], errors[[0, 2]], fmt=None,
                     label='25-75% Confidence Interval')
    plt.plot(cal.pmid, cal.pmid, linestyle='--', lw=1, color='k')
    plt.plot(cal.pmid, cal['mean'], 's', color='#1b9e77', mec='none',
             label='Observed Fraction')
    plt.ylabel("Empirical P(Bubble)")
    plt.xlabel("Predicted P(Bubble)")
    plt.legend(loc='upper left', frameon=False, numpoints=1, scatterpoints=1)
    plt.xlim(0, 1.05)
    plt.ylim(-.05, 1.3)

def run(df):
    clf = LogisticRegression(C=1e12)
    x = (df.mlconf).values.reshape(-1, 1)
    y = (df['mode'] == 3).values.astype(np.int)
    clf.fit(x, y)
    return calibration_plot_2(clf, x, y)

f = plt.figure()
run(mwp)
plt.annotate('MWP Sample', xy=(1, 0), ha='right', fontsize=14)
plt.show()
figures['mwp_cal'] = plt.gcf()

f = plt.figure()
run(brut)
plt.annotate('Uniform Sample', xy=(1, 0), ha='right', fontsize=14)
plt.show()
figures['uniform_cal'] = plt.gcf()
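As a sanity check on confidence_bound, a small sample should give a wide interval centered near the observed fraction; an illustrative example (not in the original notebook):

In [ ]:
lo, med, hi = confidence_bound(10, 3)
print "25/50/75%% quantiles for 3/10 bubbles: %0.2f %0.2f %0.2f" % (lo, med, hi)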

Vote Distribution


In [11]:
from matplotlib import rcParams as rc
rc['axes.color_cycle']


Out[11]:
['#1b9e77', '#d95f02', '#7570b3', '#e7298a', '#66a61e', '#e6ab02', '#a6761d']

In [12]:
minority = dict(red='#FB9A99', orange='#FDBF6F', blue='#A6CEE3')
majority = dict(red='#E31A1C', orange='#FF7F00', blue='#1F78B4')

def vote_plot(df, loc='lower right', figsize=(3,6), labels=False):
    fig = plt.figure(figsize=figsize, dpi=100)    
        
    df = df.sort(columns=['v_1', 'v_2'], ascending=True)
    x = np.arange(df.shape[0])
    rows = range(df.shape[0])

    c = [majority['blue'] for i in rows]
    
    plt.barh(x, width=df.v_3, left=df.v_1 + df.v_2, color=c,
            edgecolor='none', label='Bubble')
    
    c = [majority['orange'] for i in rows]
    plt.barh(x, df.v_2, left=df.v_1, color=c, ec='none', label="Ambiguous")
    
    c = [majority['red'] for i in rows]
    plt.barh(x, df.v_1, color=c, ec='none', label="Non-bubble")
    
    plt.ylabel("Object")
    plt.xlabel("Fraction of votes")
    
    #make a legend. Manually set colors
    b = plt.Rectangle((0,0), 1, 1, facecolor=majority['blue'], edgecolor='none')
    i = plt.Rectangle((0,0), 1, 1, facecolor=majority['orange'], edgecolor='none')
    n = plt.Rectangle((0,0), 1, 1, facecolor=majority['red'], edgecolor='none')
    leg = plt.legend([b, i, n], ['Bubble', 'Irregular/\nAmbiguous', 'Non-Bubble'], loc=loc,
                   fancybox=True)
    leg.get_frame().set_edgecolor('none')
    #for c, t in zip([majority['blue'], majority['orange'], majority['red']],
    #                leg.get_texts()):
    #    t.set_color(c)
    
    plt.grid(False)
    
    if labels:
        plt.yticks(np.arange(df.shape[0]), df.image_id.values)
    else:
        plt.yticks([])
        
    plt.xticks([.25, .5, .75])
    plt.ylim(0, df.shape[0])
    plt.xlim(0, 1)
    
    ax = plt.gca()
    for spine in ['top', 'bottom', 'left', 'right']:
        ax.spines[spine].set_visible(False)
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')
    
    ax.xaxis.tick_bottom()
    #plt.grid(axis='x', color='white')

In [13]:
12. / 45.


Out[13]:
0.26666666666666666

In [14]:
(mwp['mode'] != 3).sum(), (mwp['mode'] != 3).mean(), mwp['mode'].size


Out[14]:
(12, 0.26666666666666666, 45)

In [15]:
# vote_plot creates its own figure, so no extra plt.figure() call is needed
vote_plot(mwp, loc='lower right')
plt.title("MWP Sample")
    
figures['mwp_votes'] = plt.gcf()



In [16]:
vote_plot(brut, loc='upper left')
plt.title("Uniform Sample")
    
figures['uniform_votes'] = plt.gcf()


Interesting Cases


In [17]:
from IPython.display import Image, display

def show(df, labels):
    # vote_plot creates its own figure, so no plt.figure() call is needed here
    with white_bg():
        vote_plot(df.set_index(df.image_id).ix[labels], labels=True, figsize=(3, 2))
        plt.title("MWP Sample")
        plt.show()

    for lbl in labels:
        print lbl
        display(Image(url='http://wino.herokuapp.com/static/%s.png' % lbl, width=400))

show(mwp, ['m_344426_42_29_-42',
          'm_330289_657_77_-42',
          'm_44282_206_43_-64',
          'm_333689_-444_28_-2',
          'm_336268_286_46_-41'])


m_344426_42_29_-42
m_330289_657_77_-42
m_44282_206_43_-64
m_333689_-444_28_-2
m_336268_286_46_-41

In [18]:
labels = ['b_44169_91_27_-5',
          'b_37886_-353_66_35']
show(brut, labels)


b_44169_91_27_-5
b_37886_-353_66_35

Summary


In [19]:
def summarize_consensus(df):
    
    print "Fraction of consensus > .5 : %0.2f" % (df.plurality > 0.5).mean()
    print "Fraction of consensus > .75: %0.2f" % (df.plurality > .75).mean()
    print "Average consensus:           %0.2f" % (df.plurality.mean())
    print "Fraction of mode !=3:        %0.2f" % (df.mode != 3).mean()
    
print "MWP Summary"
summarize_consensus(mwp)
print
print "Brut Summary"
summarize_consensus(brut)


MWP Summary
Fraction of consensus > .5 : 0.69
Fraction of consensus > .75: 0.36
Average consensus:           0.69
Fraction of mode !=3:        0.27

Brut Summary
Fraction of consensus > .5 : 0.74
Fraction of consensus > .75: 0.36
Average consensus:           0.72
Fraction of mode !=3:        0.72