In [10]:
from pprint import pprint
import crowdastro.crowd.util
from crowdastro.crowd.raykar import RaykarClassifier
import crowdastro.experiment.experiment_rgz_raykar as rgzr
from crowdastro.experiment.results import Results
import crowdastro.plot
import h5py
import matplotlib.pyplot as plt
import numpy
import sklearn.metrics
%matplotlib inline
# Input files; both paths are relative to the notebook's working directory.
# HDF5 file generated by the crowdastro pipeline.
CROWDASTRO_PATH = '../data/crowdastro.h5'
# Results file generated by crowdastro.experiment.experiment_rgz_raykar.
RESULTS_PATH = '../data/results_rgz_raykar.h5'
In [2]:
# Load the Norris labels, the raw (masked) RGZ crowd labels, and the labels
# returned by rgzr.top_n_accurate_targets for 10 annotators.
with h5py.File(CROWDASTRO_PATH, 'r') as crowdastro_h5:
    # Dataset.value was deprecated in h5py 2.x and removed in h5py 3.0;
    # indexing with an empty tuple reads the whole dataset into memory.
    norris_labels = crowdastro_h5['/wise/cdfs/norris_labels'][()]
    # Read both datasets explicitly so the masked array owns in-memory copies
    # that remain valid after the file is closed.
    crowd_labels = numpy.ma.MaskedArray(
        crowdastro_h5['/wise/cdfs/rgz_raw_labels'][()],
        mask=crowdastro_h5['/wise/cdfs/rgz_raw_labels_mask'][()])
    # Must run while the file is still open: the helper reads from it.
    top_10 = rgzr.top_n_accurate_targets(crowdastro_h5, n_annotators=10)
In [4]:
# Empirically estimate alpha and beta for every row of top_10 by comparing
# that annotator's observed labels with the Norris labels.
approx_alphas = []
approx_betas = []
n_annotators = top_10.shape[0]
for annotator in range(n_annotators):
    annotator_labels = top_10[annotator]
    # Only the entries this annotator actually labelled (i.e. not masked).
    observed = ~annotator_labels.mask
    confusion = sklearn.metrics.confusion_matrix(norris_labels[observed],
                                                 annotator_labels[observed])
    # Rows index the Norris label, so row sums count the objects per true class.
    class_totals = confusion.sum(axis=1)
    approx_alphas.append(confusion[1, 1] / class_totals[1])
    approx_betas.append(confusion[0, 0] / class_totals[0])

print('approximate alpha:')
pprint(approx_alphas)
print('approximate beta:')
pprint(approx_betas)

crowdastro.plot.vertical_scatter(['$\\alpha$', '$\\beta$'], [approx_alphas, approx_betas], line=True)
plt.show()
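It seems that higher values of $\alpha$ are correlated with lower values of $\beta$, and vice versa. This makes intuitive sense: an annotator who labels more objects as positive will tend to have a higher true positive rate ($\alpha$) but a lower true negative rate ($\beta$), and the reverse holds for an annotator who rarely labels objects as positive.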
In [6]:
# Load the stored experiment results from RESULTS_PATH.
results = Results.from_path(RESULTS_PATH)
In [22]:
# Rebuild the 'Raykar(Top-10-accurate)' classifier stored for every split,
# then average the a_ and b_ attributes across splits.
raykar_classifiers = [
    RaykarClassifier.unserialise(
        results.get_model('Raykar(Top-10-accurate)', split_index))
    for split_index in range(results.n_splits)
]
raykar_alphas = numpy.mean(
    [classifier.a_ for classifier in raykar_classifiers], axis=0)
raykar_betas = numpy.mean(
    [classifier.b_ for classifier in raykar_classifiers], axis=0)

print('raykar alpha:')
pprint(list(raykar_alphas))
print('raykar beta:')
pprint(list(raykar_betas))

crowdastro.plot.vertical_scatter(['$\\alpha$', '$\\beta$'], [raykar_alphas, raykar_betas], line=True)
# Zoom the y-axis in: the averaged values printed above are all very small.
plt.ylim(0, 0.005)
plt.show()
These numbers are all really small. This may be because the Raykar algorithm doesn't account for the fact that the labels are partially observed. If this is the case, then the Raykar algorithm is estimating the values as
$$ \alpha = \frac{\text{true positives}}{\text{true positives} + \text{false negatives} + \text{unobserved}} $$

$$ \beta = \frac{\text{true negatives}}{\text{true negatives} + \text{false positives} + \text{unobserved}} $$

where we want

$$ \alpha = \frac{\text{true positives}}{\text{true positives} + \text{false negatives}} $$

$$ \beta = \frac{\text{true negatives}}{\text{true negatives} + \text{false positives}}. $$
In [ ]: