In [1]:
from pprint import pprint
import sys

from astropy.coordinates import SkyCoord
import h5py
import numpy
import sklearn.neighbors
import seaborn

sys.path.insert(1, '..')

import crowdastro.active_learning.active_crowd as active_crowd
import crowdastro.active_learning.passive_crowd as passive_crowd
import crowdastro.active_learning.active_crowd_scalar as active_crowd_scalar

CROWDASTRO_H5_PATH = '../data/crowdastro.h5'
TRAINING_H5_PATH = '../data/training.h5'
NORRIS_DAT_PATH = '../data/norris_2006_atlas_classifications_ra_dec_only.dat'

# Load Norris labels.
with h5py.File(TRAINING_H5_PATH, 'r') as training_f:
    ir_positions = training_f['positions'].value

ir_tree = sklearn.neighbors.KDTree(ir_positions)

with open(NORRIS_DAT_PATH, 'r') as norris_dat:
    norris_coords = [r.strip().split('|') for r in norris_dat]

norris_labels = numpy.zeros((len(ir_positions),))
for ra, dec in norris_coords:
    # Find a neighbour.
    skycoord = SkyCoord(ra=ra, dec=dec, unit=('hourangle', 'deg'))
    ra = skycoord.ra.degree
    dec = skycoord.dec.degree
    ((dist,),), ((ir,),) = ir_tree.query([(ra, dec)])
    if dist < 0.1:
        norris_labels[ir] = 1
In [2]:
with h5py.File(CROWDASTRO_H5_PATH) as f_h5:
    print(sum(1 for i in f_h5['/atlas/cdfs/']['classification_usernames'] if not i)
          / len(f_h5['/atlas/cdfs/']['classification_usernames']))
Only 15% of labels are contributed by anonymous users! That's great for the algorithm. How many users are there?
In [3]:
with h5py.File(CROWDASTRO_H5_PATH) as f_h5:
    print(len({i for i in f_h5['/atlas/cdfs/']['classification_usernames'] if i}))
There are 1193 labellers. That's big but hopefully my code can handle it (and if not I'll have to change my methodology a bit).
Let's pull out some labels. This involves matching each IR object to a label for each annotator. If an IR object never appears in a subject that the annotator has labelled, then it should be masked.
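As a toy sketch of this masking convention (hypothetical 2x3 shapes, not the real data): a masked entry means "annotator $t$ never saw IR object $i$", and masked entries drop out of reductions like sums.
# Hypothetical miniature of the label matrix, to show the masking semantics.
toy = numpy.ma.MaskedArray(numpy.zeros((2, 3)), mask=numpy.ones((2, 3)))
toy.mask[0, :2] = 0  # Annotator 0 saw objects 0 and 1...
toy[0, 1] = 1        # ...and labelled object 1 positive (also unmasks it).
print(toy.sum(axis=1))  # Row 0 sums to 1.0; row 1 is entirely masked.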
In [4]:
with h5py.File(CROWDASTRO_H5_PATH) as f_h5:
    annotators = sorted({i for i in f_h5['/atlas/cdfs/classification_usernames'] if i})
    n_annotators = len(annotators)
    annotator_to_index = {j: i for i, j in enumerate(annotators)}
    n_examples = f_h5['/wise/cdfs/numeric'].shape[0]
    ir_tree = sklearn.neighbors.KDTree(f_h5['/wise/cdfs/numeric'][:, :2], metric='chebyshev')
In [5]:
with h5py.File(CROWDASTRO_H5_PATH) as f_h5:
    labels = numpy.ma.MaskedArray(numpy.zeros((n_annotators, n_examples)),
                                  mask=numpy.ones((n_annotators, n_examples)))
    for (atlas_idx, ra, dec), c_user in zip(
            f_h5['/atlas/cdfs/classification_positions'],
            f_h5['/atlas/cdfs/classification_usernames']):
        if not c_user:
            continue
        t = annotator_to_index[c_user]
        atlas_ra, atlas_dec = f_h5['/atlas/cdfs/numeric'][atlas_idx, :2]
        # t has seen this ATLAS subject, so unmask everything within
        # 1' Chebyshev distance (the radius of an RGZ subject).
        nearby = ir_tree.query_radius([[atlas_ra, atlas_dec]], 1 / 60)[0]
        labels.mask[t, nearby] = 0
        # Label the point nearest the classification as 1.
        # (The others are 0 by default.)
        if numpy.isnan(ra) or numpy.isnan(dec):
            continue
        point = ir_tree.query([[ra, dec]], return_distance=False)[0]
        labels[t, point] = 1
In [6]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(15, 10))
plt.subplot(1, 2, 1)
plt.hist((~labels.mask).sum(axis=0))
plt.xlabel('Number of viewings')
plt.ylabel('IR objects')
plt.subplot(1, 2, 2)
plt.hist((~labels.mask).sum(axis=1), bins=numpy.linspace(0, 25000, 200))
plt.xlim((0, 1000))
plt.xlabel('Number of annotations')
plt.ylabel('Annotators')
Out[6]:
What is the distribution of balanced accuracies for each annotator? Can we estimate $p(y_i^{(t)} | x_i, z_i)$?
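(For reference, the balanced accuracy computed below is the mean of the true positive and true negative rates, $\mathrm{BA} = \frac{1}{2}\left(\frac{TP}{P} + \frac{TN}{N}\right)$, where $P$ and $N$ are the total numbers of positive and negative examples, i.e. the row sums of the confusion matrix.)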
In [7]:
import sklearn.metrics

accuracies = []
annotator_to_accuracy = {}
for t in range(n_annotators):
    mask = labels[t].mask
    cm = sklearn.metrics.confusion_matrix(norris_labels[~mask], labels[t, ~mask]).astype(float)
    if cm.shape == (1, 1):
        continue
    tp = cm[1, 1]
    n, p = cm.sum(axis=1)
    tn = cm[0, 0]
    if not (n and p):
        continue
    ba = (tp / p + tn / n) / 2
    accuracies.append(ba)
    annotator_to_accuracy[t] = ba
print('{:.02%} of labellers have a balanced accuracy.'.format(len(accuracies) / n_annotators))
In [8]:
plt.hist(accuracies, color='grey', bins=20)
plt.xlabel('Balanced accuracy')
plt.ylabel('Number of annotators')
plt.show()
print('Average: ({:.02f} +- {:.02f})%'.format(numpy.mean(accuracies) * 100, numpy.std(accuracies) * 100))
In [9]:
experts = ("42jkb", "ivywong", "stasmanian", "klmasters", "Kevin",
"akapinska", "enno.middelberg", "xDocR", "DocR", "vrooje", "KWillett")
print([expert for expert in experts if expert.encode('ascii') in annotators])
In [10]:
counts = [(numpy.ma.sum(labels[t] == 1), t) for t in range(n_annotators)]
counts.sort()
pprint([(annotator_to_accuracy[t], t, count) for count, t in counts[-10:]])
top_10 = sorted([t for _, t in counts[-10:]])
In [11]:
for annotator, count in [(annotators[t], count) for count, t in reversed(counts)]:
    print(annotator.decode('utf-8'), '\t', count)
Let's throw in just the top 10 annotators (#126) and see how it goes. First, I'll upsample the positive examples. I'll count a "negative" example as anything that doesn't have any positive classifications.
In [33]:
top_labels = labels[top_10]
positive_bool = numpy.any(top_labels, axis=0)
positives = numpy.arange(top_labels.shape[1])[positive_bool]
non_positives = numpy.arange(top_labels.shape[1])[~positive_bool]
while positives.shape[0] < non_positives.shape[0]:
    # positives[:] is just a view, so shuffle a copy instead; otherwise the
    # shuffle would reorder positives in place too.
    new_positives = positives.copy()
    numpy.random.shuffle(new_positives)
    positives = numpy.concatenate([positives, new_positives])
positives = positives[:non_positives.shape[0]]
upsampled = numpy.concatenate([positives, non_positives])
upsampled.sort()
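An aside: a simpler sketch with much the same effect, assuming sampling positives with replacement is acceptable, is a single draw; the loop above instead duplicates each positive a near-equal number of times.
# Alternative upsampling: sample positives with replacement in one call.
positives = numpy.random.choice(positives, size=non_positives.shape[0])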
In [34]:
import sklearn.cross_validation

upsampled_train, upsampled_test = sklearn.cross_validation.train_test_split(upsampled)
upsampled_train.sort()
upsampled_test.sort()
Now I can run the algorithm.
In [35]:
print(upsampled_train.shape, upsampled_test.shape)
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features'][upsampled_train, :]
    res = passive_crowd.train(x, top_labels.astype(bool)[:, upsampled_train], lr_init=True)
In [40]:
import sklearn.metrics

with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    pred = passive_crowd.predict(res[0], res[1], x[upsampled_test, :])
    cm = sklearn.metrics.confusion_matrix(norris_labels[upsampled_test], pred)
    tp = cm[1, 1]
    n, p = cm.sum(axis=1)
    tn = cm[0, 0]
    ba = (tp / p + tn / n) / 2
    print(ba)
    print(cm)
In [39]:
import seaborn, matplotlib.pyplot as plt
%matplotlib inline

with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    pred = passive_crowd.logistic_regression(res[0], res[1], x[upsampled_test, :])
    pos_pred = pred[norris_labels[upsampled_test] == 1]
    neg_pred = pred[norris_labels[upsampled_test] == 0]
    assert pos_pred.shape[0] + neg_pred.shape[0] == pred.shape[0]
    plt.figure(figsize=(10, 5))
    seaborn.distplot(pos_pred, rug=True, hist=False, color='green', rug_kws={'alpha': 0.1})
    seaborn.distplot(neg_pred, rug=True, hist=False, color='red', rug_kws={'alpha': 0.1})
    plt.xlim((0, 1))
    plt.show()
In [41]:
simulated_norris_labels = numpy.ma.MaskedArray(numpy.tile(norris_labels, (2, 1)), False)
# mask=numpy.random.binomial(1, 0.5, size=(5, 24140)))
In [42]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features'][upsampled_train, :]
    res = passive_crowd.train(x, simulated_norris_labels.astype(bool)[:, upsampled_train], lr_init=True)
In [43]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    pred = passive_crowd.predict(res[0], res[1], x[upsampled_test, :])
    cm = sklearn.metrics.confusion_matrix(norris_labels[upsampled_test], pred)
    tp = cm[1, 1]
    n, p = cm.sum(axis=1)
    tn = cm[0, 0]
    ba = (tp / p + tn / n) / 2
    print(ba)
    print(cm)
In [44]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    score = res[0].dot(x[upsampled_test, :].T) + res[1]
    pos_score = score[norris_labels[upsampled_test] == 1]
    neg_score = score[norris_labels[upsampled_test] == 0]
    assert pos_score.shape[0] + neg_score.shape[0] == score.shape[0]
    plt.figure(figsize=(10, 5))
    seaborn.distplot(pos_score, rug=True, hist=False, color='green', rug_kws={'alpha': 0.1})
    seaborn.distplot(neg_score, rug=True, hist=False, color='red', rug_kws={'alpha': 0.1})
    plt.show()
In [45]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    score = res[0].dot(x[upsampled_train, :].T) + res[1]
    pos_score = score[norris_labels[upsampled_train] == 1]
    neg_score = score[norris_labels[upsampled_train] == 0]
    assert pos_score.shape[0] + neg_score.shape[0] == score.shape[0]
    plt.figure(figsize=(10, 5))
    seaborn.distplot(pos_score, rug=True, hist=False, color='green', rug_kws={'alpha': 0.1})
    seaborn.distplot(neg_score, rug=True, hist=False, color='red', rug_kws={'alpha': 0.1})
    plt.xlim((-10, 25))
    plt.show()
In [26]:
import sklearn.linear_model

lr = sklearn.linear_model.LogisticRegression(C=100)
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    lr.fit(x[downsampled_train, :], norris_labels[downsampled_train])
In [27]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    pred = lr.predict(x[downsampled_test, :])
    cm = sklearn.metrics.confusion_matrix(norris_labels[downsampled_test], pred)
    tp = cm[1, 1]
    n, p = cm.sum(axis=1)
    tn = cm[0, 0]
    ba = (tp / p + tn / n) / 2
    print(ba)
    print(cm)
In [28]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    score = lr.decision_function(x[downsampled_test, :])
    pos_score = score[norris_labels[downsampled_test] == 1]
    neg_score = score[norris_labels[downsampled_test] == 0]
    assert pos_score.shape[0] + neg_score.shape[0] == score.shape[0]
    plt.figure(figsize=(10, 5))
    seaborn.distplot(pos_score, rug=True, hist=False, color='green', rug_kws={'alpha': 0.1})
    seaborn.distplot(neg_score, rug=True, hist=False, color='red', rug_kws={'alpha': 0.1})
    plt.show()
In [29]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    score = lr.decision_function(x[downsampled_train, :])
    pos_score = score[norris_labels[downsampled_train] == 1]
    neg_score = score[norris_labels[downsampled_train] == 0]
    assert pos_score.shape[0] + neg_score.shape[0] == score.shape[0]
    plt.figure(figsize=(10, 5))
    seaborn.distplot(pos_score, rug=True, hist=False, color='green', rug_kws={'alpha': 0.1})
    seaborn.distplot(neg_score, rug=True, hist=False, color='red', rug_kws={'alpha': 0.1})
    plt.show()
In [88]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features'][downsampled_train, :]
    res = active_crowd_scalar.train(x, top_labels.astype(bool)[:, downsampled_train], lr_init=True)
In [89]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    pred = passive_crowd.predict(res[0], res[1], x[downsampled_test, :])
    cm = sklearn.metrics.confusion_matrix(norris_labels[downsampled_test], pred)
    tp = cm[1, 1]
    n, p = cm.sum(axis=1)
    tn = cm[0, 0]
    ba = (tp / p + tn / n) / 2
    print(ba)
    print(cm)
In [98]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    score = res[0].dot(x[downsampled_test, :].T) + res[1]
    pos_score = score[norris_labels[downsampled_test] == 1]
    neg_score = score[norris_labels[downsampled_test] == 0]
    assert pos_score.shape[0] + neg_score.shape[0] == score.shape[0]
    plt.figure(figsize=(10, 5))
    seaborn.distplot(pos_score, rug=True, hist=False, color='green', rug_kws={'alpha': 0.1})
    seaborn.distplot(neg_score, rug=True, hist=False, color='red', rug_kws={'alpha': 0.1})
    plt.xlim((-20, 20))
    plt.show()
In [97]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    score = res[0].dot(x[downsampled_train, :].T) + res[1]
    pos_score = score[norris_labels[downsampled_train] == 1]
    neg_score = score[norris_labels[downsampled_train] == 0]
    assert pos_score.shape[0] + neg_score.shape[0] == score.shape[0]
    plt.figure(figsize=(10, 5))
    seaborn.distplot(pos_score, rug=True, hist=False, color='green', rug_kws={'alpha': 0.1})
    seaborn.distplot(neg_score, rug=True, hist=False, color='red', rug_kws={'alpha': 0.1})
    plt.xlim((-20, 20))
    plt.show()
In [92]:
res[2]
Out[92]:
First, let's make a covariance matrix.
In [17]:
cov = numpy.ma.cov(labels)
print(cov, cov.shape)
In [18]:
plt.imshow(cov, interpolation='None')
Out[18]:
As expected, lots of unknowns. We'll press on nevertheless!
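A caveat (my reading of numpy's behaviour, not a property of the data): numpy.ma.cov normalises each entry by the number of jointly unmasked observations, so covariances between annotators with little or no overlap are noisy or undefined. A quick check of how much overlap backs each entry:
# Count how many objects each pair of annotators has jointly seen.
seen = (~labels.mask).astype(int)
overlap = seen.dot(seen.T)
print(overlap.min(), overlap.max())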
In [34]:
import sklearn.cluster, collections
In [29]:
kmc = sklearn.cluster.KMeans(5)
kmc.fit(cov)
Out[29]:
In [33]:
clusters = kmc.predict(cov)
Now, we'll do a majority vote over these clusters.
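That is, for cluster $c$ and object $i$, take $\hat{y}_i^{(c)} = \arg\max_y |\{t \in c : y_i^{(t)} = y,\ y_i^{(t)} \text{ unmasked}\}|$, and leave $\hat{y}_i^{(c)}$ masked if nobody in cluster $c$ saw object $i$.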
In [37]:
cluster_labels = numpy.ma.MaskedArray(numpy.zeros((5, labels.shape[1])),
                                      mask=numpy.zeros((5, labels.shape[1])))
for c in range(5):
    for i in range(labels.shape[1]):
        this_cluster_labels = labels[clusters == c, i]
        # Compute the majority vote over the unmasked labels.
        counter = collections.Counter(this_cluster_labels[~this_cluster_labels.mask])
        if counter:
            cluster_labels[c, i] = max(counter, key=counter.get)
        else:
            # Nobody in this cluster saw object i.
            cluster_labels.mask[c, i] = True
Now let's try a basic logistic regression on each of them.
In [59]:
def balanced_accuracy(y_true, y_pred):
    try:
        cm = sklearn.metrics.confusion_matrix(y_true[~y_pred.mask], y_pred[~y_pred.mask])
    except AttributeError:
        # y_pred is not masked; use all predictions.
        cm = sklearn.metrics.confusion_matrix(y_true, y_pred)
    tp = cm[1, 1]
    n, p = cm.sum(axis=1)
    tn = cm[0, 0]
    ba = (tp / p + tn / n) / 2
    return ba
In [52]:
i_tr, i_te = sklearn.cross_validation.train_test_split(numpy.arange(labels.shape[1]))
with h5py.File(TRAINING_H5_PATH) as f_h5:
    features = f_h5['features'].value
for c in range(5):
    labels_ = cluster_labels[c]
    lr = sklearn.linear_model.LogisticRegression(class_weight='balanced')
    # NB: scikit-learn discards the mask on labels_, so masked entries enter
    # the fit as their underlying (zero) values.
    lr.fit(features[i_tr], labels_[i_tr])
    print(balanced_accuracy(norris_labels[i_te], lr.predict(features[i_te])))
Next, we'll put it into the scalar $\eta_t$ implementation.
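(My assumption about what "scalar" means here, since active_crowd_scalar isn't spelled out in this notebook: each annotator's reliability collapses to a single parameter, $p(y_i^{(t)} = z_i) = \eta_t$, independent of $x_i$.)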
In [57]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features'][downsampled_train, :]
    res = active_crowd_scalar.train(x, cluster_labels.astype(bool)[:, downsampled_train], lr_init=True)
In [60]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    pred = passive_crowd.predict(res[0], res[1], x[downsampled_test, :])
    cm = sklearn.metrics.confusion_matrix(norris_labels[downsampled_test], pred)
    ba = balanced_accuracy(norris_labels[downsampled_test], pred)
    print(cm)
    print(ba)
In [63]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    score = res[0].dot(x[downsampled_test, :].T) + res[1]
    pos_score = score[norris_labels[downsampled_test] == 1]
    neg_score = score[norris_labels[downsampled_test] == 0]
    assert pos_score.shape[0] + neg_score.shape[0] == score.shape[0]
    plt.figure(figsize=(10, 5))
    seaborn.distplot(pos_score, rug=True, hist=False, color='green', rug_kws={'alpha': 0.1})
    seaborn.distplot(neg_score, rug=True, hist=False, color='red', rug_kws={'alpha': 0.1})
    plt.xlim((-20, 20))
    plt.show()
Now let's try the full algorithm.
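(Again my assumption: by the full algorithm I mean the variant where annotator reliability depends on the example, something like $\eta_t(x_i) = \sigma(w_t^\top x_i + \gamma_t)$ in the style of Yan et al., rather than a single scalar per annotator.)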
In [67]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features'][downsampled_train, :]
    res = active_crowd.train(x, cluster_labels.astype(bool)[:, downsampled_train], lr_init=True)
In [68]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    pred = passive_crowd.predict(res[0], res[1], x[downsampled_test, :])
    cm = sklearn.metrics.confusion_matrix(norris_labels[downsampled_test], pred)
    ba = balanced_accuracy(norris_labels[downsampled_test], pred)
    print(cm)
    print(ba)
In [78]:
plt.plot(res[2].T)
plt.xscale('log')
plt.legend(range(5))
Out[78]:
In [79]:
print(res[3])
In [94]:
import sklearn.decomposition
pca = sklearn.decomposition.PCA(n_components=10)
In [95]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features'][downsampled_train, :]
    pca.fit(x)
In [96]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features'][downsampled_train, :]
    res = active_crowd.train(pca.transform(x), labels.astype(bool)[:, downsampled_train], lr_init=True)
In [108]:
seaborn.distplot(res[3])
Out[108]:
In [109]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    pred = passive_crowd.predict(res[0], res[1], pca.transform(x[downsampled_test, :]))
    cm = sklearn.metrics.confusion_matrix(norris_labels[downsampled_test], pred)
    ba = balanced_accuracy(norris_labels[downsampled_test], pred)
    print(cm)
    print(ba)
In [112]:
with h5py.File(TRAINING_H5_PATH) as f_h5:
    x = f_h5['features']
    score = res[0].dot(pca.transform(x[downsampled_test, :]).T) + res[1]
    pos_score = score[norris_labels[downsampled_test] == 1]
    neg_score = score[norris_labels[downsampled_test] == 0]
    assert pos_score.shape[0] + neg_score.shape[0] == score.shape[0]
    plt.figure(figsize=(10, 5))
    seaborn.distplot(pos_score, rug=True, hist=False, color='green', rug_kws={'alpha': 0.1})
    seaborn.distplot(neg_score, rug=True, hist=False, color='red', rug_kws={'alpha': 0.1})
    plt.xlim((-100, 100))
    plt.show()