Plots a passive learning curve w.r.t. ATLAS objects. Trained, tested on RGZ, split on compact/resolved. Testing on RGZ instead of Norris because we believe it to be reasonably accurate and it's also a lot bigger; if we want a good idea of how this curve levels out we really want to use as much data as possible. Splitting on compact/resolved because we expect compact to level out a lot faster (possibly very fast indeed).
In [5]:
import astropy.io.ascii as asc, numpy, h5py, sklearn.linear_model, crowdastro.crowd.util, pickle, scipy.spatial
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
with open('/Users/alger/data/Crowdastro/sets_atlas.pkl', 'rb') as f:
atlas_sets = pickle.load(f)
atlas_sets_compact = atlas_sets['RGZ & compact']
atlas_sets_resolved = atlas_sets['RGZ & resolved']
with open('/Users/alger/data/Crowdastro/sets_swire.pkl', 'rb') as f:
swire_sets = pickle.load(f)
swire_sets_compact = swire_sets['RGZ & compact']
swire_sets_resolved = swire_sets['RGZ & resolved']
with h5py.File('/Users/alger/data/Crowdastro/swire.h5') as f:
swire_features = f['features'].value
with h5py.File('/Users/alger/data/Crowdastro/crowdastro-swire.h5') as f:
swire_names = [i.decode('ascii') for i in f['/swire/cdfs/string'].value]
swire_coords = f['/swire/cdfs/numeric'][:, :2]
swire_labels = {i['swire']: i['rgz_label'] for i in asc.read('/Users/alger/data/SWIRE/all_labels.csv')}
table = asc.read('/Users/alger/data/Crowdastro/one-table-to-rule-them-all.tbl')
swire_tree = scipy.spatial.KDTree(swire_coords)
In [13]:
def test_on_atlas_sets(atlas_sets, swire_sets):
subset_sizes = numpy.logspace(numpy.log2(5),
numpy.log2(len(atlas_sets[0][0])),
base=2, num=10)
n_atlas = []
n_swire = []
bas = []
for (train, test), (_, test_swire) in zip(atlas_sets, swire_sets):
key_to_row = {}
for row in table:
key_to_row[row['Key']] = row
for subset_size in subset_sizes:
print(subset_size, end=' ')
# Subsample train.
subset_size = int(subset_size)
train_subset = list(train)
numpy.random.shuffle(train_subset)
train_subset = train_subset[:subset_size]
# Get coords.
ras = [key_to_row[k]['Component RA (Franzen)'] for k in train_subset]
decs = [key_to_row[k]['Component DEC (Franzen)'] for k in train_subset]
coords = list(zip(ras, decs))
# Find nearby SWIREs.
nearby = sorted({int(i) for i in numpy.concatenate(swire_tree.query_ball_point(coords, 1 / 60))})
# Train on the features.
features = swire_features[nearby]
labels = [swire_labels[swire_names[n]] == 'True' for n in nearby]
lr = sklearn.linear_model.LogisticRegression(class_weight='balanced', C=1e10)
lr.fit(features, labels)
# Compute accuracy.
test_labels = [swire_labels[swire_names[n]] == 'True' for n in test_swire]
test_features = swire_features[test_swire]
acc = crowdastro.crowd.util.balanced_accuracy(test_labels, lr.predict(test_features))
n_atlas.append(int(subset_size))
n_swire.append(len(nearby))
bas.append(acc)
print()
return n_atlas, n_swire, bas
In [ ]:
n_atlas, n_swire, bas = test_on_atlas_sets(atlas_sets_compact, swire_sets_compact)
In [9]:
plt.scatter(n_atlas, bas, alpha=0.7)
plt.title('Passive Learning Curve — Compact')
plt.xlabel('Number of radio objects')
plt.ylabel('Balanced accuracy')
plt.xscale('log')
In [14]:
n_atlas_resolved, n_swire_resolved, bas_resolved = test_on_atlas_sets(atlas_sets_resolved, swire_sets_resolved)
In [15]:
plt.scatter(n_atlas_resolved, bas_resolved, alpha=0.7)
plt.title('Passive Learning Curve — Resolved')
plt.xlabel('Number of radio objects')
plt.ylabel('Balanced accuracy')
plt.xscale('log')
In [21]:
plt.scatter(n_atlas_resolved, numpy.array(bas_resolved) * 100, alpha=0.7, color='red', label='Resolved')
plt.scatter(n_atlas, numpy.array(bas) * 100, alpha=0.7, color='green', label='Compact')
plt.title('Accuracy against number of objects in training set')
plt.xlabel('Number of radio objects')
plt.ylabel('Balanced accuracy (%)')
plt.xscale('log')
plt.legend()
Out[21]:
In [44]:
n_atlas_to_acc_compact = {n: [] for n in n_atlas}
for n, ba in zip(n_atlas, bas):
n_atlas_to_acc_compact[n].append(ba)
xs_compact = []
ys_compact = []
yerr_compact = []
for n in sorted(set(n_atlas)):
xs_compact.append(n)
ys_compact.append(numpy.mean(n_atlas_to_acc_compact[n]))
yerr_compact.append(numpy.std(n_atlas_to_acc_compact[n]))
xs_compact = numpy.array(xs_compact)
ys_compact = numpy.array(ys_compact)
yerr_compact = numpy.array(yerr_compact)
ylow_compact = ys_compact - yerr_compact
yhigh_compact = ys_compact + yerr_compact
n_atlas_to_acc_resolved = {n: [] for n in n_atlas_resolved}
for n, ba in zip(n_atlas_resolved, bas_resolved):
n_atlas_to_acc_resolved[n].append(ba)
xs_resolved = []
ys_resolved = []
yerr_resolved = []
for n in sorted(set(n_atlas_resolved)):
xs_resolved.append(n)
ys_resolved.append(numpy.mean(n_atlas_to_acc_resolved[n]))
yerr_resolved.append(numpy.std(n_atlas_to_acc_resolved[n]))
xs_resolved = numpy.array(xs_resolved)
ys_resolved = numpy.array(ys_resolved)
yerr_resolved = numpy.array(yerr_resolved)
ylow_resolved = ys_resolved - yerr_resolved
yhigh_resolved = ys_resolved + yerr_resolved
plt.plot(xs_compact, ys_compact, alpha=1, color='green', label='compact', marker='x')
plt.fill_between(xs_compact, ylow_compact, yhigh_compact, alpha=.2, color='green')
plt.plot(xs_resolved, ys_resolved, alpha=1, color='blue', label='resolved', marker='x')
plt.fill_between(xs_resolved, ylow_resolved, yhigh_resolved, alpha=.2, color='blue')
plt.title('Accuracy against number of objects in training set')
plt.xlabel('Number of radio objects')
plt.ylabel('Balanced accuracy (%)')
plt.xscale('log')
plt.legend()
plt.savefig('/Users/alger/repos/crowdastro-projects/ATLAS-CDFS/passive.pdf')
In [ ]: