In [15]:
# Some setup.
import h5py, numpy, matplotlib.pyplot as plt, astropy.io.ascii as asc, scipy.spatial, sklearn.model_selection
import crowdastro.crowd.util, sklearn.linear_model, sklearn.ensemble, astropy.table, collections
%matplotlib inline
In [64]:
with h5py.File('/Users/alger/data/Crowdastro/swire_11_05_17.h5', 'r') as f:
swire_features = f['features'].value
with h5py.File('/Users/alger/data/Crowdastro/crowdastro-swire.h5', 'r') as f:
swire_coords = f['/swire/cdfs/numeric'][:, :2]
swire_names = [i.decode('ascii') for i in f['/swire/cdfs/string'].value]
table = asc.read('/Users/alger/data/Crowdastro/one-table-to-rule-them-all.tbl')
swire_tree = scipy.spatial.KDTree(swire_coords)
labels = asc.read('/Users/alger/data/SWIRE/all_labels.csv')
atlas_coords = numpy.array([(r['Component RA (Franzen)'], r['Component DEC (Franzen)']) for r in table
if r['Component RA (Franzen)']])
rgz_catalogue = asc.read('/Users/alger/data/ATLAS/static_rgz_host_full.csv') # Overrides all_labels for RGZ.
We are looking for five different training sets for each subset of the ATLAS-CDFS data:
First, let's have a function that takes a set of ATLAS objects and returns five sets of SWIRE objects.
In [32]:
def atlas_to_swire(atlas: list, radius: float=1 / 60) -> list:
# atlas is a list of table Keys.
# Look up the coordinates of the ATLAS objects.
atlas = set(atlas)
ras = [r['Component RA (Franzen)'] for r in table if r['Key'] in atlas]
decs = [r['Component DEC (Franzen)'] for r in table if r['Key'] in atlas]
coords = numpy.vstack([ras, decs]).T
nearby = sorted({int(i)
for i in numpy.concatenate(swire_tree.query_ball_point(coords, radius))})
return nearby
Next, we'll split ATLAS into three (overlapping) subsets:
From these we can compute all subsets we want to train on.
In [40]:
def compact_test(r):
if not r['Component S (Franzen)']: # Why does this happen?
return True
R = numpy.log(r['Component S (Franzen)'] / r['Component Sp (Franzen)'])
R_err = numpy.sqrt((r['Component S_ERR (Franzen)'] / r['Component S (Franzen)']) ** 2 +
(r['Component Sp_ERR (Franzen)'] / r['Component Sp (Franzen)']) ** 2)
return R < 2 * R_err
In [41]:
rgz = {r['Key'] for r in table if r['Component Zooniverse ID (RGZ)'] and
r['Component ID (Franzen)'] == r['Primary Component ID (RGZ)'] and
r['Component ID (Franzen)']}
norris = {r['Key'] for r in table if r['Component # (Norris)'] and r['Component ID (Franzen)']}
compact = {r['Key'] for r in table if r['Component ID (Franzen)'] and
compact_test(r)}
We can now compute the training sets. We will split CDFS in four. Let's start by finding the min/max RA/dec to get our dividing lines.
In [42]:
middle = (numpy.median(atlas_coords[:, 0]), numpy.median(atlas_coords[:, 1]))
middle = (52.8, -28.1)
In [43]:
subsets = [
('RGZ & Norris & compact', rgz & norris & compact),
('RGZ & Norris & resolved', rgz & norris - compact),
('RGZ & Norris', rgz & norris),
('RGZ & compact', rgz & compact),
('RGZ & resolved', rgz - compact),
('RGZ', rgz),
]
In [44]:
training_testing_atlas_sets = {s:[] for s, _ in subsets} # Maps subset string -> [(train, test)]
def filter_subset(subset: set, q: int) -> set:
"""Filters subset to just include indices of ATLAS objects in a given quadrant."""
subset_ = set()
for s in subset:
row = table[table['Key'] == s][0]
coords = row['Component RA (Franzen)'], row['Component DEC (Franzen)']
if (
(q == 0 and coords[0] >= middle[0] and coords[1] >= middle[1]) or
(q == 1 and coords[0] < middle[0] and coords[1] >= middle[1]) or
(q == 2 and coords[0] < middle[0] and coords[1] < middle[1]) or
(q == 3 and coords[0] >= middle[0] and coords[1] < middle[1])):
subset_.add(s)
return subset_
for subset_str, subset_set in subsets:
for q in range(4): # Quadrants.
test = filter_subset(subset_set, q)
train = {i for i in subset_set if i not in test}
print(subset_str, len(train), len(test))
training_testing_atlas_sets[subset_str].append((train, test))
These can be converted into SWIRE sets.
In [45]:
training_testing_swire_sets = {s:[] for s, _ in subsets} # Maps subset string -> [(train, test)]
for subset_str, subset_set in subsets:
for train, test in training_testing_atlas_sets[subset_str]:
train = atlas_to_swire(train)
test = atlas_to_swire(test)
print(subset_str, len(set(train) & set(test)), 'out of', len(set(test)), 'overlap')
train = sorted(set(train) - set(test))
training_testing_swire_sets[subset_str].append((train, test))
In [46]:
swire_name_to_rgz_label = {}
swire_name_to_norris_label = {}
for row in labels:
swire_name_to_norris_label[row['swire']] = bool(row['norris_label']) and row['norris_label'] == 'True'
# swire_name_to_rgz_label[row['swire']] = bool(row['rgz_label']) and row['rgz_label'] == 'True'
for name in swire_names:
swire_name_to_rgz_label[name] = False
for row in rgz_catalogue:
swire_name_to_rgz_label[row['SWIRE.designation']] = True
In [47]:
def test_on_sets(training_testing_swire_sets, name_to_label, Classifier):
for subset_str, splits in training_testing_swire_sets.items():
if 'Norris' not in subset_str:
continue
rgz_accuracies = []
norris_accuracies = []
predictions = []
for train, test in splits:
train_features = swire_features[train]
train_labels = [name_to_label[swire_names[i]] for i in train]
test_features = swire_features[test]
lr = Classifier()
lr.fit(train_features, train_labels)
preds = lr.predict(test_features)
predictions.append(dict(zip([swire_names[i] for i in test], lr.predict_proba(test_features))))
if 'Norris' in subset_str:
test_labels_norris = [swire_name_to_norris_label[swire_names[i]] for i in test]
norris_accuracies.append(
crowdastro.crowd.util.balanced_accuracy(test_labels_norris, preds))
test_labels_rgz = [swire_name_to_rgz_label[swire_names[i]] for i in test]
rgz_accuracies.append(
crowdastro.crowd.util.balanced_accuracy(test_labels_rgz, preds))
yield subset_str, (rgz_accuracies, norris_accuracies), predictions
In [48]:
norris_lr_results = list(test_on_sets(
training_testing_swire_sets, swire_name_to_norris_label,
lambda: sklearn.linear_model.LogisticRegression(class_weight='balanced', penalty='l2', C=1e10)))
for subset, (rgz_acc, norris_acc), _ in norris_lr_results:
print('{} on RGZ: ({:.02f} +- {:.02f})%'.format(
subset, numpy.mean(rgz_acc) * 100, numpy.std(rgz_acc) * 100))
print('{} on Norris: ({:.02f} +- {:.02f})%'.format(
subset, numpy.mean(norris_acc) * 100, numpy.std(norris_acc) * 100))
In [55]:
norris_rf_results = list(test_on_sets(
training_testing_swire_sets, swire_name_to_norris_label,
lambda: sklearn.ensemble.RandomForestClassifier(class_weight='balanced',
criterion='entropy',
min_samples_leaf=45)))
for subset, (rgz_acc, norris_acc), _ in norris_rf_results:
print('{} on RGZ: ({:.02f} +- {:.02f})%'.format(
subset, numpy.mean(rgz_acc) * 100, numpy.std(rgz_acc) * 100))
print('{} on Norris: ({:.02f} +- {:.02f})%'.format(
subset, numpy.mean(norris_acc) * 100, numpy.std(norris_acc) * 100))
In [50]:
rgz_lr_results = list(test_on_sets(
training_testing_swire_sets, swire_name_to_rgz_label,
lambda: sklearn.linear_model.LogisticRegression(class_weight='balanced', penalty='l2', C=1e10)))
for subset, (rgz_acc, norris_acc), _ in rgz_lr_results:
print('{} on RGZ: ({:.02f} +- {:.02f})%'.format(
subset, numpy.mean(rgz_acc) * 100, numpy.std(rgz_acc) * 100))
print('{} on Norris: ({:.02f} +- {:.02f})%'.format(
subset, numpy.mean(norris_acc) * 100, numpy.std(norris_acc) * 100))
In [54]:
rgz_rf_results = list(test_on_sets(
training_testing_swire_sets, swire_name_to_rgz_label,
lambda: sklearn.ensemble.RandomForestClassifier(class_weight='balanced',
criterion='entropy',
min_samples_leaf=45)))
for subset, (rgz_acc, norris_acc), _ in rgz_rf_results:
print('{} on RGZ: ({:.02f} +- {:.02f})%'.format(
subset, numpy.mean(rgz_acc) * 100, numpy.std(rgz_acc) * 100))
print('{} on Norris: ({:.02f} +- {:.02f})%'.format(
subset, numpy.mean(norris_acc) * 100, numpy.std(norris_acc) * 100))
In [56]:
plt.figure(figsize=(10, 4))
for i, (subset_str, _) in enumerate(subsets[:3]):
plt.subplot(1, 3, i + 1)
plt.title(subset_str.replace('&', '$\\cap$'))
plt.xticks([0, 1, 2, 3], ['LR(RGZ)', 'LR(Norris)', 'RF(RGZ)', 'RF(Norris)'], rotation='vertical')
plt.grid(axis='y', which='major', color='lightgrey', linestyle='-')
plt.ylim((50, 100))
plt.xlim((-1, 4))
if i == 1:
plt.xlabel('Classifier(Training set)', labelpad=20)
if i == 0:
plt.ylabel('Balanced accuracy (%)')
lr_rgz_acc, lr_norris_acc = [numpy.array(res) * 100 for sstr, res, _ in rgz_lr_results if sstr == subset_str][0]
rf_rgz_acc, rf_norris_acc = [numpy.array(res) * 100 for sstr, res, _ in rgz_rf_results if sstr == subset_str][0]
xs = [i for j in range(4) for i in [j] * 4]
ys = list(lr_rgz_acc) + list(lr_norris_acc) + list(rf_rgz_acc) + list(rf_norris_acc)
plt.scatter(xs, ys, marker='x')
plt.subplots_adjust(wspace=0.4, bottom=0.3)
plt.savefig('/Users/alger/Documents/writing/atlas-ml-ba.pdf')
plt.show()
We now want to generate a table of predictions. We will first generate a table of SWIRE-based predictions. Each SWIRE object will be assigned a label from each of two classifiers:
These are trained on all data points in the training set, i.e., not splitting on compact/resolved. We report the labels from when the given SWIRE object was in a testing quadrant.
In [57]:
swire_name_to_rgz_lr_pred = collections.defaultdict(list)
swire_name_to_norris_lr_pred = collections.defaultdict(list)
for subset_str, _, preds in rgz_lr_results:
for preds_ in preds:
for name, pred in preds_.items():
swire_name_to_rgz_lr_pred[name].append(pred)
for subset_str, _, preds in norris_lr_results:
for preds_ in preds:
for name, pred in preds_.items():
swire_name_to_norris_lr_pred[name].append(pred)
In [58]:
import astropy.table
names_ = []
rgz_preds_ = []
norris_preds_ = []
for name in swire_names:
if name not in swire_name_to_rgz_lr_pred:
continue
names_.append(name)
rgz_ = numpy.mean([i[1] for i in swire_name_to_rgz_lr_pred[name]])
norris_ = numpy.mean([i[1] for i in swire_name_to_norris_lr_pred[name]])
rgz_preds_.append(rgz_)
norris_preds_.append(norris_)
predicted_swire_table = astropy.table.Table(data=[names_, rgz_preds_, norris_preds_],
names=['swire', 'lr(rgz)', 'lr(norris)'])
predicted_swire_table.write('/Users/alger/data/Crowdastro/predicted_swire_table_11_05_17.csv', format='csv')
predicted_swire_table.write('/Users/alger/repos/crowdastro-projects/ATLAS-CDFS/predicted_swire_table_11_05_17.tex',
format='latex')
predicted_swire_table
Out[58]:
In [59]:
# Map Zooniverse ID -> [SWIRE from RGZ]
zid_to_rgz_consensus_swires = collections.defaultdict(list)
zid_to_rgz_consensus_radio = {}
zid_to_rgz_consensus_ir = collections.defaultdict(list)
for row in rgz_catalogue:
zid_to_rgz_consensus_swires[row['zooniverse_id']].append(row['SWIRE.designation'])
zid_to_rgz_consensus_ir[row['zooniverse_id']].append(row['consensus.ir_level'])
zid_to_rgz_consensus_radio[row['zooniverse_id']] = row['consensus.radio_level']
In [60]:
zids_ = []
ras_ = []
decs_ = []
rgz_names_ = []
norris_names_ = []
rgz_consensuses_radio_level_ = []
rgz_consensuses_ir_level_ = []
rgz_consensuses_ = []
for row in table:
if row['Key'] in rgz & norris:
zid = row['Component Zooniverse ID (RGZ)']
ra = row['Component RA (Franzen)']
dec = row['Component DEC (Franzen)']
nearby = swire_tree.query_ball_point([ra, dec], 1 / 60)
names = [swire_names[i] for i in nearby]
if not names:
print('{} not found'.format(zid))
continue
norris_probs = [numpy.mean([i[1] for i in swire_name_to_norris_lr_pred[name]]) for name in names]
norris_name = names[numpy.argmax(norris_probs)]
rgz_probs = [numpy.mean([i[1] for i in swire_name_to_rgz_lr_pred[name]]) for name in names]
rgz_name = names[numpy.argmax(rgz_probs)]
zids_.append(zid)
ras_.append(ra)
decs_.append(dec)
rgz_names_.append(rgz_name)
norris_names_.append(norris_name)
rgz_consensuses_.append(','.join(i for i in zid_to_rgz_consensus_swires.get(zid, [])))
rgz_consensuses_radio_level_.append(zid_to_rgz_consensus_radio.get(zid))
rgz_consensuses_ir_level_.append(','.join(str(i) for i in zid_to_rgz_consensus_ir.get(zid, [])))
cross_id_table = astropy.table.Table(data=[zids_, ras_, decs_, rgz_names_, norris_names_,
rgz_consensuses_, rgz_consensuses_radio_level_, rgz_consensuses_ir_level_],
names=['zooniverse_id', 'ra', 'dec', 'lr(rgz)_swire', 'lr(norris)_swire',
'rgz_swire', 'rgz_consensus_radio_level', 'rgz_consensus_ir_level'])
cross_id_table.write('/Users/alger/data/Crowdastro/predicted_cross_ids_table_11_05_17.csv', format='csv')
cross_id_table.write('/Users/alger/repos/crowdastro-projects/ATLAS-CDFS/predicted_cross_ids_table_11_05_17.tex', format='latex')
cross_id_table
Out[60]:
In [61]:
import pickle
with open('/Users/alger/data/Crowdastro/sets_atlas_11_05_17.pkl', 'wb') as f:
pickle.dump(training_testing_atlas_sets, f)
with open('/Users/alger/data/Crowdastro/sets_swire_11_05_17.pkl', 'wb') as f:
pickle.dump(training_testing_swire_sets, f)
In [62]:
def export(training_testing_swire_sets, filename):
with h5py.File(filename, 'w') as f:
# Export features.
f.create_dataset('features', data=swire_features)
# Export names.
assert all(len(s) <= 26 for s in swire_names)
f.create_dataset('names', data=[i.encode('ascii') for i in swire_names], dtype='<S26')
# Export labels.
f.create_dataset('norris_labels', data=numpy.array([
swire_name_to_norris_label[n] for n in swire_names], dtype=bool))
f.create_dataset('rgz_labels', data=numpy.array([
swire_name_to_rgz_label[n] for n in swire_names], dtype=bool))
# Export train/test sets.
sets = f.create_group('sets')
for subset_str, splits in training_testing_swire_sets.items():
subset = sets.create_group(subset_str)
train_bool = numpy.zeros((len(splits), len(swire_names)), dtype=bool)
test_bool = numpy.zeros((len(splits), len(swire_names)), dtype=bool)
for i, (train, test) in enumerate(splits):
train_bool[i, train] = True
test_bool[i, test] = True
subset.create_dataset('train', data=train_bool)
subset.create_dataset('test', data=test_bool)
In [63]:
export(training_testing_swire_sets, '/Users/alger/data/Crowdastro/all_training_data_11_05_17.h5')
In [68]:
swire_features.shape[1] - 1024
Out[68]:
In [ ]: