In [15]:

    
# Some setup.
import h5py, numpy, matplotlib.pyplot as plt, astropy.io.ascii as asc, scipy.spatial, sklearn.model_selection
import crowdastro.crowd.util, sklearn.linear_model, sklearn.ensemble, astropy.table, collections
%matplotlib inline



In [64]:

    
with h5py.File('/Users/alger/data/Crowdastro/swire_11_05_17.h5', 'r') as f:
    swire_features = f['features'].value
with h5py.File('/Users/alger/data/Crowdastro/crowdastro-swire.h5', 'r') as f:
    swire_coords = f['/swire/cdfs/numeric'][:, :2]
    swire_names = [i.decode('ascii') for i in f['/swire/cdfs/string'].value]
table = asc.read('/Users/alger/data/Crowdastro/one-table-to-rule-them-all.tbl')
swire_tree = scipy.spatial.KDTree(swire_coords)
labels = asc.read('/Users/alger/data/SWIRE/all_labels.csv')
atlas_coords = numpy.array([(r['Component RA (Franzen)'], r['Component DEC (Franzen)']) for r in table
                            if r['Component RA (Franzen)']])
rgz_catalogue = asc.read('/Users/alger/data/ATLAS/static_rgz_host_full.csv')  # Overrides all_labels for RGZ.

Generate training sets

We are looking for five different training sets for each subset of the ATLAS-CDFS data:

RGZ & Norris & compact (the "clean" set)
RGZ & Norris & resolved
RGZ & Norris
RGZ compact
RGZ resolved
RGZ

First, let's have a function that takes a set of ATLAS objects and returns five sets of SWIRE objects.



In [32]:

    
def atlas_to_swire(atlas: list, radius: float=1 / 60) -> list:
    # atlas is a list of table Keys.
    # Look up the coordinates of the ATLAS objects.
    atlas = set(atlas)
    ras = [r['Component RA (Franzen)'] for r in table if r['Key'] in atlas]
    decs = [r['Component DEC (Franzen)'] for r in table if r['Key'] in atlas]
    coords = numpy.vstack([ras, decs]).T
    nearby = sorted({int(i)
                     for i in numpy.concatenate(swire_tree.query_ball_point(coords, radius))})
    return nearby

Next, we'll split ATLAS into three (overlapping) subsets:

RGZ
Norris
Compact

From these we can compute all subsets we want to train on.



In [40]:

    
def compact_test(r):
    if not r['Component S (Franzen)']:  # Why does this happen?
        return True

    R = numpy.log(r['Component S (Franzen)'] / r['Component Sp (Franzen)'])
    R_err = numpy.sqrt((r['Component S_ERR (Franzen)'] / r['Component S (Franzen)']) ** 2 +
                       (r['Component Sp_ERR (Franzen)'] / r['Component Sp (Franzen)']) ** 2)
    return R < 2 * R_err



In [41]:

    
rgz = {r['Key'] for r in table if r['Component Zooniverse ID (RGZ)'] and
                                         r['Component ID (Franzen)'] == r['Primary Component ID (RGZ)'] and
                                  r['Component ID (Franzen)']}
norris = {r['Key'] for r in table if r['Component # (Norris)'] and r['Component ID (Franzen)']}
compact = {r['Key'] for r in table if r['Component ID (Franzen)'] and
                                        compact_test(r)}

We can now compute the training sets. We will split CDFS in four. Let's start by finding the min/max RA/dec to get our dividing lines.



In [42]:

    
middle = (numpy.median(atlas_coords[:, 0]), numpy.median(atlas_coords[:, 1]))
middle = (52.8, -28.1)



In [43]:

    
subsets = [
    ('RGZ & Norris & compact', rgz & norris & compact),
    ('RGZ & Norris & resolved', rgz & norris - compact),
    ('RGZ & Norris', rgz & norris),
    ('RGZ & compact', rgz & compact),
    ('RGZ & resolved', rgz - compact),
    ('RGZ', rgz),
]



In [44]:

    
training_testing_atlas_sets = {s:[] for s, _ in subsets}  # Maps subset string -> [(train, test)]

def filter_subset(subset: set, q: int) -> set:
    """Filters subset to just include indices of ATLAS objects in a given quadrant."""
    subset_ = set()
    for s in subset:
        row = table[table['Key'] == s][0]
        coords = row['Component RA (Franzen)'], row['Component DEC (Franzen)']

        if (
                (q == 0 and coords[0] >= middle[0] and coords[1] >= middle[1]) or
                (q == 1 and coords[0] < middle[0] and coords[1] >= middle[1]) or
                (q == 2 and coords[0] < middle[0] and coords[1] < middle[1]) or
                (q == 3 and coords[0] >= middle[0] and coords[1] < middle[1])):
            subset_.add(s)
    return subset_

for subset_str, subset_set in subsets:
    for q in range(4):  # Quadrants.
        test = filter_subset(subset_set, q)
        train = {i for i in subset_set if i not in test}
        print(subset_str, len(train), len(test))
        training_testing_atlas_sets[subset_str].append((train, test))









    



RGZ & Norris & compact 298 151
RGZ & Norris & compact 334 115
RGZ & Norris & compact 378 71
RGZ & Norris & compact 337 112
RGZ & Norris & resolved 69 27
RGZ & Norris & resolved 73 23
RGZ & Norris & resolved 71 25
RGZ & Norris & resolved 75 21
RGZ & Norris 367 178
RGZ & Norris 407 138
RGZ & Norris 449 96
RGZ & Norris 412 133
RGZ & compact 1845 410
RGZ & compact 1596 659
RGZ & compact 1700 555
RGZ & compact 1624 631
RGZ & resolved 162 43
RGZ & resolved 151 54
RGZ & resolved 148 57
RGZ & resolved 154 51
RGZ 2007 453
RGZ 1747 713
RGZ 1848 612
RGZ 1778 682

These can be converted into SWIRE sets.



In [45]:

    
training_testing_swire_sets = {s:[] for s, _ in subsets}  # Maps subset string -> [(train, test)]

for subset_str, subset_set in subsets:
    for train, test in training_testing_atlas_sets[subset_str]:
        train = atlas_to_swire(train)
        test = atlas_to_swire(test)
        print(subset_str, len(set(train) & set(test)), 'out of', len(set(test)), 'overlap')
        train = sorted(set(train) - set(test))
        training_testing_swire_sets[subset_str].append((train, test))









    



RGZ & Norris & compact 0 out of 3285 overlap
RGZ & Norris & compact 0 out of 2602 overlap
RGZ & Norris & compact 15 out of 1806 overlap
RGZ & Norris & compact 15 out of 2561 overlap
RGZ & Norris & resolved 0 out of 726 overlap
RGZ & Norris & resolved 0 out of 582 overlap
RGZ & Norris & resolved 0 out of 729 overlap
RGZ & Norris & resolved 0 out of 592 overlap
RGZ & Norris 0 out of 3880 overlap
RGZ & Norris 8 out of 3110 overlap
RGZ & Norris 23 out of 2435 overlap
RGZ & Norris 15 out of 3107 overlap
RGZ & compact 36 out of 7543 overlap
RGZ & compact 87 out of 11188 overlap
RGZ & compact 129 out of 9351 overlap
RGZ & compact 78 out of 11260 overlap
RGZ & resolved 0 out of 1128 overlap
RGZ & resolved 0 out of 1269 overlap
RGZ & resolved 0 out of 1478 overlap
RGZ & resolved 0 out of 1316 overlap
RGZ 65 out of 8317 overlap
RGZ 143 out of 11913 overlap
RGZ 188 out of 10299 overlap
RGZ 110 out of 11976 overlap

Extracting labels



In [46]:

    
swire_name_to_rgz_label = {}
swire_name_to_norris_label = {}
for row in labels:
    swire_name_to_norris_label[row['swire']] = bool(row['norris_label']) and row['norris_label'] == 'True'
#     swire_name_to_rgz_label[row['swire']] = bool(row['rgz_label']) and row['rgz_label'] == 'True'
for name in swire_names:
    swire_name_to_rgz_label[name] = False
for row in rgz_catalogue:
    swire_name_to_rgz_label[row['SWIRE.designation']] = True

Training LR with Norris labels

We will now train logistic regression on the Norris label set. We will test on both RGZ and Norris, since there are objects that RGZ observes that Norris does not.



In [47]:

    
def test_on_sets(training_testing_swire_sets, name_to_label, Classifier):
    for subset_str, splits in training_testing_swire_sets.items():
        if 'Norris' not in subset_str:
            continue

        rgz_accuracies = []
        norris_accuracies = []
        predictions = []
        for train, test in splits:
            train_features = swire_features[train]
            train_labels = [name_to_label[swire_names[i]] for i in train]

            test_features = swire_features[test]

            lr = Classifier()
            lr.fit(train_features, train_labels)
            
            preds = lr.predict(test_features)
            predictions.append(dict(zip([swire_names[i] for i in test], lr.predict_proba(test_features))))

            if 'Norris' in subset_str:
                test_labels_norris = [swire_name_to_norris_label[swire_names[i]] for i in test]
                norris_accuracies.append(
                    crowdastro.crowd.util.balanced_accuracy(test_labels_norris, preds))
            test_labels_rgz = [swire_name_to_rgz_label[swire_names[i]] for i in test]
            rgz_accuracies.append(
                crowdastro.crowd.util.balanced_accuracy(test_labels_rgz, preds))
        yield subset_str, (rgz_accuracies, norris_accuracies), predictions



In [48]:

    
norris_lr_results = list(test_on_sets(
        training_testing_swire_sets, swire_name_to_norris_label,
        lambda: sklearn.linear_model.LogisticRegression(class_weight='balanced', penalty='l2', C=1e10)))
for subset, (rgz_acc, norris_acc), _ in norris_lr_results:
    print('{} on RGZ: ({:.02f} +- {:.02f})%'.format(
        subset, numpy.mean(rgz_acc) * 100, numpy.std(rgz_acc) * 100))
    print('{} on Norris: ({:.02f} +- {:.02f})%'.format(
        subset, numpy.mean(norris_acc) * 100, numpy.std(norris_acc) * 100))









    



/usr/local/lib/python3.6/site-packages/sklearn/linear_model/base.py:352: RuntimeWarning: overflow encountered in exp
  np.exp(prob, prob)






    



RGZ & Norris & compact on RGZ: (83.95 +- 2.29)%
RGZ & Norris & compact on Norris: (96.39 +- 1.22)%
RGZ & Norris & resolved on RGZ: (80.95 +- 4.80)%
RGZ & Norris & resolved on Norris: (89.59 +- 2.73)%
RGZ & Norris on RGZ: (84.24 +- 1.02)%
RGZ & Norris on Norris: (94.62 +- 1.41)%

Training RF with Norris labels



In [55]:

    
norris_rf_results = list(test_on_sets(
        training_testing_swire_sets, swire_name_to_norris_label,
        lambda: sklearn.ensemble.RandomForestClassifier(class_weight='balanced',
                                                        criterion='entropy',
                                                        min_samples_leaf=45)))

for subset, (rgz_acc, norris_acc), _ in norris_rf_results:
    print('{} on RGZ: ({:.02f} +- {:.02f})%'.format(
        subset, numpy.mean(rgz_acc) * 100, numpy.std(rgz_acc) * 100))
    print('{} on Norris: ({:.02f} +- {:.02f})%'.format(
        subset, numpy.mean(norris_acc) * 100, numpy.std(norris_acc) * 100))









    



RGZ & Norris & compact on RGZ: (79.97 +- 0.21)%
RGZ & Norris & compact on Norris: (97.08 +- 0.90)%
RGZ & Norris & resolved on RGZ: (77.41 +- 4.14)%
RGZ & Norris & resolved on Norris: (94.89 +- 2.00)%
RGZ & Norris on RGZ: (78.43 +- 1.85)%
RGZ & Norris on Norris: (95.77 +- 2.39)%

Training LR with RGZ labels



In [50]:

    
rgz_lr_results = list(test_on_sets(
        training_testing_swire_sets, swire_name_to_rgz_label,
        lambda: sklearn.linear_model.LogisticRegression(class_weight='balanced', penalty='l2', C=1e10)))
for subset, (rgz_acc, norris_acc), _ in rgz_lr_results:
    print('{} on RGZ: ({:.02f} +- {:.02f})%'.format(
        subset, numpy.mean(rgz_acc) * 100, numpy.std(rgz_acc) * 100))
    print('{} on Norris: ({:.02f} +- {:.02f})%'.format(
        subset, numpy.mean(norris_acc) * 100, numpy.std(norris_acc) * 100))









    



RGZ & Norris & compact on RGZ: (86.28 +- 1.70)%
RGZ & Norris & compact on Norris: (94.22 +- 0.59)%
RGZ & Norris & resolved on RGZ: (80.78 +- 3.95)%
RGZ & Norris & resolved on Norris: (85.63 +- 4.33)%
RGZ & Norris on RGZ: (85.97 +- 0.55)%
RGZ & Norris on Norris: (92.52 +- 1.15)%

Training RF with RGZ labels



In [54]:

    
rgz_rf_results = list(test_on_sets(
        training_testing_swire_sets, swire_name_to_rgz_label,
        lambda: sklearn.ensemble.RandomForestClassifier(class_weight='balanced',
                                                        criterion='entropy',
                                                        min_samples_leaf=45)))
for subset, (rgz_acc, norris_acc), _ in rgz_rf_results:
    print('{} on RGZ: ({:.02f} +- {:.02f})%'.format(
        subset, numpy.mean(rgz_acc) * 100, numpy.std(rgz_acc) * 100))
    print('{} on Norris: ({:.02f} +- {:.02f})%'.format(
        subset, numpy.mean(norris_acc) * 100, numpy.std(norris_acc) * 100))









    



RGZ & Norris & compact on RGZ: (92.39 +- 0.67)%
RGZ & Norris & compact on Norris: (94.94 +- 0.37)%
RGZ & Norris & resolved on RGZ: (88.16 +- 2.81)%
RGZ & Norris & resolved on Norris: (92.06 +- 0.38)%
RGZ & Norris on RGZ: (92.36 +- 0.74)%
RGZ & Norris on Norris: (94.69 +- 0.57)%



In [56]:

    
plt.figure(figsize=(10, 4))
for i, (subset_str, _) in enumerate(subsets[:3]):
    plt.subplot(1, 3, i + 1)
    plt.title(subset_str.replace('&', '$\\cap$'))
    plt.xticks([0, 1, 2, 3], ['LR(RGZ)', 'LR(Norris)', 'RF(RGZ)', 'RF(Norris)'], rotation='vertical')
    plt.grid(axis='y', which='major', color='lightgrey', linestyle='-')
    plt.ylim((50, 100))
    plt.xlim((-1, 4))
    
    if i == 1:
        plt.xlabel('Classifier(Training set)', labelpad=20)
    if i == 0:
        plt.ylabel('Balanced accuracy (%)')

    lr_rgz_acc, lr_norris_acc = [numpy.array(res) * 100 for sstr, res, _ in rgz_lr_results if sstr == subset_str][0]
    rf_rgz_acc, rf_norris_acc = [numpy.array(res) * 100 for sstr, res, _ in rgz_rf_results if sstr == subset_str][0]
    xs = [i for j in range(4) for i in [j] * 4]
    ys = list(lr_rgz_acc) + list(lr_norris_acc) + list(rf_rgz_acc) + list(rf_norris_acc)
    plt.scatter(xs, ys, marker='x')
plt.subplots_adjust(wspace=0.4, bottom=0.3)
plt.savefig('/Users/alger/Documents/writing/atlas-ml-ba.pdf')
plt.show()

Table of predictions

We now want to generate a table of predictions. We will first generate a table of SWIRE-based predictions. Each SWIRE object will be assigned a label from each of two classifiers:

LR(Norris)
LR(RGZ)

These are trained on all data points in the training set, i.e., not splitting on compact/resolved. We report the labels from when the given SWIRE object was in a testing quadrant.



In [57]:

    
swire_name_to_rgz_lr_pred = collections.defaultdict(list)
swire_name_to_norris_lr_pred = collections.defaultdict(list)

for subset_str, _, preds in rgz_lr_results:
    for preds_ in preds:
        for name, pred in preds_.items():
            swire_name_to_rgz_lr_pred[name].append(pred)

for subset_str, _, preds in norris_lr_results:
    for preds_ in preds:
        for name, pred in preds_.items():
            swire_name_to_norris_lr_pred[name].append(pred)



In [58]:

    
import astropy.table
names_ = []
rgz_preds_ = []
norris_preds_ = []
for name in swire_names:
    if name not in swire_name_to_rgz_lr_pred:
        continue

    names_.append(name)
    rgz_ = numpy.mean([i[1] for i in swire_name_to_rgz_lr_pred[name]])
    norris_ = numpy.mean([i[1] for i in swire_name_to_norris_lr_pred[name]])
    rgz_preds_.append(rgz_)
    norris_preds_.append(norris_)
predicted_swire_table = astropy.table.Table(data=[names_, rgz_preds_, norris_preds_],
                                            names=['swire', 'lr(rgz)', 'lr(norris)'])
predicted_swire_table.write('/Users/alger/data/Crowdastro/predicted_swire_table_11_05_17.csv', format='csv')
predicted_swire_table.write('/Users/alger/repos/crowdastro-projects/ATLAS-CDFS/predicted_swire_table_11_05_17.tex',
                            format='latex')
predicted_swire_table









    Out[58]:




<Table length=12509>

swire lr(rgz) lr(norris)
str26 float64 float64
SWIRE3_J032559.15-284724.2 0.0103425963105 0.0140876973827
SWIRE3_J032559.91-284728.9 0.207774469321 0.0197342707073
SWIRE3_J032600.02-284736.9 0.147822500615 0.0116599079929
SWIRE3_J032600.13-284637.5 0.0609663955718 0.021059589621
SWIRE3_J032600.13-284715.7 0.172903856593 0.0608948984168
SWIRE3_J032600.98-284705.4 0.0556761387892 0.102366961283
SWIRE3_J032601.03-284711.6 0.375365352403 0.168507910493
SWIRE3_J032601.75-284614.5 0.276312390974 0.0449587530623
SWIRE3_J032602.08-284713.1 0.651484553746 0.425380638918
SWIRE3_J032602.36-284711.5 0.499084796642 0.550609670906
... ... ...
SWIRE3_J033555.99-272756.9 0.34616094925 0.00289945384979
SWIRE3_J033556.19-280911.1 0.0368207688138 0.000416389336006
SWIRE3_J033556.25-272734.7 0.0492726967733 0.000566468062323
SWIRE3_J033556.36-272701.2 0.00800182412464 0.000117980019698
SWIRE3_J033556.36-272701.3 0.00663881903206 0.000102254811687
SWIRE3_J033556.51-280917.5 0.130037358638 0.000755857118439
SWIRE3_J033556.63-272759.9 0.0144135431882 0.000263225031745
SWIRE3_J033556.71-272753.0 0.108609448814 0.00081326735078
SWIRE3_J033556.85-272804.3 0.0344172279338 0.000380044647789
SWIRE3_J033557.13-280957.4 0.258034530119 0.00708681070039

Table of cross-identifications

Finally, we want to map these predictions back to the original task. Given an ATLAS object, get all SWIRE objects within 1' of that object. The highest probability host is then named as the host galaxy.



In [59]:

    
# Map Zooniverse ID -> [SWIRE from RGZ]
zid_to_rgz_consensus_swires = collections.defaultdict(list)
zid_to_rgz_consensus_radio = {}
zid_to_rgz_consensus_ir = collections.defaultdict(list)
for row in rgz_catalogue:
    zid_to_rgz_consensus_swires[row['zooniverse_id']].append(row['SWIRE.designation'])
    zid_to_rgz_consensus_ir[row['zooniverse_id']].append(row['consensus.ir_level'])
    zid_to_rgz_consensus_radio[row['zooniverse_id']] = row['consensus.radio_level']



In [60]:

    
zids_ = []
ras_ = []
decs_ = []
rgz_names_ = []
norris_names_ = []
rgz_consensuses_radio_level_ = []
rgz_consensuses_ir_level_ = []
rgz_consensuses_ = []
for row in table:
    if row['Key'] in rgz & norris:
        zid = row['Component Zooniverse ID (RGZ)']
        ra = row['Component RA (Franzen)']
        dec = row['Component DEC (Franzen)']
        nearby = swire_tree.query_ball_point([ra, dec], 1 / 60)
        names = [swire_names[i] for i in nearby]
        if not names:
            print('{} not found'.format(zid))
            continue
        norris_probs = [numpy.mean([i[1] for i in swire_name_to_norris_lr_pred[name]]) for name in names]
        norris_name = names[numpy.argmax(norris_probs)]
        rgz_probs = [numpy.mean([i[1] for i in swire_name_to_rgz_lr_pred[name]]) for name in names]
        rgz_name = names[numpy.argmax(rgz_probs)]
        zids_.append(zid)
        ras_.append(ra)
        decs_.append(dec)
        rgz_names_.append(rgz_name)
        norris_names_.append(norris_name)
        rgz_consensuses_.append(','.join(i for i in zid_to_rgz_consensus_swires.get(zid, [])))
        rgz_consensuses_radio_level_.append(zid_to_rgz_consensus_radio.get(zid))
        rgz_consensuses_ir_level_.append(','.join(str(i) for i in zid_to_rgz_consensus_ir.get(zid, [])))
cross_id_table = astropy.table.Table(data=[zids_, ras_, decs_, rgz_names_, norris_names_,
                                           rgz_consensuses_, rgz_consensuses_radio_level_, rgz_consensuses_ir_level_],
                                     names=['zooniverse_id', 'ra', 'dec', 'lr(rgz)_swire', 'lr(norris)_swire',
                                            'rgz_swire', 'rgz_consensus_radio_level', 'rgz_consensus_ir_level'])
cross_id_table.write('/Users/alger/data/Crowdastro/predicted_cross_ids_table_11_05_17.csv', format='csv')
cross_id_table.write('/Users/alger/repos/crowdastro-projects/ATLAS-CDFS/predicted_cross_ids_table_11_05_17.tex', format='latex')
cross_id_table









    



ARG0003spl not found






    Out[60]:




<Table length=544>

zooniverse_id ra dec lr(rgz)_swire lr(norris)_swire rgz_swire rgz_consensus_radio_level rgz_consensus_ir_level
str10 float64 float64 str26 str26 str188 object str74
ARG0003rb2 51.511734 -28.785575 SWIRE3_J032605.76-284711.8 SWIRE3_J032602.36-284711.5 -99 0.4 0.333333333333
ARG0003rfr 51.564555 -28.774847 SWIRE3_J032615.41-284630.7 SWIRE3_J032615.41-284630.7 SWIRE3_J032616.14-284552.9,SWIRE3_J032615.41-284630.7 0.3125 1.0,1.0
ARG0003r8s 51.564799 -28.099955 SWIRE3_J032615.52-280559.8 SWIRE3_J032615.52-280559.8 SWIRE3_J032616.71-280538.6,SWIRE3_J032617.94-280648.2,SWIRE3_J032615.52-280559.8 0.484848484848 0.75,0.555555555556,0.8125
ARG0003r2j 51.572279 -28.119491 SWIRE3_J032615.86-280628.8 SWIRE3_J032617.02-280638.9 SWIRE3_J032617.89-280707.2 0.421052631579 1.0
ARG0003raz 51.604711 -28.152731 SWIRE3_J032625.19-280910.1 SWIRE3_J032625.19-280910.1 SWIRE3_J032624.80-280915.9 0.3 0.333333333333
ARG0003ro4 51.621251 -28.113924 SWIRE3_J032629.13-280650.7 SWIRE3_J032629.13-280650.7 SWIRE3_J032629.13-280650.7,SWIRE3_J032626.74-280636.7 0.357142857143 0.8,1.0
ARG0003r8e 51.623385 -28.681315 SWIRE3_J032629.54-284055.8 SWIRE3_J032629.54-284055.8 SWIRE3_J032629.54-284055.8 0.3 1.0
ARG0003r3w 51.624653 -28.798195 SWIRE3_J032630.12-284751.2 SWIRE3_J032629.81-284754.4 SWIRE3_J032629.81-284754.4 1.0 0.666666666667
ARG0003r55 51.62777 -28.615917 SWIRE3_J032630.64-283658.0 SWIRE3_J032630.64-283658.0 SWIRE3_J032630.64-283658.0,SWIRE3_J032628.56-283744.8 0.354838709677 1.0,0.727272727273
ARG0003rj2 51.644117 -28.339678 SWIRE3_J032631.96-281941.0 SWIRE3_J032634.58-282022.8 SWIRE3_J032630.21-282025.5,SWIRE3_J032634.58-282022.8,SWIRE3_J032631.96-281941.0 0.59375 0.684210526316,0.947368421053,0.473684210526
... ... ... ... ... ... ... ...
ARG0003ra9 53.87272 -28.865327 SWIRE3_J033529.46-285154.5 SWIRE3_J033529.46-285154.5 SWIRE3_J033529.46-285154.5 0.421052631579 0.75
ARG0003rbx 53.875955 -28.185517 SWIRE3_J033530.19-281108.5 SWIRE3_J033530.19-281108.5 SWIRE3_J033530.19-281108.5,SWIRE3_J033530.34-281025.5 0.638888888889 0.826086956522,1.0
ARG0003r1v 53.879392 -27.450616 SWIRE3_J033534.30-272705.7 SWIRE3_J033531.45-272658.6 SWIRE3_J033529.12-272726.2 0.4375 0.285714285714
ARG0003ra7 53.884153 -28.800544 SWIRE3_J033532.19-284801.1 SWIRE3_J033532.19-284801.1 SWIRE3_J033532.19-284801.1 0.222222222222 1.0
ARG0003r59 53.891569 -27.553515 SWIRE3_J033533.90-273310.9 SWIRE3_J033533.90-273310.9 SWIRE3_J033533.90-273310.9,SWIRE3_J033530.61-273323.7 0.529411764706 0.777777777778,0.888888888889
ARG0003r6f 53.904304 -27.575789 SWIRE3_J033537.03-273432.9 SWIRE3_J033537.03-273432.9 SWIRE3_J033537.03-273432.9,SWIRE3_J033532.80-273456.7 0.294117647059 1.0,1.0
ARG0003rdx 53.911449 -28.50367 SWIRE3_J033538.73-283012.9 SWIRE3_J033538.73-283012.9 SWIRE3_J033538.73-283012.9 0.222222222222 1.0
ARG0003r7g 53.94346 -28.634766 SWIRE3_J033546.39-283805.0 SWIRE3_J033546.39-283805.0 SWIRE3_J033549.55-283850.7,SWIRE3_J033546.39-283805.0 0.411764705882 1.0,0.857142857143
ARG0003r1c 53.972284 -27.461299 SWIRE3_J033553.33-272740.4 SWIRE3_J033553.33-272740.4 SWIRE3_J033553.33-272740.4,SWIRE3_J033552.00-272650.6 0.777777777778 0.857142857143,1.0
ARG0003rc9 53.973537 -28.165283 SWIRE3_J033553.56-280952.4 SWIRE3_J033553.56-280952.4 SWIRE3_J033553.56-280952.4,SWIRE3_J033556.74-280906.9,SWIRE3_J033550.25-281026.0 0.424242424242 1.0,1.0,1.0

Export

Finally, let's export all our ATLAS and SWIRE sets.



In [61]:

    
import pickle

with open('/Users/alger/data/Crowdastro/sets_atlas_11_05_17.pkl', 'wb') as f:
    pickle.dump(training_testing_atlas_sets, f)

with open('/Users/alger/data/Crowdastro/sets_swire_11_05_17.pkl', 'wb') as f:
    pickle.dump(training_testing_swire_sets, f)

Export for Full Training

We need to export the SWIRE features and labels. In principle we can reconstruct our features etc. later, so we don't need to export the names, but we will anyway.



In [62]:

    
def export(training_testing_swire_sets, filename):
    with h5py.File(filename, 'w') as f:
        # Export features.
        f.create_dataset('features', data=swire_features)
        # Export names.
        assert all(len(s) <= 26 for s in swire_names)
        f.create_dataset('names', data=[i.encode('ascii') for i in swire_names], dtype='<S26')
        # Export labels.
        f.create_dataset('norris_labels', data=numpy.array([
            swire_name_to_norris_label[n] for n in swire_names], dtype=bool))
        f.create_dataset('rgz_labels', data=numpy.array([
            swire_name_to_rgz_label[n] for n in swire_names], dtype=bool))
        # Export train/test sets.
        sets = f.create_group('sets')
        for subset_str, splits in training_testing_swire_sets.items():
            subset = sets.create_group(subset_str)
            train_bool = numpy.zeros((len(splits), len(swire_names)), dtype=bool)
            test_bool = numpy.zeros((len(splits), len(swire_names)), dtype=bool)
            for i, (train, test) in enumerate(splits):
                train_bool[i, train] = True
                test_bool[i, test] = True
            subset.create_dataset('train', data=train_bool)
            subset.create_dataset('test', data=test_bool)



In [63]:

    
export(training_testing_swire_sets, '/Users/alger/data/Crowdastro/all_training_data_11_05_17.h5')



In [68]:

    
swire_features.shape[1] - 1024









    Out[68]:





10



In [ ]:

swire	lr(rgz)	lr(norris)
str26	float64	float64
SWIRE3_J032559.15-284724.2	0.0103425963105	0.0140876973827
SWIRE3_J032559.91-284728.9	0.207774469321	0.0197342707073
SWIRE3_J032600.02-284736.9	0.147822500615	0.0116599079929
SWIRE3_J032600.13-284637.5	0.0609663955718	0.021059589621
SWIRE3_J032600.13-284715.7	0.172903856593	0.0608948984168
SWIRE3_J032600.98-284705.4	0.0556761387892	0.102366961283
SWIRE3_J032601.03-284711.6	0.375365352403	0.168507910493
SWIRE3_J032601.75-284614.5	0.276312390974	0.0449587530623
SWIRE3_J032602.08-284713.1	0.651484553746	0.425380638918
SWIRE3_J032602.36-284711.5	0.499084796642	0.550609670906
...	...	...
SWIRE3_J033555.99-272756.9	0.34616094925	0.00289945384979
SWIRE3_J033556.19-280911.1	0.0368207688138	0.000416389336006
SWIRE3_J033556.25-272734.7	0.0492726967733	0.000566468062323
SWIRE3_J033556.36-272701.2	0.00800182412464	0.000117980019698
SWIRE3_J033556.36-272701.3	0.00663881903206	0.000102254811687
SWIRE3_J033556.51-280917.5	0.130037358638	0.000755857118439
SWIRE3_J033556.63-272759.9	0.0144135431882	0.000263225031745
SWIRE3_J033556.71-272753.0	0.108609448814	0.00081326735078
SWIRE3_J033556.85-272804.3	0.0344172279338	0.000380044647789
SWIRE3_J033557.13-280957.4	0.258034530119	0.00708681070039

zooniverse_id	ra	dec	lr(rgz)_swire	lr(norris)_swire	rgz_swire	rgz_consensus_radio_level	rgz_consensus_ir_level
str10	float64	float64	str26	str26	str188	object	str74
ARG0003rb2	51.511734	-28.785575	SWIRE3_J032605.76-284711.8	SWIRE3_J032602.36-284711.5	-99	0.4	0.333333333333
ARG0003rfr	51.564555	-28.774847	SWIRE3_J032615.41-284630.7	SWIRE3_J032615.41-284630.7	SWIRE3_J032616.14-284552.9,SWIRE3_J032615.41-284630.7	0.3125	1.0,1.0
ARG0003r8s	51.564799	-28.099955	SWIRE3_J032615.52-280559.8	SWIRE3_J032615.52-280559.8	SWIRE3_J032616.71-280538.6,SWIRE3_J032617.94-280648.2,SWIRE3_J032615.52-280559.8	0.484848484848	0.75,0.555555555556,0.8125
ARG0003r2j	51.572279	-28.119491	SWIRE3_J032615.86-280628.8	SWIRE3_J032617.02-280638.9	SWIRE3_J032617.89-280707.2	0.421052631579	1.0
ARG0003raz	51.604711	-28.152731	SWIRE3_J032625.19-280910.1	SWIRE3_J032625.19-280910.1	SWIRE3_J032624.80-280915.9	0.3	0.333333333333
ARG0003ro4	51.621251	-28.113924	SWIRE3_J032629.13-280650.7	SWIRE3_J032629.13-280650.7	SWIRE3_J032629.13-280650.7,SWIRE3_J032626.74-280636.7	0.357142857143	0.8,1.0
ARG0003r8e	51.623385	-28.681315	SWIRE3_J032629.54-284055.8	SWIRE3_J032629.54-284055.8	SWIRE3_J032629.54-284055.8	0.3	1.0
ARG0003r3w	51.624653	-28.798195	SWIRE3_J032630.12-284751.2	SWIRE3_J032629.81-284754.4	SWIRE3_J032629.81-284754.4	1.0	0.666666666667
ARG0003r55	51.62777	-28.615917	SWIRE3_J032630.64-283658.0	SWIRE3_J032630.64-283658.0	SWIRE3_J032630.64-283658.0,SWIRE3_J032628.56-283744.8	0.354838709677	1.0,0.727272727273
ARG0003rj2	51.644117	-28.339678	SWIRE3_J032631.96-281941.0	SWIRE3_J032634.58-282022.8	SWIRE3_J032630.21-282025.5,SWIRE3_J032634.58-282022.8,SWIRE3_J032631.96-281941.0	0.59375	0.684210526316,0.947368421053,0.473684210526
...	...	...	...	...	...	...	...
ARG0003ra9	53.87272	-28.865327	SWIRE3_J033529.46-285154.5	SWIRE3_J033529.46-285154.5	SWIRE3_J033529.46-285154.5	0.421052631579	0.75
ARG0003rbx	53.875955	-28.185517	SWIRE3_J033530.19-281108.5	SWIRE3_J033530.19-281108.5	SWIRE3_J033530.19-281108.5,SWIRE3_J033530.34-281025.5	0.638888888889	0.826086956522,1.0
ARG0003r1v	53.879392	-27.450616	SWIRE3_J033534.30-272705.7	SWIRE3_J033531.45-272658.6	SWIRE3_J033529.12-272726.2	0.4375	0.285714285714
ARG0003ra7	53.884153	-28.800544	SWIRE3_J033532.19-284801.1	SWIRE3_J033532.19-284801.1	SWIRE3_J033532.19-284801.1	0.222222222222	1.0
ARG0003r59	53.891569	-27.553515	SWIRE3_J033533.90-273310.9	SWIRE3_J033533.90-273310.9	SWIRE3_J033533.90-273310.9,SWIRE3_J033530.61-273323.7	0.529411764706	0.777777777778,0.888888888889
ARG0003r6f	53.904304	-27.575789	SWIRE3_J033537.03-273432.9	SWIRE3_J033537.03-273432.9	SWIRE3_J033537.03-273432.9,SWIRE3_J033532.80-273456.7	0.294117647059	1.0,1.0
ARG0003rdx	53.911449	-28.50367	SWIRE3_J033538.73-283012.9	SWIRE3_J033538.73-283012.9	SWIRE3_J033538.73-283012.9	0.222222222222	1.0
ARG0003r7g	53.94346	-28.634766	SWIRE3_J033546.39-283805.0	SWIRE3_J033546.39-283805.0	SWIRE3_J033549.55-283850.7,SWIRE3_J033546.39-283805.0	0.411764705882	1.0,0.857142857143
ARG0003r1c	53.972284	-27.461299	SWIRE3_J033553.33-272740.4	SWIRE3_J033553.33-272740.4	SWIRE3_J033553.33-272740.4,SWIRE3_J033552.00-272650.6	0.777777777778	0.857142857143,1.0
ARG0003rc9	53.973537	-28.165283	SWIRE3_J033553.56-280952.4	SWIRE3_J033553.56-280952.4	SWIRE3_J033553.56-280952.4,SWIRE3_J033556.74-280906.9,SWIRE3_J033550.25-281026.0	0.424242424242	1.0,1.0,1.0