In [15]:
# Some setup.
import h5py, numpy, matplotlib.pyplot as plt, astropy.io.ascii as asc, scipy.spatial, sklearn.model_selection
import crowdastro.crowd.util, sklearn.linear_model, sklearn.ensemble, astropy.table, collections
%matplotlib inline

In [64]:
with h5py.File('/Users/alger/data/Crowdastro/swire_11_05_17.h5', 'r') as f:
    swire_features = f['features'].value
with h5py.File('/Users/alger/data/Crowdastro/crowdastro-swire.h5', 'r') as f:
    swire_coords = f['/swire/cdfs/numeric'][:, :2]
    swire_names = [i.decode('ascii') for i in f['/swire/cdfs/string'].value]
table = asc.read('/Users/alger/data/Crowdastro/one-table-to-rule-them-all.tbl')
swire_tree = scipy.spatial.KDTree(swire_coords)
labels = asc.read('/Users/alger/data/SWIRE/all_labels.csv')
atlas_coords = numpy.array([(r['Component RA (Franzen)'], r['Component DEC (Franzen)']) for r in table
                            if r['Component RA (Franzen)']])
rgz_catalogue = asc.read('/Users/alger/data/ATLAS/static_rgz_host_full.csv')  # Overrides all_labels for RGZ.

Generate training sets

We are looking for five different training sets for each subset of the ATLAS-CDFS data:

  • RGZ & Norris & compact (the "clean" set)
  • RGZ & Norris & resolved
  • RGZ & Norris
  • RGZ compact
  • RGZ resolved
  • RGZ

First, let's have a function that takes a set of ATLAS objects and returns five sets of SWIRE objects.


In [32]:
def atlas_to_swire(atlas: list, radius: float=1 / 60) -> list:
    # atlas is a list of table Keys.
    # Look up the coordinates of the ATLAS objects.
    atlas = set(atlas)
    ras = [r['Component RA (Franzen)'] for r in table if r['Key'] in atlas]
    decs = [r['Component DEC (Franzen)'] for r in table if r['Key'] in atlas]
    coords = numpy.vstack([ras, decs]).T
    nearby = sorted({int(i)
                     for i in numpy.concatenate(swire_tree.query_ball_point(coords, radius))})
    return nearby

Next, we'll split ATLAS into three (overlapping) subsets:

  • RGZ
  • Norris
  • Compact

From these we can compute all subsets we want to train on.


In [40]:
def compact_test(r):
    if not r['Component S (Franzen)']:  # Why does this happen?
        return True

    R = numpy.log(r['Component S (Franzen)'] / r['Component Sp (Franzen)'])
    R_err = numpy.sqrt((r['Component S_ERR (Franzen)'] / r['Component S (Franzen)']) ** 2 +
                       (r['Component Sp_ERR (Franzen)'] / r['Component Sp (Franzen)']) ** 2)
    return R < 2 * R_err

In [41]:
rgz = {r['Key'] for r in table if r['Component Zooniverse ID (RGZ)'] and
                                         r['Component ID (Franzen)'] == r['Primary Component ID (RGZ)'] and
                                  r['Component ID (Franzen)']}
norris = {r['Key'] for r in table if r['Component # (Norris)'] and r['Component ID (Franzen)']}
compact = {r['Key'] for r in table if r['Component ID (Franzen)'] and
                                        compact_test(r)}

We can now compute the training sets. We will split CDFS in four. Let's start by finding the min/max RA/dec to get our dividing lines.


In [42]:
middle = (numpy.median(atlas_coords[:, 0]), numpy.median(atlas_coords[:, 1]))
middle = (52.8, -28.1)

In [43]:
subsets = [
    ('RGZ & Norris & compact', rgz & norris & compact),
    ('RGZ & Norris & resolved', rgz & norris - compact),
    ('RGZ & Norris', rgz & norris),
    ('RGZ & compact', rgz & compact),
    ('RGZ & resolved', rgz - compact),
    ('RGZ', rgz),
]

In [44]:
training_testing_atlas_sets = {s:[] for s, _ in subsets}  # Maps subset string -> [(train, test)]

def filter_subset(subset: set, q: int) -> set:
    """Filters subset to just include indices of ATLAS objects in a given quadrant."""
    subset_ = set()
    for s in subset:
        row = table[table['Key'] == s][0]
        coords = row['Component RA (Franzen)'], row['Component DEC (Franzen)']

        if (
                (q == 0 and coords[0] >= middle[0] and coords[1] >= middle[1]) or
                (q == 1 and coords[0] < middle[0] and coords[1] >= middle[1]) or
                (q == 2 and coords[0] < middle[0] and coords[1] < middle[1]) or
                (q == 3 and coords[0] >= middle[0] and coords[1] < middle[1])):
            subset_.add(s)
    return subset_

for subset_str, subset_set in subsets:
    for q in range(4):  # Quadrants.
        test = filter_subset(subset_set, q)
        train = {i for i in subset_set if i not in test}
        print(subset_str, len(train), len(test))
        training_testing_atlas_sets[subset_str].append((train, test))


RGZ & Norris & compact 298 151
RGZ & Norris & compact 334 115
RGZ & Norris & compact 378 71
RGZ & Norris & compact 337 112
RGZ & Norris & resolved 69 27
RGZ & Norris & resolved 73 23
RGZ & Norris & resolved 71 25
RGZ & Norris & resolved 75 21
RGZ & Norris 367 178
RGZ & Norris 407 138
RGZ & Norris 449 96
RGZ & Norris 412 133
RGZ & compact 1845 410
RGZ & compact 1596 659
RGZ & compact 1700 555
RGZ & compact 1624 631
RGZ & resolved 162 43
RGZ & resolved 151 54
RGZ & resolved 148 57
RGZ & resolved 154 51
RGZ 2007 453
RGZ 1747 713
RGZ 1848 612
RGZ 1778 682

These can be converted into SWIRE sets.


In [45]:
training_testing_swire_sets = {s:[] for s, _ in subsets}  # Maps subset string -> [(train, test)]

for subset_str, subset_set in subsets:
    for train, test in training_testing_atlas_sets[subset_str]:
        train = atlas_to_swire(train)
        test = atlas_to_swire(test)
        print(subset_str, len(set(train) & set(test)), 'out of', len(set(test)), 'overlap')
        train = sorted(set(train) - set(test))
        training_testing_swire_sets[subset_str].append((train, test))


RGZ & Norris & compact 0 out of 3285 overlap
RGZ & Norris & compact 0 out of 2602 overlap
RGZ & Norris & compact 15 out of 1806 overlap
RGZ & Norris & compact 15 out of 2561 overlap
RGZ & Norris & resolved 0 out of 726 overlap
RGZ & Norris & resolved 0 out of 582 overlap
RGZ & Norris & resolved 0 out of 729 overlap
RGZ & Norris & resolved 0 out of 592 overlap
RGZ & Norris 0 out of 3880 overlap
RGZ & Norris 8 out of 3110 overlap
RGZ & Norris 23 out of 2435 overlap
RGZ & Norris 15 out of 3107 overlap
RGZ & compact 36 out of 7543 overlap
RGZ & compact 87 out of 11188 overlap
RGZ & compact 129 out of 9351 overlap
RGZ & compact 78 out of 11260 overlap
RGZ & resolved 0 out of 1128 overlap
RGZ & resolved 0 out of 1269 overlap
RGZ & resolved 0 out of 1478 overlap
RGZ & resolved 0 out of 1316 overlap
RGZ 65 out of 8317 overlap
RGZ 143 out of 11913 overlap
RGZ 188 out of 10299 overlap
RGZ 110 out of 11976 overlap

Extracting labels


In [46]:
swire_name_to_rgz_label = {}
swire_name_to_norris_label = {}
for row in labels:
    swire_name_to_norris_label[row['swire']] = bool(row['norris_label']) and row['norris_label'] == 'True'
#     swire_name_to_rgz_label[row['swire']] = bool(row['rgz_label']) and row['rgz_label'] == 'True'
for name in swire_names:
    swire_name_to_rgz_label[name] = False
for row in rgz_catalogue:
    swire_name_to_rgz_label[row['SWIRE.designation']] = True

Training LR with Norris labels

We will now train logistic regression on the Norris label set. We will test on both RGZ and Norris, since there are objects that RGZ observes that Norris does not.


In [47]:
def test_on_sets(training_testing_swire_sets, name_to_label, Classifier):
    for subset_str, splits in training_testing_swire_sets.items():
        if 'Norris' not in subset_str:
            continue

        rgz_accuracies = []
        norris_accuracies = []
        predictions = []
        for train, test in splits:
            train_features = swire_features[train]
            train_labels = [name_to_label[swire_names[i]] for i in train]

            test_features = swire_features[test]

            lr = Classifier()
            lr.fit(train_features, train_labels)
            
            preds = lr.predict(test_features)
            predictions.append(dict(zip([swire_names[i] for i in test], lr.predict_proba(test_features))))

            if 'Norris' in subset_str:
                test_labels_norris = [swire_name_to_norris_label[swire_names[i]] for i in test]
                norris_accuracies.append(
                    crowdastro.crowd.util.balanced_accuracy(test_labels_norris, preds))
            test_labels_rgz = [swire_name_to_rgz_label[swire_names[i]] for i in test]
            rgz_accuracies.append(
                crowdastro.crowd.util.balanced_accuracy(test_labels_rgz, preds))
        yield subset_str, (rgz_accuracies, norris_accuracies), predictions

In [48]:
norris_lr_results = list(test_on_sets(
        training_testing_swire_sets, swire_name_to_norris_label,
        lambda: sklearn.linear_model.LogisticRegression(class_weight='balanced', penalty='l2', C=1e10)))
for subset, (rgz_acc, norris_acc), _ in norris_lr_results:
    print('{} on RGZ: ({:.02f} +- {:.02f})%'.format(
        subset, numpy.mean(rgz_acc) * 100, numpy.std(rgz_acc) * 100))
    print('{} on Norris: ({:.02f} +- {:.02f})%'.format(
        subset, numpy.mean(norris_acc) * 100, numpy.std(norris_acc) * 100))


/usr/local/lib/python3.6/site-packages/sklearn/linear_model/base.py:352: RuntimeWarning: overflow encountered in exp
  np.exp(prob, prob)
RGZ & Norris & compact on RGZ: (83.95 +- 2.29)%
RGZ & Norris & compact on Norris: (96.39 +- 1.22)%
RGZ & Norris & resolved on RGZ: (80.95 +- 4.80)%
RGZ & Norris & resolved on Norris: (89.59 +- 2.73)%
RGZ & Norris on RGZ: (84.24 +- 1.02)%
RGZ & Norris on Norris: (94.62 +- 1.41)%

Training RF with Norris labels


In [55]:
norris_rf_results = list(test_on_sets(
        training_testing_swire_sets, swire_name_to_norris_label,
        lambda: sklearn.ensemble.RandomForestClassifier(class_weight='balanced',
                                                        criterion='entropy',
                                                        min_samples_leaf=45)))

for subset, (rgz_acc, norris_acc), _ in norris_rf_results:
    print('{} on RGZ: ({:.02f} +- {:.02f})%'.format(
        subset, numpy.mean(rgz_acc) * 100, numpy.std(rgz_acc) * 100))
    print('{} on Norris: ({:.02f} +- {:.02f})%'.format(
        subset, numpy.mean(norris_acc) * 100, numpy.std(norris_acc) * 100))


RGZ & Norris & compact on RGZ: (79.97 +- 0.21)%
RGZ & Norris & compact on Norris: (97.08 +- 0.90)%
RGZ & Norris & resolved on RGZ: (77.41 +- 4.14)%
RGZ & Norris & resolved on Norris: (94.89 +- 2.00)%
RGZ & Norris on RGZ: (78.43 +- 1.85)%
RGZ & Norris on Norris: (95.77 +- 2.39)%

Training LR with RGZ labels


In [50]:
rgz_lr_results = list(test_on_sets(
        training_testing_swire_sets, swire_name_to_rgz_label,
        lambda: sklearn.linear_model.LogisticRegression(class_weight='balanced', penalty='l2', C=1e10)))
for subset, (rgz_acc, norris_acc), _ in rgz_lr_results:
    print('{} on RGZ: ({:.02f} +- {:.02f})%'.format(
        subset, numpy.mean(rgz_acc) * 100, numpy.std(rgz_acc) * 100))
    print('{} on Norris: ({:.02f} +- {:.02f})%'.format(
        subset, numpy.mean(norris_acc) * 100, numpy.std(norris_acc) * 100))


RGZ & Norris & compact on RGZ: (86.28 +- 1.70)%
RGZ & Norris & compact on Norris: (94.22 +- 0.59)%
RGZ & Norris & resolved on RGZ: (80.78 +- 3.95)%
RGZ & Norris & resolved on Norris: (85.63 +- 4.33)%
RGZ & Norris on RGZ: (85.97 +- 0.55)%
RGZ & Norris on Norris: (92.52 +- 1.15)%

Training RF with RGZ labels


In [54]:
rgz_rf_results = list(test_on_sets(
        training_testing_swire_sets, swire_name_to_rgz_label,
        lambda: sklearn.ensemble.RandomForestClassifier(class_weight='balanced',
                                                        criterion='entropy',
                                                        min_samples_leaf=45)))
for subset, (rgz_acc, norris_acc), _ in rgz_rf_results:
    print('{} on RGZ: ({:.02f} +- {:.02f})%'.format(
        subset, numpy.mean(rgz_acc) * 100, numpy.std(rgz_acc) * 100))
    print('{} on Norris: ({:.02f} +- {:.02f})%'.format(
        subset, numpy.mean(norris_acc) * 100, numpy.std(norris_acc) * 100))


RGZ & Norris & compact on RGZ: (92.39 +- 0.67)%
RGZ & Norris & compact on Norris: (94.94 +- 0.37)%
RGZ & Norris & resolved on RGZ: (88.16 +- 2.81)%
RGZ & Norris & resolved on Norris: (92.06 +- 0.38)%
RGZ & Norris on RGZ: (92.36 +- 0.74)%
RGZ & Norris on Norris: (94.69 +- 0.57)%

In [56]:
plt.figure(figsize=(10, 4))
for i, (subset_str, _) in enumerate(subsets[:3]):
    plt.subplot(1, 3, i + 1)
    plt.title(subset_str.replace('&', '$\\cap$'))
    plt.xticks([0, 1, 2, 3], ['LR(RGZ)', 'LR(Norris)', 'RF(RGZ)', 'RF(Norris)'], rotation='vertical')
    plt.grid(axis='y', which='major', color='lightgrey', linestyle='-')
    plt.ylim((50, 100))
    plt.xlim((-1, 4))
    
    if i == 1:
        plt.xlabel('Classifier(Training set)', labelpad=20)
    if i == 0:
        plt.ylabel('Balanced accuracy (%)')

    lr_rgz_acc, lr_norris_acc = [numpy.array(res) * 100 for sstr, res, _ in rgz_lr_results if sstr == subset_str][0]
    rf_rgz_acc, rf_norris_acc = [numpy.array(res) * 100 for sstr, res, _ in rgz_rf_results if sstr == subset_str][0]
    xs = [i for j in range(4) for i in [j] * 4]
    ys = list(lr_rgz_acc) + list(lr_norris_acc) + list(rf_rgz_acc) + list(rf_norris_acc)
    plt.scatter(xs, ys, marker='x')
plt.subplots_adjust(wspace=0.4, bottom=0.3)
plt.savefig('/Users/alger/Documents/writing/atlas-ml-ba.pdf')
plt.show()


Table of predictions

We now want to generate a table of predictions. We will first generate a table of SWIRE-based predictions. Each SWIRE object will be assigned a label from each of two classifiers:

  • LR(Norris)
  • LR(RGZ)

These are trained on all data points in the training set, i.e., not splitting on compact/resolved. We report the labels from when the given SWIRE object was in a testing quadrant.


In [57]:
swire_name_to_rgz_lr_pred = collections.defaultdict(list)
swire_name_to_norris_lr_pred = collections.defaultdict(list)

for subset_str, _, preds in rgz_lr_results:
    for preds_ in preds:
        for name, pred in preds_.items():
            swire_name_to_rgz_lr_pred[name].append(pred)

for subset_str, _, preds in norris_lr_results:
    for preds_ in preds:
        for name, pred in preds_.items():
            swire_name_to_norris_lr_pred[name].append(pred)

In [58]:
import astropy.table
names_ = []
rgz_preds_ = []
norris_preds_ = []
for name in swire_names:
    if name not in swire_name_to_rgz_lr_pred:
        continue

    names_.append(name)
    rgz_ = numpy.mean([i[1] for i in swire_name_to_rgz_lr_pred[name]])
    norris_ = numpy.mean([i[1] for i in swire_name_to_norris_lr_pred[name]])
    rgz_preds_.append(rgz_)
    norris_preds_.append(norris_)
predicted_swire_table = astropy.table.Table(data=[names_, rgz_preds_, norris_preds_],
                                            names=['swire', 'lr(rgz)', 'lr(norris)'])
predicted_swire_table.write('/Users/alger/data/Crowdastro/predicted_swire_table_11_05_17.csv', format='csv')
predicted_swire_table.write('/Users/alger/repos/crowdastro-projects/ATLAS-CDFS/predicted_swire_table_11_05_17.tex',
                            format='latex')
predicted_swire_table


Out[58]:
<Table length=12509>
swirelr(rgz)lr(norris)
str26float64float64
SWIRE3_J032559.15-284724.20.01034259631050.0140876973827
SWIRE3_J032559.91-284728.90.2077744693210.0197342707073
SWIRE3_J032600.02-284736.90.1478225006150.0116599079929
SWIRE3_J032600.13-284637.50.06096639557180.021059589621
SWIRE3_J032600.13-284715.70.1729038565930.0608948984168
SWIRE3_J032600.98-284705.40.05567613878920.102366961283
SWIRE3_J032601.03-284711.60.3753653524030.168507910493
SWIRE3_J032601.75-284614.50.2763123909740.0449587530623
SWIRE3_J032602.08-284713.10.6514845537460.425380638918
SWIRE3_J032602.36-284711.50.4990847966420.550609670906
.........
SWIRE3_J033555.99-272756.90.346160949250.00289945384979
SWIRE3_J033556.19-280911.10.03682076881380.000416389336006
SWIRE3_J033556.25-272734.70.04927269677330.000566468062323
SWIRE3_J033556.36-272701.20.008001824124640.000117980019698
SWIRE3_J033556.36-272701.30.006638819032060.000102254811687
SWIRE3_J033556.51-280917.50.1300373586380.000755857118439
SWIRE3_J033556.63-272759.90.01441354318820.000263225031745
SWIRE3_J033556.71-272753.00.1086094488140.00081326735078
SWIRE3_J033556.85-272804.30.03441722793380.000380044647789
SWIRE3_J033557.13-280957.40.2580345301190.00708681070039

Table of cross-identifications

Finally, we want to map these predictions back to the original task. Given an ATLAS object, get all SWIRE objects within 1' of that object. The highest probability host is then named as the host galaxy.


In [59]:
# Map Zooniverse ID -> [SWIRE from RGZ]
zid_to_rgz_consensus_swires = collections.defaultdict(list)
zid_to_rgz_consensus_radio = {}
zid_to_rgz_consensus_ir = collections.defaultdict(list)
for row in rgz_catalogue:
    zid_to_rgz_consensus_swires[row['zooniverse_id']].append(row['SWIRE.designation'])
    zid_to_rgz_consensus_ir[row['zooniverse_id']].append(row['consensus.ir_level'])
    zid_to_rgz_consensus_radio[row['zooniverse_id']] = row['consensus.radio_level']

In [60]:
zids_ = []
ras_ = []
decs_ = []
rgz_names_ = []
norris_names_ = []
rgz_consensuses_radio_level_ = []
rgz_consensuses_ir_level_ = []
rgz_consensuses_ = []
for row in table:
    if row['Key'] in rgz & norris:
        zid = row['Component Zooniverse ID (RGZ)']
        ra = row['Component RA (Franzen)']
        dec = row['Component DEC (Franzen)']
        nearby = swire_tree.query_ball_point([ra, dec], 1 / 60)
        names = [swire_names[i] for i in nearby]
        if not names:
            print('{} not found'.format(zid))
            continue
        norris_probs = [numpy.mean([i[1] for i in swire_name_to_norris_lr_pred[name]]) for name in names]
        norris_name = names[numpy.argmax(norris_probs)]
        rgz_probs = [numpy.mean([i[1] for i in swire_name_to_rgz_lr_pred[name]]) for name in names]
        rgz_name = names[numpy.argmax(rgz_probs)]
        zids_.append(zid)
        ras_.append(ra)
        decs_.append(dec)
        rgz_names_.append(rgz_name)
        norris_names_.append(norris_name)
        rgz_consensuses_.append(','.join(i for i in zid_to_rgz_consensus_swires.get(zid, [])))
        rgz_consensuses_radio_level_.append(zid_to_rgz_consensus_radio.get(zid))
        rgz_consensuses_ir_level_.append(','.join(str(i) for i in zid_to_rgz_consensus_ir.get(zid, [])))
cross_id_table = astropy.table.Table(data=[zids_, ras_, decs_, rgz_names_, norris_names_,
                                           rgz_consensuses_, rgz_consensuses_radio_level_, rgz_consensuses_ir_level_],
                                     names=['zooniverse_id', 'ra', 'dec', 'lr(rgz)_swire', 'lr(norris)_swire',
                                            'rgz_swire', 'rgz_consensus_radio_level', 'rgz_consensus_ir_level'])
cross_id_table.write('/Users/alger/data/Crowdastro/predicted_cross_ids_table_11_05_17.csv', format='csv')
cross_id_table.write('/Users/alger/repos/crowdastro-projects/ATLAS-CDFS/predicted_cross_ids_table_11_05_17.tex', format='latex')
cross_id_table


ARG0003spl not found
Out[60]:
<Table length=544>
zooniverse_idradeclr(rgz)_swirelr(norris)_swirergz_swirergz_consensus_radio_levelrgz_consensus_ir_level
str10float64float64str26str26str188objectstr74
ARG0003rb251.511734-28.785575SWIRE3_J032605.76-284711.8SWIRE3_J032602.36-284711.5-990.40.333333333333
ARG0003rfr51.564555-28.774847SWIRE3_J032615.41-284630.7SWIRE3_J032615.41-284630.7SWIRE3_J032616.14-284552.9,SWIRE3_J032615.41-284630.70.31251.0,1.0
ARG0003r8s51.564799-28.099955SWIRE3_J032615.52-280559.8SWIRE3_J032615.52-280559.8SWIRE3_J032616.71-280538.6,SWIRE3_J032617.94-280648.2,SWIRE3_J032615.52-280559.80.4848484848480.75,0.555555555556,0.8125
ARG0003r2j51.572279-28.119491SWIRE3_J032615.86-280628.8SWIRE3_J032617.02-280638.9SWIRE3_J032617.89-280707.20.4210526315791.0
ARG0003raz51.604711-28.152731SWIRE3_J032625.19-280910.1SWIRE3_J032625.19-280910.1SWIRE3_J032624.80-280915.90.30.333333333333
ARG0003ro451.621251-28.113924SWIRE3_J032629.13-280650.7SWIRE3_J032629.13-280650.7SWIRE3_J032629.13-280650.7,SWIRE3_J032626.74-280636.70.3571428571430.8,1.0
ARG0003r8e51.623385-28.681315SWIRE3_J032629.54-284055.8SWIRE3_J032629.54-284055.8SWIRE3_J032629.54-284055.80.31.0
ARG0003r3w51.624653-28.798195SWIRE3_J032630.12-284751.2SWIRE3_J032629.81-284754.4SWIRE3_J032629.81-284754.41.00.666666666667
ARG0003r5551.62777-28.615917SWIRE3_J032630.64-283658.0SWIRE3_J032630.64-283658.0SWIRE3_J032630.64-283658.0,SWIRE3_J032628.56-283744.80.3548387096771.0,0.727272727273
ARG0003rj251.644117-28.339678SWIRE3_J032631.96-281941.0SWIRE3_J032634.58-282022.8SWIRE3_J032630.21-282025.5,SWIRE3_J032634.58-282022.8,SWIRE3_J032631.96-281941.00.593750.684210526316,0.947368421053,0.473684210526
........................
ARG0003ra953.87272-28.865327SWIRE3_J033529.46-285154.5SWIRE3_J033529.46-285154.5SWIRE3_J033529.46-285154.50.4210526315790.75
ARG0003rbx53.875955-28.185517SWIRE3_J033530.19-281108.5SWIRE3_J033530.19-281108.5SWIRE3_J033530.19-281108.5,SWIRE3_J033530.34-281025.50.6388888888890.826086956522,1.0
ARG0003r1v53.879392-27.450616SWIRE3_J033534.30-272705.7SWIRE3_J033531.45-272658.6SWIRE3_J033529.12-272726.20.43750.285714285714
ARG0003ra753.884153-28.800544SWIRE3_J033532.19-284801.1SWIRE3_J033532.19-284801.1SWIRE3_J033532.19-284801.10.2222222222221.0
ARG0003r5953.891569-27.553515SWIRE3_J033533.90-273310.9SWIRE3_J033533.90-273310.9SWIRE3_J033533.90-273310.9,SWIRE3_J033530.61-273323.70.5294117647060.777777777778,0.888888888889
ARG0003r6f53.904304-27.575789SWIRE3_J033537.03-273432.9SWIRE3_J033537.03-273432.9SWIRE3_J033537.03-273432.9,SWIRE3_J033532.80-273456.70.2941176470591.0,1.0
ARG0003rdx53.911449-28.50367SWIRE3_J033538.73-283012.9SWIRE3_J033538.73-283012.9SWIRE3_J033538.73-283012.90.2222222222221.0
ARG0003r7g53.94346-28.634766SWIRE3_J033546.39-283805.0SWIRE3_J033546.39-283805.0SWIRE3_J033549.55-283850.7,SWIRE3_J033546.39-283805.00.4117647058821.0,0.857142857143
ARG0003r1c53.972284-27.461299SWIRE3_J033553.33-272740.4SWIRE3_J033553.33-272740.4SWIRE3_J033553.33-272740.4,SWIRE3_J033552.00-272650.60.7777777777780.857142857143,1.0
ARG0003rc953.973537-28.165283SWIRE3_J033553.56-280952.4SWIRE3_J033553.56-280952.4SWIRE3_J033553.56-280952.4,SWIRE3_J033556.74-280906.9,SWIRE3_J033550.25-281026.00.4242424242421.0,1.0,1.0

Export

Finally, let's export all our ATLAS and SWIRE sets.


In [61]:
import pickle

with open('/Users/alger/data/Crowdastro/sets_atlas_11_05_17.pkl', 'wb') as f:
    pickle.dump(training_testing_atlas_sets, f)

with open('/Users/alger/data/Crowdastro/sets_swire_11_05_17.pkl', 'wb') as f:
    pickle.dump(training_testing_swire_sets, f)

Export for Full Training

We need to export the SWIRE features and labels. In principle we can reconstruct our features etc. later, so we don't need to export the names, but we will anyway.


In [62]:
def export(training_testing_swire_sets, filename):
    with h5py.File(filename, 'w') as f:
        # Export features.
        f.create_dataset('features', data=swire_features)
        # Export names.
        assert all(len(s) <= 26 for s in swire_names)
        f.create_dataset('names', data=[i.encode('ascii') for i in swire_names], dtype='<S26')
        # Export labels.
        f.create_dataset('norris_labels', data=numpy.array([
            swire_name_to_norris_label[n] for n in swire_names], dtype=bool))
        f.create_dataset('rgz_labels', data=numpy.array([
            swire_name_to_rgz_label[n] for n in swire_names], dtype=bool))
        # Export train/test sets.
        sets = f.create_group('sets')
        for subset_str, splits in training_testing_swire_sets.items():
            subset = sets.create_group(subset_str)
            train_bool = numpy.zeros((len(splits), len(swire_names)), dtype=bool)
            test_bool = numpy.zeros((len(splits), len(swire_names)), dtype=bool)
            for i, (train, test) in enumerate(splits):
                train_bool[i, train] = True
                test_bool[i, test] = True
            subset.create_dataset('train', data=train_bool)
            subset.create_dataset('test', data=test_bool)

In [63]:
export(training_testing_swire_sets, '/Users/alger/data/Crowdastro/all_training_data_11_05_17.h5')

In [68]:
swire_features.shape[1] - 1024


Out[68]:
10

In [ ]: