Reproducing astroNN distance prediction result on Gaia TGAS DR1/ Gaia DR2

This notebook will walk you through how to reproduce astroNN gaia distance prediction result as claimed by astroNN author.

This notebook may required mw_plot which is decribed at https://github.com/henrysky/milkyway_plot

To save time, you can scroll down and use my trained model with ID: astroNN_0422_run001

This notebook is no longer being maintained. Moreover, starting from 28 May 2018, astroNN defaulted to use Gaia DR2 instead of DR1 to compile dataset. So if you want to compile the dataset used here, please use astroNN commited before 28 May 2018


In [2]:
from astroNN.apogee import allstar, visit_spectra
from astropy.io import fits
from astroNN.datasets import xmatch
import numpy as np
from astroNN.gaia import tgas_load, mag_to_fakemag
from astroNN.apogee.chips import apogee_continuum, gap_delete
from astroNN.nn.losses import mean_absolute_percentage_error
import time

# Setup continuum normalization
target_bit = [0, 1, 2, 3, 4, 5, 6, 7, 12]
def apstar_normalization(spectra, spectra_err, _spec_mask):
    return apogee_continuum(spectra=spectra, spectra_err=spectra_err, cont_mask=None, deg=2, dr=14, bitmask=_spec_mask)

# Load Gaia TGAS DR1, only load parallax > 0 and parallax error < 20%
output = tgas_load(cuts=0.2)

# Load Apogee allstar DR14
allstar_data = fits.open(allstar(dr=14))

# We only want APOGEE spectra from APO 2.5m telescope and spectra without any starflag
good_idx = [(allstar_data[1].data['telescope']=='apo25m')&(allstar_data[1].data['starflag']==0)]

# We only want APOGEE spectra from APO 2.5m telescope and spectra without any starflag
idx_1, idx_2, sep = xmatch(allstar_data[1].data['RA'][good_idx], output['ra'], colRA1=allstar_data[1].data['RA'][good_idx], 
                           colDec1=allstar_data[1].data['DEC'][good_idx], colRA2=output['ra'], colDec2=output['dec'], 
                           colpmRA2=output['pmra'], colpmDec2=output['pmdec'], swap=False)

print('Total number of spectra in training set: ', idx_1.shape)

delete_list = []

# pre-allocating spectra array
spec = np.zeros((idx_1.shape[0], 7514))

start_time = time.time()
for counter, i in enumerate(idx_1):
    ap_path = visit_spectra(dr=14, apogee=allstar_data[1].data['APOGEE_ID'][good_idx][i],location=allstar_data[1].data['LOCATION_ID'][good_idx][i], verbose=0)
    apstar_file = fits.open(ap_path)
    # we only want combined spectra
    nvisits = apstar_file[0].header['NVISITS']
    if nvisits == 1:
        _spec = apstar_file[1].data
        _spec_err = apstar_file[2].data
        _spec_mask = apstar_file[3].data
    else:
        _spec = apstar_file[1].data[1]
        _spec_err = apstar_file[2].data[1]
        _spec_mask = apstar_file[3].data[1]
    _spec, _spec_err = apstar_normalization(_spec, _spec_err, _spec_mask)
    
    spec[counter] =  _spec
    if counter % 100 == 0:
        print(f'Completed {counter} of {idx_1.shape[0]}, {(time.time() - start_time):.{2}f}s elapsed')

idx_1=np.delete(idx_1, delete_list)
idx_2=np.delete(idx_2, delete_list)

kmag = allstar_data[1].data['K'][good_idx][idx_1]
fakemag, fakemag_err = mag_to_fakemag(kmag, output['parallax'][idx_2], output['parallax_err'][idx_2])

# save all the reducted data
np.save('spec.npy', spec)
np.save('fakemag.npy', fakemag)
np.save('fakemag_err.npy', fakemag_err)


E:\gaia_mirror\Gaia/gdr1/tgas_source/fits/TgasSource_000-000-000.fits was found!
E:\gaia_mirror\Gaia/gdr1/tgas_source/fits/TgasSource_000-000-001.fits was found!
E:\gaia_mirror\Gaia/gdr1/tgas_source/fits/TgasSource_000-000-002.fits was found!
E:\gaia_mirror\Gaia/gdr1/tgas_source/fits/TgasSource_000-000-003.fits was found!
E:\gaia_mirror\Gaia/gdr1/tgas_source/fits/TgasSource_000-000-004.fits was found!
E:\gaia_mirror\Gaia/gdr1/tgas_source/fits/TgasSource_000-000-005.fits was found!
E:\gaia_mirror\Gaia/gdr1/tgas_source/fits/TgasSource_000-000-006.fits was found!
E:\gaia_mirror\Gaia/gdr1/tgas_source/fits/TgasSource_000-000-007.fits was found!
E:\gaia_mirror\Gaia/gdr1/tgas_source/fits/TgasSource_000-000-008.fits was found!
E:\gaia_mirror\Gaia/gdr1/tgas_source/fits/TgasSource_000-000-009.fits was found!
E:\gaia_mirror\Gaia/gdr1/tgas_source/fits/TgasSource_000-000-010.fits was found!
E:\gaia_mirror\Gaia/gdr1/tgas_source/fits/TgasSource_000-000-011.fits was found!
E:\gaia_mirror\Gaia/gdr1/tgas_source/fits/TgasSource_000-000-012.fits was found!
E:\gaia_mirror\Gaia/gdr1/tgas_source/fits/TgasSource_000-000-013.fits was found!
E:\gaia_mirror\Gaia/gdr1/tgas_source/fits/TgasSource_000-000-014.fits was found!
E:\gaia_mirror\Gaia/gdr1/tgas_source/fits/TgasSource_000-000-015.fits was found!
E:\sdss_mirror\dr14/apogee/spectro/redux/r8/stars/l31c/l31c.2/allStar-l31c.2.fits was found!
Total number of spectra in training set:  (11248,)
Completed 0 of 11248, 0.16s elapsed
Completed 100 of 11248, 2.17s elapsed
Completed 200 of 11248, 4.18s elapsed
Completed 300 of 11248, 6.18s elapsed
Completed 400 of 11248, 8.20s elapsed
Completed 500 of 11248, 10.36s elapsed
Completed 600 of 11248, 12.55s elapsed
Completed 700 of 11248, 14.69s elapsed
Completed 800 of 11248, 17.12s elapsed
Completed 900 of 11248, 19.65s elapsed
Completed 1000 of 11248, 22.55s elapsed
Completed 1100 of 11248, 25.30s elapsed
Completed 1200 of 11248, 28.05s elapsed
Completed 1300 of 11248, 30.88s elapsed
Completed 1400 of 11248, 33.53s elapsed
Completed 1500 of 11248, 36.14s elapsed
Completed 1600 of 11248, 38.72s elapsed
Completed 1700 of 11248, 41.48s elapsed
Completed 1800 of 11248, 44.26s elapsed
Completed 1900 of 11248, 46.70s elapsed
Completed 2000 of 11248, 49.20s elapsed
Completed 2100 of 11248, 51.81s elapsed
Completed 2200 of 11248, 54.41s elapsed
Completed 2300 of 11248, 56.98s elapsed
Completed 2400 of 11248, 59.58s elapsed
Completed 2500 of 11248, 62.13s elapsed
Completed 2600 of 11248, 64.73s elapsed
Completed 2700 of 11248, 67.25s elapsed
Completed 2800 of 11248, 69.92s elapsed
Completed 2900 of 11248, 72.60s elapsed
Completed 3000 of 11248, 75.25s elapsed
Completed 3100 of 11248, 77.82s elapsed
Completed 3200 of 11248, 80.39s elapsed
Completed 3300 of 11248, 83.02s elapsed
Completed 3400 of 11248, 85.71s elapsed
Completed 3500 of 11248, 88.35s elapsed
Completed 3600 of 11248, 91.01s elapsed
Completed 3700 of 11248, 93.71s elapsed
Completed 3800 of 11248, 96.28s elapsed
Completed 3900 of 11248, 98.83s elapsed
Completed 4000 of 11248, 101.58s elapsed
Completed 4100 of 11248, 104.51s elapsed
Completed 4200 of 11248, 107.24s elapsed
Completed 4300 of 11248, 109.92s elapsed
Completed 4400 of 11248, 112.59s elapsed
Completed 4500 of 11248, 115.29s elapsed
Completed 4600 of 11248, 117.97s elapsed
Completed 4700 of 11248, 120.56s elapsed
Completed 4800 of 11248, 123.05s elapsed
Completed 4900 of 11248, 125.51s elapsed
Completed 5000 of 11248, 127.97s elapsed
Completed 5100 of 11248, 130.49s elapsed
Completed 5200 of 11248, 133.08s elapsed
Completed 5300 of 11248, 135.70s elapsed
Completed 5400 of 11248, 138.36s elapsed
Completed 5500 of 11248, 140.90s elapsed
Completed 5600 of 11248, 143.47s elapsed
Completed 5700 of 11248, 146.24s elapsed
Completed 5800 of 11248, 148.86s elapsed
Completed 5900 of 11248, 151.54s elapsed
Completed 6000 of 11248, 154.18s elapsed
Completed 6100 of 11248, 156.82s elapsed
Completed 6200 of 11248, 159.32s elapsed
Completed 6300 of 11248, 161.88s elapsed
Completed 6400 of 11248, 164.47s elapsed
Completed 6500 of 11248, 167.01s elapsed
Completed 6600 of 11248, 169.57s elapsed
Completed 6700 of 11248, 172.15s elapsed
Completed 6800 of 11248, 174.76s elapsed
Completed 6900 of 11248, 177.60s elapsed
Completed 7000 of 11248, 180.10s elapsed
Completed 7100 of 11248, 182.54s elapsed
Completed 7200 of 11248, 184.92s elapsed
Completed 7300 of 11248, 187.38s elapsed
Completed 7400 of 11248, 189.93s elapsed
Completed 7500 of 11248, 192.47s elapsed
Completed 7600 of 11248, 195.13s elapsed
Completed 7700 of 11248, 197.73s elapsed
Completed 7800 of 11248, 200.37s elapsed
Completed 7900 of 11248, 203.01s elapsed
Completed 8000 of 11248, 205.61s elapsed
Completed 8100 of 11248, 208.20s elapsed
Completed 8200 of 11248, 210.73s elapsed
Completed 8300 of 11248, 213.28s elapsed
Completed 8400 of 11248, 215.79s elapsed
Completed 8500 of 11248, 218.39s elapsed
Completed 8600 of 11248, 221.02s elapsed
Completed 8700 of 11248, 223.64s elapsed
Completed 8800 of 11248, 226.25s elapsed
Completed 8900 of 11248, 229.01s elapsed
Completed 9000 of 11248, 231.77s elapsed
Completed 9100 of 11248, 234.32s elapsed
Completed 9200 of 11248, 236.99s elapsed
Completed 9300 of 11248, 239.82s elapsed
Completed 9400 of 11248, 242.65s elapsed
Completed 9500 of 11248, 245.32s elapsed
Completed 9600 of 11248, 247.93s elapsed
Completed 9700 of 11248, 250.82s elapsed
Completed 9800 of 11248, 253.54s elapsed
Completed 9900 of 11248, 256.21s elapsed
Completed 10000 of 11248, 258.84s elapsed
Completed 10100 of 11248, 261.55s elapsed
Completed 10200 of 11248, 264.32s elapsed
Completed 10300 of 11248, 267.10s elapsed
Completed 10400 of 11248, 269.91s elapsed
Completed 10500 of 11248, 272.75s elapsed
Completed 10600 of 11248, 275.68s elapsed
Completed 10700 of 11248, 278.53s elapsed
Completed 10800 of 11248, 281.38s elapsed
Completed 10900 of 11248, 284.23s elapsed
Completed 11000 of 11248, 287.09s elapsed
Completed 11100 of 11248, 289.89s elapsed
Completed 11200 of 11248, 292.58s elapsed
Please be advised that astroNN fakemag is parallax(mas) * 10 ** (0.2 * mag)

Load the reduced data and train a neural network

You can save time by using the neural network trained on 22 April 2018


In [ ]:
# load reduced data
spec = np.load('spec.npy')
fakemag = np.load('fakemag.npy')
fakemag_err = np.load('fakemag_err.npy')

# Train a neural Network and save
from astroNN.models import ApogeeBCNN
bcnn = ApogeeBCNN()
bcnn.max_epochs = 60
bcnn.dropout_rate = 0.3
bcnn.num_hidden = [128, 64]
bcnn.l2 = 1e-5
bcnn.targename=['fakemag']  # corectly set output neurone representation which is a good practice in astroNN
bcnn.autosave=True  # autosave after finising training
bcnn.train(spec, fakemag, labels_err=fakemag_err)

Compile Testing Set and do Inference

You can save time by using the result directly by reading _astronn_dist.npy and _astronn_dist_err.npy

Since we want data-driven distance prediction for the whole APOGEE DR14, we need to compile a data set so we can setup inference with neural network easily


In [ ]:
import numpy as np
import time
import h5py
from astropy.io import fits

from astroNN.apogee import allstar, visit_spectra
from astroNN.datasets import xmatch
from astroNN.gaia import tgas_load, mag_to_fakemag
from astroNN.apogee.chips import apogee_continuum, gap_delete
from astroNN.nn.losses import mean_absolute_percentage_error
from astroNN.datasets import load_apogee_distances

# load apogee distances data, we dont want cuts because we dont really use it, or you can cut bad prediction later
aRA, aDEC, metrics_array, metrics_err_array = load_apogee_distances(dr=14, metric='distance', cuts=False)

# Setup continuum normalization
target_bit = [0, 1, 2, 3, 4, 5, 6, 7, 12]
def apstar_normalization(spectra, spectra_err, _spec_mask):
    return apogee_continuum(spectra=spectra, spectra_err=spectra_err, cont_mask=None, deg=2, dr=14, bitmask=_spec_mask)

# Load Gaia TGAS DR1, only load parallax > 0 and parallax error < 20%
output = tgas_load(cuts=0.2)
allstar_data = fits.open(allstar(dr=14))

# We only want APOGEE spectra from APO 2.5m telescope, spectra without any starflag and spectra with SNR higher than 50
good_idx = [(allstar_data[1].data['telescope']=='apo25m')&(allstar_data[1].data['starflag']==0)&
            (allstar_data[1].data['SNR']>50)]

loop_var = np.arange(allstar_data[1].data['telescope'].shape[0])[good_idx]

# pre-allocating spectra array
spec = np.zeros((np.sum(good_idx), 7514), dtype=np.float32)

start_time = time.time()
for counter, i in enumerate(loop_var):
    ap_path = visit_spectra(dr=14, apogee=allstar_data[1].data['APOGEE_ID'][i], 
                            location=allstar_data[1].data['LOCATION_ID'][i], verbose=0)
    if ap_path is False:
        pass
    else:
        apstar_file = fits.open(ap_path)
        # we only want combined spectra
        nvisits = apstar_file[0].header['NVISITS']
        if nvisits == 1:
            _spec = apstar_file[1].data
            _spec_err = apstar_file[2].data
            _spec_mask = apstar_file[3].data
        else:
            _spec = apstar_file[1].data[1]
            _spec_err = apstar_file[2].data[1]
            _spec_mask = apstar_file[3].data[1]

        _spec, _spec_err = apstar_normalization(_spec, _spec_err, _spec_mask)

        spec[counter] =  _spec
    if counter % 100 == 0:
        print(f'Completed {counter} of {np.sum(good_idx)}, {(time.time() - start_time):.{2}f}s elapsed')

# Create the testing set file
h5f = h5py.File(f'apogee4gaia_dr2.h5', 'w')
h5f.create_dataset('RA', data=allstar_data[1].data['RA'][good_idx])
h5f.create_dataset('DEC', data=allstar_data[1].data['DEC'][good_idx])
h5f.create_dataset('K', data=allstar_data[1].data['K'][good_idx])
h5f.create_dataset('apogee_dist', data=metrics_array[good_idx])
h5f.create_dataset('apogee_dist_err', data=metrics_err_array[good_idx])
h5f.create_dataset('spectra', data=spec)
h5f.create_dataset('teff', data=allstar_data[1].data['PARAM'][good_idx][:,0])
h5f.create_dataset('SNR', data=allstar_data[1].data['SNR'][good_idx])
h5f.create_dataset('MH', data=allstar_data[1].data['M_H'][good_idx])
h5f.create_dataset('FeH', data=allstar_data[1].data['Fe_H'][good_idx])
h5f.create_dataset('logg', data=allstar_data[1].data['LOGG'][good_idx])
h5f.close()

In [ ]:
from astroNN.models import load_folder
from astroNN.gaia import fakemag_to_pc
import h5py
import numpy as np

bcnn = load_folder('astroNN_0422_run001')

h5f = h5py.File(f'apogee4gaia_dr2.h5', 'r')
spec = np.array(h5f['spectra'])
Kmag = np.array(h5f['K'])
adpc = np.array(h5f['apogee_dist'])
adpc_err = np.array(h5f['apogee_dist_err'])

# prediction and its uncertainty, they are fakemag and fakemag uncertainty in this case
pred, pred_err = bcnn.test(spec)

# convert fakemag and propagated uncertainty to distances
astronn_pc, astronn_pc_err = fakemag_to_pc(pred[:, 0], Kmag, pred_err['total'][:, 0])

# setup the distances so we can check whether neural network is good on 25 April when Gaia DR2 release
np.save('_astronn_dist.npy', pred)
np.save('_astronn_dist_err.npy', pred_err['total'])

Comparing Bayesian Neural Network to Gaia DR2


In [4]:
from astroNN.gaia import fakemag_to_pc
import h5py
import numpy as np
from astroNN.datasets import xmatch
from astroNN.nn.numpy import mean_absolute_percentage_error
from astroNN.gaia import gaiadr2_parallax

pred = np.load('_astronn_dist.npy')
pred_err = np.load('_astronn_dist_err.npy')

h5f = h5py.File(f'apogee4gaia_dr2.h5', 'r')
adpc = np.array(h5f['apogee_dist'])
apogeeRA = np.array(h5f['RA'])
apogeeDEC = np.array(h5f['DEC'])
adpc_err = np.array(h5f['apogee_dist_err'])
Kmag = np.array(h5f['K'], dtype=np.float32)
astronn_pc, astronn_pc_err = fakemag_to_pc(pred[:, 0], Kmag, pred_err[:, 0])
astroNN_uncerpre = astronn_pc_err.value / (astronn_pc.value + 1e-8)

# To load Gaia DR2 - APOGEE DR14 matches, indices corresponds to APOGEE allstar DR14 file
ted_ra, ted_dec, ted_pc, ted_pc_err = gaiadr2_parallax(cuts=0.2, keepdims=False)

good_idx_astroNN = [(astroNN_uncerpre < 0.2) & (astronn_pc.value > 0.)]
idx_1, idx_2, sep = xmatch(apogeeRA[good_idx_astroNN], ted_ra, colRA1=apogeeRA[good_idx_astroNN],
                           colDec1=apogeeDEC[good_idx_astroNN], colRA2=ted_ra,
                           colDec2=ted_dec, swap=False)

astroNNXM = astronn_pc[good_idx_astroNN][idx_1].value
gaiaXM = (1000/ted_pc)[idx_2]

mape_dr2_apogeedist = mean_absolute_percentage_error(astroNNXM, gaiaXM)

print(f'Mean Absolute Precentage Error: {mape_dr2_apogeedist:.{2}f}% for {astroNNXM.shape[0]} spectra at 20% NN uncertainty cuts')


This is Gaia DR2 - APOGEE DR14 matched parallax, RA DEC in J2000, parallax in mas
Mean Absolute Precentage Error: 14.48% for 57169 spectra at 20% NN uncertainty cuts

In [5]:
from astroNN.gaia import fakemag_to_pc
from mw_plot import MWPlot
from astroNN.gaia import tgas_load
from astropy import units as  u
import astropy.coordinates as apycoords
from astropy.io import fits
import h5py
import numpy as np

pred = np.load('_astronn_dist.npy')
pred_err = np.load('_astronn_dist_err.npy')

h5f = h5py.File(f'apogee4gaia_dr2.h5', 'r')
adpc = np.array(h5f['apogee_dist'])
apogeeRA = np.array(h5f['RA'])
apogeeDEC = np.array(h5f['DEC'])
adpc_err = np.array(h5f['apogee_dist_err'])

Kmag = np.array(h5f['K'], dtype=np.float32)
astronn_pc, astronn_pc_err = fakemag_to_pc(pred[:, 0], Kmag, pred_err[:, 0])
astroNN_uncerpre = astronn_pc_err.value / (astronn_pc.value + 1e-8)
good_idx_astroNN = [(astroNN_uncerpre < 1.0) & (astronn_pc.value > 0.)]

# use astropy coordinates tranformation
c = apycoords.SkyCoord(ra=apogeeRA[good_idx_astroNN]*u.deg, dec=apogeeDEC[good_idx_astroNN]*u.deg, 
                       distance=astronn_pc[good_idx_astroNN], frame='icrs')

# setup a MWPlot instance
# use galactic coordinates because Gaia observations are from Earth
plot_instance = MWPlot(radius=20*u.kpc, unit=u.kpc, coord='galactic')

plot_instance.s = 1.0  # make the scatter points bigger
plot_instance.dpi = 50 # increase this to increase image quality
plot_instance.imalpha = 0.5
plot_instance.cmap = 'plasma'
plot_instance.clim = (0, 0.5)
# plot, need to flip the sign of x because astropy is left-handed but mw_plot is right-handed
plot_instance.mw_plot(-c.galactic.cartesian.x, c.galactic.cartesian.y, [astroNN_uncerpre[good_idx_astroNN], 'astroNN Precentage Uncertainty'],
                   'astroNN Bayesian Neural Network Distance Prediction before Gaia DR2')

# Save the figure
# plot_instance.savefig(file='gaia_dr2_astronn.png')



In [4]:
from astroNN.gaia import fakemag_to_pc
from mw_plot import MWPlot
from astroNN.gaia import tgas_load
from astropy import units as  u
import astropy.coordinates as apycoords
from astropy.io import fits
import h5py
import numpy as np

pred = np.load('_astronn_dist.npy')
pred_err = np.load('_astronn_dist_err.npy')

h5f = h5py.File(f'apogee4gaia_dr2.h5', 'r')
adpc = np.array(h5f['apogee_dist'])
apogeeRA = np.array(h5f['RA'])
apogeeDEC = np.array(h5f['DEC'])
adpc_err = np.array(h5f['apogee_dist_err'])

Kmag = np.array(h5f['K'], dtype=np.float32)
astronn_pc, astronn_pc_err = fakemag_to_pc(pred[:, 0], Kmag, pred_err[:, 0])
astroNN_uncerpre = astronn_pc_err.value / (astronn_pc.value + 1e-8)
good_idx_astroNN = [(astroNN_uncerpre < 1.0) & (astronn_pc.value > 0.)]

# use astropy coordinates tranformation
c = apycoords.SkyCoord(ra=apogeeRA[good_idx_astroNN]*u.deg, dec=apogeeDEC[good_idx_astroNN]*u.deg, 
                       distance=astronn_pc[good_idx_astroNN], frame='icrs')

# setup a MWPlot instance
# use galactic coordinates because Gaia observations are from Earth
plot_instance = MWPlot(mode='edge-on', radius=10*u.kpc, unit=u.kpc, coord='galactic')

plot_instance.s = 1.0  # make the scatter points bigger
plot_instance.imalpha = .5
plot_instance.dpi = 50 # increase this to increase image quality
plot_instance.cmap = 'plasma'
plot_instance.clim = (0, 0.5)
# plot, need to flip the sign of x because astropy is left-handed but mw_plot is right-handed
plot_instance.mw_plot(-c.galactic.cartesian.x, c.galactic.cartesian.z, [astroNN_uncerpre[good_idx_astroNN], 'astroNN Precentage Uncertainty'],
                   'astroNN Bayesian Neural Network Distance Prediction before Gaia DR2')

# Save the figure
# plot_instance.savefig(file='gaia_dr2_astronn_edgeon.png')