In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from astropy.table import Table
import seaborn as sns
In [2]:
# Setting style
from matplotlib import rcParams
rcParams["font.family"] = "sans-serif"
rcParams["font.sans-serif"] = ["Computer Modern Sans"]
sns.set_style('white')
sns.set_palette("Paired", 2) #colorblind
sns.palplot(sns.color_palette())
In [3]:
# Reading the data, which can be downloaded here:
# http://ict.icrar.org/cutout/G10/dataRelease.php
tb4 = Table.read('../G10CosmosCatv04/G10CosmosCatv04.csv')
# Changing strange column names with spurious D
tb4['IA427_08_ERR'] = tb4['DIA427_08_ERR']
tb4['KC_08_ERR'] = tb4['DKC_08_ERR']
In [4]:
# Show column names
np.array(tb4.colnames)
Out[4]:
In [5]:
# Some sanity checks to see which objects have spectroscopic redshifts.
# Hopefully this should be equivalent to using Z_BEST with the flag Z_USE < 3
# and indeed, we recover similar numbers.
ind_vvds = tb4['Z_VVDS'] > 0
ind_vvds &= tb4['Z_VVDS'] < 4
ind_sdss = tb4['Z_SDSS'] > 0
ind_sdss &= tb4['Z_SDSS'] < 4
ind_sdss &= tb4['Z_ERR_SDSS'] < 0.001
ind_primus = tb4['Z_PRIMUS'] > 0
ind_primus &= tb4['Z_PRIMUS'] < 4
ind_primus &= tb4['ZQUALITY_PRIMUS'] >= 4
ind_zcosmos = tb4['Z_ZCOS'] > 0
ind_zcosmos &= tb4['Z_ZCOS'] < 4
ind_zcosmos &= tb4['ZQUALITY_ZCOS'] >= 4
ind = ind_sdss | ind_vvds | ind_primus | ind_zcosmos
ind.sum(), ind.size
Out[5]:
In [8]:
# Selecting objects with spectroscopic redshifts
ind_goodzspec = tb4['Z_USE'] < 3
ind_goodzspec &= tb4['Z_BEST'] > 0.0001
#ind_goodzspec &= tb4['Z_BEST'] < 0.2
# good extinction
ind_goodebv = tb4['EB_V_08'] > 0
# Not crazy photo-z's (because that would indicate a problem in the photometry)
ind_goodphotoz = (tb4['Z_BEST'] - tb4['ZP_COSMOS2015']) < 0.5
# and galaxies only.
ind_gal = tb4['CLASS_SDSS'] == 'GALAXY'
ind_gal |= tb4['STAR_GALAXY_CLASS'] == 0
ind_gal &= tb4['ZQUALITY_EYE'] != 10
# The total number is:
print((ind_goodzspec & ind_goodebv & ind_goodphotoz & ind_gal).sum())
# Now select the bands by their names, and also copy EBV values.
yr = '_08'
subaru_bands = [
'B', 'V', 'G', 'R', 'I', 'Z',
'NB816', 'IA427', 'IA464', 'IA505', 'IA574',
'IA709', 'IA827', 'NB711', 'IA484', 'IA527',
'IA624', 'IA679', 'IA738', 'IA767']
subaru_abebv = [
4.039, 3.147, 3.738, 2.586, 1.923, 1.436,
1.745, 4.260, 3.843, 3.425, 2.937,
2.289, 1.747, 2.268, 3.621, 3.264,
2.694, 2.430, 2.150, 1.996]
sdss_bands = ['U_SDSS', 'G_SDSS', 'R_SDSS', 'I_SDSS', 'Z_SDSS']
sdss_abebv = [4.239, 3.303, 2.285, 1.698, 1.263]
ctiokpno_bands = ['K']
ctiokpno_abebv = [0.302]
hst_dands = ['F814W']
hst_abebv = [1.536]
ukirt_bands = ['J']
ukirt_abebv = [0.709]
# Which bands are we considering?
bands = subaru_bands + sdss_bands + ctiokpno_bands + hst_dands + ukirt_bands
abebvs = subaru_abebv + sdss_abebv + ctiokpno_abebv + hst_abebv + ukirt_abebv
ZP = 12.5 # arbitrary zero point to obtain reasonnable flux numbers
# Convert magnitudes to fluxes and correct for extinction
for abebv, b in zip(abebvs, bands):
nm = b + yr
nme = b + yr + '_ERR'
themag = tb4[nm]
themag -= tb4['EB_V_08'] * abebv
theflux = 10**(-0.4*(themag - ZP))
thefluxerr = (10.**(.4*tb4[nme])-1.) * theflux
tb4[b+'_FLUX'] = theflux
tb4[b+'_FLUXVAR'] = np.abs(thefluxerr**2)
In [9]:
# Selection for the training and target sets, mostly SNR cuts on top of having spec-z's
ind1 = ind_goodzspec & ind_goodebv & ind_goodphotoz & ind_gal
ind2 = ind_goodzspec & ind_goodebv & ind_goodphotoz & ind_gal
ind1 &= tb4['Z_BEST'] < 1.5
ind2 &= tb4['Z_BEST'] < 1.5
for b in subaru_bands + ctiokpno_bands + hst_dands + ukirt_bands:
ind1 &= np.isfinite(tb4[b+'_FLUX'] / tb4[b+'_FLUXVAR']**0.5)
ind1 &= tb4[b+'_FLUX'] / tb4[b+'_FLUXVAR']**0.5 > 2
for b in ['R_SDSS', 'I_SDSS']:
ind2 &= tb4[b+'_FLUX'] / tb4[b+'_FLUXVAR']**0.5 > 5
for b in sdss_bands:
ind2 &= np.isfinite(tb4[b+'_FLUX'] / tb4[b+'_FLUXVAR']**0.5)
ind2 &= tb4[b+'_FLUX'] / tb4[b+'_FLUXVAR']**0.5 > 1
sel1 = np.where(ind1)[0]
sel2 = np.where(ind2)[0]
# remove duplicates
sel1 = sel1[~np.in1d(sel1, sel2)]
# shuffle, just in case
np.random.shuffle(sel1)
np.random.shuffle(sel2)
print(sel1.size, sel2.size) # should be about 10,000 each!
In [10]:
# Compare training and target: plot N(z) and imag SNR
normed = False
nbins = 40
fig, axs = plt.subplots(1, 2, figsize=(8, 3))
axs[0].hist(tb4['Z_BEST'][sel1], nbins, range=[0, 1.5], histtype='step', lw=2, label='Training', normed=normed)
axs[0].hist(tb4['Z_BEST'][sel2], nbins, range=[0, 1.5], histtype='step', lw=2, label='Target', normed=normed)
axs[0].legend()
axs[0].set_xlabel('Spectroscopic redshift')
axs[0].set_ylabel('$\mathrm{N}_\mathrm{gal}$')
b = 'I_SDSS'
nm = b + '_FLUX'
nme = b + '_FLUXVAR'
ratio1 = tb4[nm][sel1] / tb4[nme][sel1]**0.5
ratio1 = ratio1[np.isfinite(ratio1)]
ratio2 = tb4[nm][sel2] / tb4[nme][sel2]**0.5
ratio2 = ratio2[np.isfinite(ratio2)]
axs[1].hist(ratio1, nbins, histtype='step', lw=2, label='Training', range=[0, 80])
axs[1].hist(ratio2, nbins, histtype='step', lw=2, label='Target', range=[0, 80])
axs[1].legend()
axs[1].set_xlabel('$i$ band signal-to-noise ratio')
axs[1].set_ylabel('$\mathrm{N}_\mathrm{gal}$')
fig.tight_layout()
fig.savefig('../paper/training_vs_target.pdf')
In [13]:
# write a subset to ascii files
maxnum = 15000
names3 = list(np.concatenate([[nm + '_FLUX', nm + '_FLUXVAR'] for nm in bands])) + ['Z_BEST']
tb4[names3][sel1[:maxnum]].write('/Users/bl/Dropbox/repos/Delight/data/galaxies-fluxredshifts.txt',
format='ascii.no_header')
In [14]:
maxnum = 15000
tb4[names3][sel2[:maxnum]].write('/Users/bl/Dropbox/repos/Delight/data/galaxies-fluxredshifts2.txt',
format='ascii.no_header')
In [15]:
# pretty print band + band_var names, for delight input file
' '.join(list(np.concatenate([[nm, nm + '_var'] for nm in bands])) + ['redshift'])
Out[15]:
In [16]:
# write a smaller subset to ascii files
maxnum = 1000
names3 = list(np.concatenate([[nm + '_FLUX', nm + '_FLUXVAR'] for nm in bands])) + ['Z_BEST']
tb4[names3][sel1[:maxnum]].write('/Users/bl/Dropbox/repos/Delight/data/galaxies-fluxredshifts_small.txt',
format='ascii.no_header')
tb4[names3][sel2[:maxnum]].write('/Users/bl/Dropbox/repos/Delight/data/galaxies-fluxredshifts_small2.txt',
format='ascii.no_header')
In [17]:
# print band names
len(bands), ' '.join(bands)
Out[17]:
In [ ]: