The goal of this notebook is to use noiseless spectroscopic simulations from desitarget.select_mock_targets
to calibrate a set of Gaussian mixture models (GMMs) which can be used in turn to simulate much larger datasets just at the catalog level, i.e., quicksurvey.
In [1]:
import os
import warnings
import numpy as np
from pkg_resources import resource_filename
In [2]:
from astropy.table import Table
from sklearn.mixture import GaussianMixture as GMM
from sklearn.model_selection import train_test_split
In [3]:
from desiutil.sklearn import GaussianMixtureModel
from desitarget.targetmask import desi_mask
In [4]:
import matplotlib.pyplot as plt
import corner as cn
In [5]:
%matplotlib inline
In [6]:
seed = 1
rand = np.random.RandomState(seed)
overwrite = True
In [7]:
def get_mog_bic(X, ncomp, rand=None):
"""Compute the MoG and BIC for a range of Gaussian components."""
mog = [GMM(n_components=nc, random_state=rand).fit(X) for nc in ncomp]
bic = [_mog.bic(X) for _mog in mog]
return mog, np.array(bic)
In [8]:
def qa_bic(ncomp, bic, title='Object', png=None):
ncompbest = ncomp[np.argmin(bic)]
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(ncomp, bic / 100, marker='s', ls='-')
ax.set_xlabel('Number of Gaussian Components')
ax.set_ylabel('Bayesian Information Criterion / 100')
ax.set_title('{}: Optimal number of components = {:d}'.format(title, ncompbest))
if png:
plt.savefig(png)
In [9]:
def build_gmm(X, ncompmin=1, ncompmax=5, target='',
rand=None, png=None, overwrite=False):
"""Find the optimal GMM."""
if rand is None:
rand = np.random.RandomState()
ncomp = np.arange(ncompmin, ncompmax)
gmmfile = resource_filename('desitarget', 'mock/data/quicksurvey_gmm_{}.fits'.format(
target.lower()))
if ~os.path.isfile(gmmfile) or overwrite:
print('Generating a GMM for {}s with N={}-{} components from {} objects.'.format(
target, ncompmin, ncompmax, X.shape[0]))
allmog, bic = get_mog_bic(X, ncomp, rand=rand)
qa_bic(ncomp, bic, png=png, title=target)
print('Writing {}'.format(gmmfile))
mog = allmog[np.argmin(bic)] # minimize the BIC
GaussianMixtureModel.save(mog, gmmfile)
# (Re)read the model to get a few more convenience methods.
print('Reading {}'.format(gmmfile))
mog = GaussianMixtureModel.load(gmmfile)
return mog
In [10]:
def read_targets_truth():
refdir = os.path.join(os.getenv('DESI_ROOT'), 'datachallenge',
'reference_runs', '18.3', 'targets')
truthfile = os.path.join(refdir, 'truth.fits')
targetsfile = os.path.join(refdir, 'targets.fits')
truth = Table.read(truthfile)#.to_pandas()
print('Read {} objects from {}'.format(len(truth), truthfile))
targets = Table.read(targetsfile)#.to_pandas()
print('Read {} objects from {}'.format(len(targets), targetsfile))
return targets, truth
In [11]:
targets, truth = read_targets_truth()
targets
Out[11]:
In [12]:
truth
Out[12]:
In [13]:
def get_data(target='ELG', dataframe=False, Nmax=None):
"""Build the data matrix."""
from desitarget.targetmask import desi_mask, mws_mask, bgs_mask
if target == 'LYA':
indx = ( ((targets['DESI_TARGET'].data & desi_mask.QSO) != 0) *
(np.char.strip(truth['TEMPLATESUBTYPE']) == 'LYA') )
else:
if 'BGS' in target:
indx = ( ((targets['DESI_TARGET'] & desi_mask.BGS_ANY) != 0) *
(np.char.strip(truth['TEMPLATETYPE']) == 'BGS') )
else:
indx = ( ((targets['DESI_TARGET'] & desi_mask.mask(target)) != 0) *
(np.char.strip(truth['TEMPLATETYPE']) == target) )
with warnings.catch_warnings():
warnings.simplefilter('ignore')
gmag = 22.5 - 2.5 * np.log10(truth['FLUX_G'][indx])
rmag = 22.5 - 2.5 * np.log10(truth['FLUX_R'][indx])
zmag = 22.5 - 2.5 * np.log10(truth['FLUX_Z'][indx])
gr = - 2.5 * np.log10(truth['FLUX_G'][indx] / truth['FLUX_R'][indx])
rz = - 2.5 * np.log10(truth['FLUX_R'][indx] / truth['FLUX_Z'][indx])
zW1 = - 2.5 * np.log10(truth['FLUX_Z'][indx] / truth['FLUX_W1'][indx])
W1W2 = - 2.5 * np.log10(truth['FLUX_W1'][indx] / truth['FLUX_W2'][indx])
oii = np.log10(1e17 * truth['OIIFLUX'][indx])
if dataframe:
import pandas as pd
df = pd.DataFrame()
df['g - r'] = gr
df['r - z'] = rz
df['z - W1'] = zW1
df['W1 - W2'] = W1W2
if target == 'ELG':
df['r'] = rmag
df['[OII]'] = oii
elif target == 'LRG':
df['z'] = zmag
elif target == 'QSO' or target == 'LYA':
df['g'] = gmag
if Nmax:
df = df[:Nmax]
return df
else:
nobj = len(indx)
X = np.vstack( (gr, rz, zW1, W1W2) )
if target == 'ELG':
X = np.vstack( (rmag, X, oii) )
elif target == 'BGS':
X = np.vstack( (rmag, X) )
elif target == 'LRG':
X = np.vstack( (zmag, X) )
elif target == 'QSO':
X = np.vstack( (gmag, X) )
elif target == 'LYA':
X = np.vstack( (gmag, X) )
if Nmax:
X = X[:, :Nmax]
return X.T
In [14]:
def qa_corner(Xdata, Xsample, target=''):
if target == 'ELG':
labels = ('r', 'g - r', 'r - z', 'z - W1', 'W1 - W2', r'log$_{10}$ [OII]-17')
elif target == 'BGS':
labels = ('r', 'g - r', 'r - z', 'z - W1', 'W1 - W2')
elif target == 'LRG':
labels = ('z', 'g - r', 'r - z', 'z - W1', 'W1 - W2')
elif target == 'QSO':
labels = ('g', 'g - r', 'r - z', 'z - W1', 'W1 - W2')
elif target == 'LYA':
labels = ('g', 'g - r', 'r - z', 'z - W1', 'W1 - W2')
fig = cn.corner(Xdata, labels=labels, label_kwargs={"fontsize": 14},
show_titles=True, title_kwargs={"fontsize": 12},
color='k')
fig.suptitle('{} Validation Data (grayscale) + GMM Samples (green points)'.format(target.upper()))
nobj, ndim = Xdata.shape
axes = np.array(fig.axes).reshape((ndim, ndim))
for yi in range(ndim):
for xi in range(yi):
ax = axes[yi, xi]
ax.scatter(Xsample[:, xi], Xsample[:, yi], marker='s',
color='g', s=5, alpha=0.5)
In [15]:
def train_and_validate(target, Nsample=500, overwrite=False):
X = get_data(target.upper(), Nmax=20000)
Xtrain, Xvalidate = train_test_split(X, test_size=0.5, random_state=rand)
mog = build_gmm(Xtrain, ncompmin=3, ncompmax=50, overwrite=overwrite,
target=target, rand=rand, png=None)
Xsample = mog.sample(Nsample, random_state=rand)
qa_corner(Xvalidate, Xsample, target=target)
return mog
In [16]:
%time mog = train_and_validate('BGS', Nsample=500, overwrite=overwrite)
In [17]:
%time mog = train_and_validate('LYA', Nsample=500, overwrite=overwrite)
In [18]:
%time mog = train_and_validate('QSO', Nsample=500, overwrite=overwrite)
In [19]:
%time mog = train_and_validate('LRG', Nsample=500, overwrite=overwrite)
In [20]:
%time mog = train_and_validate('ELG', Nsample=500, overwrite=overwrite)
In [ ]: