Calibrating Quicksurvey using Gaussian Mixture Models

The goal of this notebook is to use noiseless spectroscopic simulations from desitarget.select_mock_targets to calibrate a set of Gaussian mixture models (GMMs) which can be used in turn to simulate much larger datasets just at the catalog level, i.e., quicksurvey.


In [1]:
import os
import warnings
import numpy as np
from pkg_resources import resource_filename

In [2]:
from astropy.table import Table
from sklearn.mixture import GaussianMixture as GMM
from sklearn.model_selection import train_test_split

In [3]:
from desiutil.sklearn import GaussianMixtureModel
from desitarget.targetmask import desi_mask

In [4]:
import matplotlib.pyplot as plt
import corner as cn

In [5]:
%matplotlib inline

Simulation variables


In [6]:
seed = 1
rand = np.random.RandomState(seed)
overwrite = True

Define some convenience functions for doing the Gaussian mixture modeling.


In [7]:
def get_mog_bic(X, ncomp, rand=None):
    """Compute the MoG and BIC for a range of Gaussian components."""
    mog = [GMM(n_components=nc, random_state=rand).fit(X) for nc in ncomp]
    bic = [_mog.bic(X) for _mog in mog]
    return mog, np.array(bic)

In [8]:
def qa_bic(ncomp, bic, title='Object', png=None):
    ncompbest = ncomp[np.argmin(bic)]
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(ncomp, bic / 100, marker='s', ls='-')
    ax.set_xlabel('Number of Gaussian Components')
    ax.set_ylabel('Bayesian Information Criterion / 100')
    ax.set_title('{}: Optimal number of components = {:d}'.format(title, ncompbest))
    if png:
        plt.savefig(png)

In [9]:
def build_gmm(X, ncompmin=1, ncompmax=5, target='', 
              rand=None, png=None, overwrite=False):
    """Find the optimal GMM."""

    if rand is None:
        rand = np.random.RandomState()
    ncomp = np.arange(ncompmin, ncompmax)

    gmmfile = resource_filename('desitarget', 'mock/data/quicksurvey_gmm_{}.fits'.format(
        target.lower()))
    if ~os.path.isfile(gmmfile) or overwrite:
        print('Generating a GMM for {}s with N={}-{} components from {} objects.'.format(
            target, ncompmin, ncompmax, X.shape[0]))
        allmog, bic = get_mog_bic(X, ncomp, rand=rand)
        qa_bic(ncomp, bic, png=png, title=target)    
    
        print('Writing {}'.format(gmmfile))
        mog = allmog[np.argmin(bic)] # minimize the BIC
        GaussianMixtureModel.save(mog, gmmfile)

    # (Re)read the model to get a few more convenience methods.
    print('Reading {}'.format(gmmfile))
    mog = GaussianMixtureModel.load(gmmfile)

    return mog

Read the reference truth catalog.


In [10]:
def read_targets_truth():
    refdir = os.path.join(os.getenv('DESI_ROOT'), 'datachallenge', 
                          'reference_runs', '18.3', 'targets')
    truthfile = os.path.join(refdir, 'truth.fits')
    targetsfile = os.path.join(refdir, 'targets.fits')

    truth = Table.read(truthfile)#.to_pandas()
    print('Read {} objects from {}'.format(len(truth), truthfile))

    targets = Table.read(targetsfile)#.to_pandas()
    print('Read {} objects from {}'.format(len(targets), targetsfile))
    
    return targets, truth

In [11]:
targets, truth = read_targets_truth()
targets


Read 240902 objects from /Users/ioannis/work/desi/datachallenge/reference_runs/18.3/targets/truth.fits
Read 240902 objects from /Users/ioannis/work/desi/datachallenge/reference_runs/18.3/targets/targets.fits
Out[11]:
<Table length=240902>
BRICKIDBRICKNAMEBRICK_OBJIDRADECFLUX_GFLUX_RFLUX_ZFLUX_W1FLUX_W2MW_TRANSMISSION_GMW_TRANSMISSION_RMW_TRANSMISSION_ZMW_TRANSMISSION_W1MW_TRANSMISSION_W2PSFDEPTH_GPSFDEPTH_RPSFDEPTH_ZGALDEPTH_GGALDEPTH_RGALDEPTH_ZPSFDEPTH_W1PSFDEPTH_W2SHAPEDEV_RSHAPEDEV_E1SHAPEDEV_E2SHAPEEXP_RSHAPEEXP_E1SHAPEEXP_E2SUBPRIORITYTARGETIDDESI_TARGETBGS_TARGETMWS_TARGETHPXPIXEL
int32str8int32float64float64float32float32float32float32float32float32float32float32float32float32float32float32float32float32float32float32float32float32float32float32float32float32float32float32float64int64int64int64int64int64
52611526p2820152.76292419428.31258392334.8741912.402221.592916.472510.94740.9401070.959250.976810.996470.99783152.48077.726811.8706857.54413.18262.511890.02767320.438591-0.000956611-0.0003142080.0005938370.4524960.00100628-0.001022340.35681035594228823039821794508811529215046068469766553700
52611524p2801152.56385803228.102775573721.616737.297255.960251.405438.10550.930630.9527260.9730570.9958930.99747652.48077.726811.8706857.54413.18262.511890.02767320.4385913.157210.0556914-0.0006845530.866786-0.106927-0.02301060.568604199297288230398217945089115292150460684697613107400
52611526p2852152.63650512728.547132492118.822748.33684.638176.405451.42960.9481480.9647690.9799760.9969560.9981352.48077.726811.8706857.54413.18262.511890.02767320.43859153.47240.164367-0.08890530.393993-0.256706-0.01003870.93292545864288230398217945090115292150460684697613107400
52611523p2823152.45303344728.322896957417.327469.1105146.392176.067125.940.9397460.9590020.9766670.9964490.99781752.48077.726811.8706857.54413.18262.511890.02767320.4385910.38233-0.17208-0.170571-0.000191608-0.00332903-0.003308820.588079222788288230398217945091115292150460684697613107400
52611529p2824152.82136535628.3377704621.970059.6219521.381835.962627.76750.937110.9571890.9756260.9962880.99771952.48077.726811.8706857.54413.18262.511890.02767320.438591-7.74865e-050.002077730.0005942461.116040.119471-0.2213810.17543431523828823039821794509211529215046068469766553700
52611526p2825152.51086425828.292882919310.709422.821334.59132.846426.51920.9391850.9586160.9764460.9964140.99779652.48077.726811.8706857.54413.18262.511890.02767320.4385911.983-0.268770.390211-0.0002069435.04955e-054.57717e-050.365531304502288230398217945093115292150460684697613107400
52611526p2826152.76393127428.16834640514.240337.631371.891977.245754.48910.9313390.9532140.9733390.9959360.99750252.48077.726811.8706857.54413.18262.511890.02767320.4385910.0007775390.001660610.0003513652.592960.0972936-0.3126240.951183062125288230398217945094115292150460684697613107400
52611526p2857152.55876159728.39123916633.4756910.901721.269829.089520.2450.9422180.96070.9776430.9965980.9979152.48077.726811.8706857.54413.18262.511890.02767320.438591-0.0005561910.000258831-0.0009254251.465120.1812070.214680.16704683354628823039821794509511529215046068469766553700
52611526p2828152.77514648428.18267440814.951165.0984145.076204.618152.0960.9317530.95350.9735030.9959610.99751852.48077.726811.8706857.54413.18262.511890.02767320.4385910.00235169-0.000887671-0.0004311172.068470.08831140.1047880.680662990024288230398217945096115292150460684697613107400
52611526p2829152.74696350128.35470962524.5832314.586128.678429.362320.7640.9395780.9588870.9766010.9964380.99781152.48077.726811.8706857.54413.18262.511890.02767320.438591-0.000417875-0.000383122-0.0002524182.063990.0790488-0.4263180.53303319696828823039821794509711529215046068469766553700
.........................................................................................................
56541496p3604978149.50962829636.0923004151.503161.981331.783432.813746.306960.9709030.9803060.9888450.9983110.99896252.48077.726811.8706857.54413.18262.511890.02767320.4385910.00.00.00.00.00.00.36668922993288230399866311538262148000
56541496p3604979149.76942443836.00520324712.007122.038852.32.835217.798550.9756750.9835490.9906880.9985910.99913552.48077.726811.8706857.54413.18262.511890.02767320.4385910.00.00.00.00.00.00.134315603409288230399866311539262148000
56541498p3624980149.9043579136.20254898074.274535.815195.6488512.074420.12660.9740920.9824730.9900770.9984980.99907752.48077.726811.8706857.54413.18262.511890.02767320.4385910.00.00.00.00.00.00.347344018143288230399866311540262148000
56541498p3524981149.93666960235.312990553840.265826.281314.38773.350410.01296260.9672650.977830.9874360.9980960.9988352.48077.726811.8706857.54413.18262.511890.02767320.4385910.00.00.00.00.00.00.6317641463662882303998663115412305843026393563136020
56541498p3554982149.89114810735.438355854813.74919.869815.589921.084080.6447390.9675340.9780130.987540.9981120.9988452.48077.726811.8706857.54413.18262.511890.02767320.4385910.00.00.00.00.00.00.3371119008872882303998663115422305843009213693952020
56541495p3524983149.69067802135.280059385211.851715.299716.04014.283482.334620.9775220.9848020.99140.9986990.99920152.48077.726811.8706857.54413.18262.511890.02767320.4385910.00.00.00.00.00.00.1154579910262882303998663115432305843009213693952020
56541498p3524984150.0116451635.283350778211.04199.193516.2738235.388363.72570.9653150.9765010.9866790.9979810.9987652.48077.726811.8706857.54413.18262.511890.02767320.4385910.00.00.00.00.00.00.2002665537022882303998663115442305843009213693952020
56541497p3574985149.66975320635.653774715321.451130.286233.80477.889514.986120.9754590.9834020.9906050.9985780.99912752.48077.726811.8706857.54413.18262.511890.02767320.4385910.00.00.00.00.00.00.3239154367552882303998663115452305843009213693952020
56541492p3554986149.32280318335.50265783111.844915.25815.69711.964081.877810.9715550.9807490.9890970.9983490.99898652.48077.726811.8706857.54413.18262.511890.02767320.4385910.00.00.00.00.00.00.7278300098972882303998663115462305843009213693952020
56541492p3554987149.32341047235.502562712511.799115.106215.53383.170262.143590.9715580.9807510.9890980.9983490.99898652.48077.726811.8706857.54413.18262.511890.02767320.4385910.00.00.00.00.00.00.2947943720922882303998663115472305843009213693952020

In [12]:
truth


Out[12]:
<Table length=240902>
TARGETIDMOCKIDCONTAM_TARGETTRUEZTRUESPECTYPETEMPLATETYPETEMPLATESUBTYPETEMPLATEIDSEEDMAGVDISPFLUX_GFLUX_RFLUX_ZFLUX_W1FLUX_W2OIIFLUXHBETAFLUXTEFFLOGGFEH
int64int64int64float32str10str10str10int32int64float32float32float32float32float32float32float32float32float32float32float32float32
2882303982179450881559431200.134829GALAXYBGS3627148285763219.810461.53635.2449613.027122.072317.045611.0778-1.00.0-1.0-1.0-1.0
2882303982179450891971350600.131031GALAXYBGS3100184751632418.553661.536323.236539.164357.429952.143738.0304-1.01.62485e-15-1.0-1.0-1.0
2882303982179450901792872600.145801GALAXYBGS1281115918852618.351461.536319.850150.108486.518174.636251.4383-1.00.0-1.0-1.0-1.0
2882303982179450911976207900.295097GALAXYBGS5168166423978417.993261.536318.452471.9985149.989178.462126.497-1.00.0-1.0-1.0-1.0
2882303982179450921397908900.404595GALAXYBGS607723658221220.154761.53632.099789.9580921.838936.39628.0412-1.00.0-1.0-1.0-1.0
2882303982179450931645920000.264106GALAXYBGS1126199116253719.117661.536311.386623.789435.423933.640526.4355-1.06.34972e-16-1.0-1.0-1.0
2882303982179450941852243400.124415GALAXYBGS543126639481818.612461.536315.30539.514973.938579.356554.7434-1.01.20094e-16-1.0-1.0-1.0
2882303982179450951784331600.32049GALAXYBGS89146563407619.959361.53633.7376211.39821.868327.329420.5494-1.08.81773e-17-1.0-1.0-1.0
2882303982179450961644018700.294273GALAXYBGS3795171574794818.060361.536316.036468.1686149.188206.879152.196-1.00.0-1.0-1.0-1.0
2882303982179450971296843100.206353GALAXYBGS67411112988619.658761.53634.8360115.321729.313529.373420.7506-1.00.0-1.0-1.0-1.0
...............................................................
288230399866311538129845501.26878QSOQSO1238403679522.0308-1.01.540591.982641.961873.590396.32289-1.0-1.0-1.0-1.0-1.0
288230399866311539123092701.64302QSOQSO2238403679521.7264-1.02.03922.036552.255734.232497.84327-1.0-1.0-1.0-1.0-1.0
288230399866311540120554301.38468QSOQSO3238403679520.8983-1.04.372045.870575.6954212.156620.079-1.0-1.0-1.0-1.0-1.0
2882303998663115413368704.73661e-05WDWDDA963207035011018.422-1.041.62726.809614.52541.298990.688422-1.0-1.027000.08.5-1.0
28823039986631154238968200.000110076WDWDDA74151854322219.612-1.014.16910.14286.004720.6525230.353166-1.0-1.016750.08.0-1.0
288230399866311543860080-0.000210479WDWDDA27180067934019.832-1.012.156515.529315.96023.606372.02437-1.0-1.06500.08.25-1.0
28823039986631154438952400.000193467WDWDDB22212715190019.843-1.011.42939.425636.3804634.048163.9958-1.0-1.012000.08.0-1.0
2882303998663115453899640-3.33564e-06WDWDDA4131407324219.202-1.021.976630.879634.14738.658434.92947-1.0-1.06000.08.0-1.0
288230399866311546860730-5.00346e-06WDWDDA27145846440319.832-1.012.159715.532215.97373.608332.02574-1.0-1.06500.08.25-1.0
288230399866311547860720-1.10076e-05WDWDDA2758368567819.832-1.012.159615.532115.97343.608282.0257-1.0-1.06500.08.25-1.0

Define some wrapper functions for reading the data and generating QA.


In [13]:
def get_data(target='ELG', dataframe=False, Nmax=None):
    """Build the data matrix."""
    from desitarget.targetmask import desi_mask, mws_mask, bgs_mask
    
    if target == 'LYA':
        indx = ( ((targets['DESI_TARGET'].data & desi_mask.QSO) != 0) *
                 (np.char.strip(truth['TEMPLATESUBTYPE']) == 'LYA') )
    else:
        if 'BGS' in target:
            indx = ( ((targets['DESI_TARGET'] & desi_mask.BGS_ANY) != 0) *
                    (np.char.strip(truth['TEMPLATETYPE']) == 'BGS') )
        else:
            indx = ( ((targets['DESI_TARGET'] & desi_mask.mask(target)) != 0) *
                    (np.char.strip(truth['TEMPLATETYPE']) == target) )
                
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        gmag = 22.5 - 2.5 * np.log10(truth['FLUX_G'][indx])
        rmag = 22.5 - 2.5 * np.log10(truth['FLUX_R'][indx])
        zmag = 22.5 - 2.5 * np.log10(truth['FLUX_Z'][indx])
        gr = - 2.5 * np.log10(truth['FLUX_G'][indx] / truth['FLUX_R'][indx])
        rz = - 2.5 * np.log10(truth['FLUX_R'][indx] / truth['FLUX_Z'][indx])
        zW1 = - 2.5 * np.log10(truth['FLUX_Z'][indx] / truth['FLUX_W1'][indx])
        W1W2 = - 2.5 * np.log10(truth['FLUX_W1'][indx] / truth['FLUX_W2'][indx])
        oii = np.log10(1e17 * truth['OIIFLUX'][indx])
        
    if dataframe:
        import pandas as pd
        df = pd.DataFrame()
        df['g - r'] = gr
        df['r - z'] = rz
        df['z - W1'] = zW1
        df['W1 - W2'] = W1W2
        if target == 'ELG':
            df['r'] = rmag
            df['[OII]'] = oii
        elif target == 'LRG':
            df['z'] = zmag
        elif target == 'QSO' or target == 'LYA':
            df['g'] = gmag
            
        if Nmax:
            df = df[:Nmax]
            
        return df
    else:
        nobj = len(indx)
        X = np.vstack( (gr, rz, zW1, W1W2) )
        if target == 'ELG':
            X = np.vstack( (rmag, X, oii) )
        elif target == 'BGS':
            X = np.vstack( (rmag, X) )
        elif target == 'LRG':
            X = np.vstack( (zmag, X) )
        elif target == 'QSO':
            X = np.vstack( (gmag, X) )
        elif target == 'LYA':
            X = np.vstack( (gmag, X) )

        if Nmax:
            X = X[:, :Nmax]
            
        return X.T

In [14]:
def qa_corner(Xdata, Xsample, target=''):
    if target == 'ELG':
        labels = ('r', 'g - r', 'r - z', 'z - W1', 'W1 - W2', r'log$_{10}$ [OII]-17')
    elif target == 'BGS':
        labels = ('r', 'g - r', 'r - z', 'z - W1', 'W1 - W2')
    elif target == 'LRG':
        labels = ('z', 'g - r', 'r - z', 'z - W1', 'W1 - W2')
    elif target == 'QSO':
        labels = ('g', 'g - r', 'r - z', 'z - W1', 'W1 - W2')
    elif target == 'LYA':
        labels = ('g', 'g - r', 'r - z', 'z - W1', 'W1 - W2')
    
    fig = cn.corner(Xdata, labels=labels, label_kwargs={"fontsize": 14}, 
                    show_titles=True, title_kwargs={"fontsize": 12},
                    color='k')
    fig.suptitle('{} Validation Data (grayscale) + GMM Samples (green points)'.format(target.upper()))
    
    nobj, ndim = Xdata.shape
    axes = np.array(fig.axes).reshape((ndim, ndim))
    for yi in range(ndim):
        for xi in range(yi):
            ax = axes[yi, xi]
            ax.scatter(Xsample[:, xi], Xsample[:, yi], marker='s', 
                       color='g', s=5, alpha=0.5)

In [15]:
def train_and_validate(target, Nsample=500, overwrite=False):
    X = get_data(target.upper(), Nmax=20000)
    Xtrain, Xvalidate = train_test_split(X, test_size=0.5, random_state=rand)
    
    mog = build_gmm(Xtrain, ncompmin=3, ncompmax=50, overwrite=overwrite, 
                    target=target, rand=rand, png=None)
    
    Xsample = mog.sample(Nsample, random_state=rand)
    qa_corner(Xvalidate, Xsample, target=target)
    
    return mog

BGS


In [16]:
%time mog = train_and_validate('BGS', Nsample=500, overwrite=overwrite)


Generating a GMM for BGSs with N=3-50 components from 10000 objects.
Writing /Users/ioannis/repos/desihub/desitarget/py/desitarget/mock/data/quicksurvey_gmm_bgs.fits
Reading /Users/ioannis/repos/desihub/desitarget/py/desitarget/mock/data/quicksurvey_gmm_bgs.fits
CPU times: user 5min 24s, sys: 5.63 s, total: 5min 30s
Wall time: 1min 24s

LYAs

Train a GMM on g, g-r, r-z, z-W1, and W1-W2.


In [17]:
%time mog = train_and_validate('LYA', Nsample=500, overwrite=overwrite)


Generating a GMM for LYAs with N=3-50 components from 1044 objects.
Writing /Users/ioannis/repos/desihub/desitarget/py/desitarget/mock/data/quicksurvey_gmm_lya.fits
Reading /Users/ioannis/repos/desihub/desitarget/py/desitarget/mock/data/quicksurvey_gmm_lya.fits
CPU times: user 47.3 s, sys: 348 ms, total: 47.6 s
Wall time: 12.9 s

QSOs

Train a GMM on g, g-r, r-z, z-W1, and W1-W2.


In [18]:
%time mog = train_and_validate('QSO', Nsample=500, overwrite=overwrite)


Generating a GMM for QSOs with N=3-50 components from 3444 objects.
Writing /Users/ioannis/repos/desihub/desitarget/py/desitarget/mock/data/quicksurvey_gmm_qso.fits
Reading /Users/ioannis/repos/desihub/desitarget/py/desitarget/mock/data/quicksurvey_gmm_qso.fits
CPU times: user 2min 30s, sys: 1.83 s, total: 2min 32s
Wall time: 39.1 s