PanSTARRS - WISE cross-match: Maximum Likelihood


In [1]:
import numpy as np
from astropy.table import Table, join
from astropy import units as u
from astropy.coordinates import SkyCoord, search_around_sky
import pickle
from tqdm import tnrange, tqdm_notebook

In [2]:
from IPython.display import clear_output
from mltier1 import parallel_process, SingleMLEstimator

In [3]:
%load_ext autoreload

In [4]:
%autoreload

In [5]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

Load the data

Load the catalogues


In [6]:
panstarrs = Table.read("panstarrs_u2.fits")

In [7]:
wise = Table.read("wise_u2.fits")

In [8]:
len(panstarrs)


Out[8]:
21650894

In [9]:
len(wise)


Out[9]:
13219700

Coordinates

As we will use the coordinates to make a cross-match we to load them


In [10]:
coords_panstarrs = SkyCoord(panstarrs['raMean'], panstarrs['decMean'], unit=(u.deg, u.deg), frame='icrs')

In [11]:
coords_wise = SkyCoord(wise['raWise'], wise['decWise'], unit=(u.deg, u.deg), frame='icrs')

Import the ML parameeters


In [12]:
bin_list, center, q0, n_m, q_m = pickle.load(open("pw_params.pckl", "rb"))

Define the main functions

We define the main functions used in the ML computing


In [13]:
likelihood_ratio = SingleMLEstimator(q0, n_m, q_m, center)

The following function could be used to get the sigma using the errors in the two optical catalogues but it may underestimate the error (not used)


In [14]:
def get_sigma(ra1, dec1, ra1err, dec1err, ra2, dec2, ra2err, dec2err):
    """Input positions in degrees.
    Errors in arcsecs
    Output in arcsecs
    """
    cosadj = np.cos(np.deg2rad(0.5*(dec1 + dec2)))
    phi = np.arctan2((dec2 - dec1), ((ra2 - ra1)*cosadj))
    sigma = np.pi - phi
    err1squared  = (ra1err * np.cos(sigma))**2 + (dec1err * np.sin(sigma))**2
    err2squared  = (ra2err * np.cos(phi))**2 + (dec2err * np.sin(phi))**2
    return np.sqrt(err1squared + err2squared)

Compute the ML ratio


In [15]:
radius = 15

In [16]:
idx_wise, idx_panstarrs, d2d, d3d = search_around_sky(
    coords_wise, coords_panstarrs, radius*u.arcsec)

In [17]:
idx_wise_unique = np.unique(idx_wise)

In [18]:
wise["lr"] = np.nan                   # Likelihood ratio
wise["lr_dist"] = np.nan              # Distance to the selected source
wise["lr_panstarrs_index"] = np.nan   # Index of the PanSTARRS source

In [19]:
total_sources = len(idx_wise_unique)

In [20]:
total_sources


Out[20]:
11539194

In [21]:
panstarrs_aux_index = np.arange(len(panstarrs))

In [22]:
def ml(i):
    idx_0 = idx_panstarrs[idx_wise == i]
    d2d_0 = d2d[idx_wise == i]
    i_mag = panstarrs["i"][idx_0]
#     sigma = get_sigma(wise["ra"][i], 
#                       wise["dec"][i], 
#                       wise["sigra"][i], 
#                       wise["sigdec"][i],
#                       panstarrs[idx_0]["raMean"][idx_0],
#                       panstarrs[idx_0]["decMean"][idx_0],
#                       panstarrs[idx_0]["raMeanErr"][idx_0],
#                       panstarrs[idx_0]["decMeanErr"][idx_0],
#              )
    sigma = 1.
    lr_0 = likelihood_ratio(i_mag, d2d_0.arcsec, sigma)
    chosen_index = np.argmax(lr_0)
    
    result = [panstarrs_aux_index[idx_0[chosen_index]], # Index
              (d2d_0.arcsec)[chosen_index],             # distance
              lr_0[chosen_index]]                       # LR
    
    return result

In [ ]:
step_size = 100000
nsteps = total_sources//step_size + 1
res = []
for k in tnrange(nsteps, desc="Blocks"):
    low_limit = k*step_size
    high_limit = (k+1)*step_size
    res += parallel_process(idx_wise_unique[low_limit:high_limit], 
                            ml, 
                            n_jobs=8,
                            notebook=True)

In [24]:
len(res)


Out[24]:
11539194

In [25]:
(wise["lr_panstarrs_index"][idx_wise_unique], 
 wise["lr_dist"][idx_wise_unique], 
 wise["lr"][idx_wise_unique]) = list(map(list, zip(*res)))

Plot the ML results

The auxiliary column "lr_pc" will have a 0 value for all non-matches and will be used to get the threshold in the ML parameter.


In [26]:
wise["lr_pc"] = wise["lr"]
wise["lr_pc"][np.isnan(wise["lr_pc"])] = 0

In [27]:
threshold = np.percentile(wise["lr_pc"], 100*(1 - q0))

In [28]:
threshold


Out[28]:
6.4306303253655415

In [29]:
plt.rcParams["figure.figsize"] = (15,6)
subplot(1,2,1)
hist(wise[~np.isnan(wise["lr_pc"])]["lr_pc"], bins=200)
vlines([threshold], 0, 150000)
ylim([0,150000])
subplot(1,2,2)
hist(np.log10(wise[~np.isnan(wise["lr_pc"])]["lr_pc"]+1), bins=200)
vlines(np.log10(threshold+1), 0, 150000)
ylim([0,150000]);


Combine data

First we discard the matches below the threshold


In [30]:
wise["lr_index"] = wise["lr_panstarrs_index"]
wise["lr_index"][wise["lr_pc"] < threshold] = np.nan

We combine the two catalogues using an outer join wich maintains all the data for the two catalogues


In [31]:
panstarrs["lr_index"] = np.arange(len(panstarrs)).astype(float)

In [32]:
combined = join(wise, panstarrs, join_type='outer', keys='lr_index')

Save catalogue


In [33]:
combined['ra'] = combined['raMean']
combined['dec'] = combined['decMean']
combined['raErr'] = combined['raMeanErr']
combined['decErr'] = combined['decMeanErr']

combined['ra'][np.isnan(combined['raMean'])] = combined['raWise'][np.isnan(combined['raMean'])]
combined['dec'][np.isnan(combined['decMean'])] = combined['decWise'][np.isnan(combined['decMean'])]
combined['raErr'][np.isnan(combined['raMean'])] = combined['raWiseErr'][np.isnan(combined['raMean'])]
combined['decErr'][np.isnan(combined['decMean'])] = combined['decWiseErr'][np.isnan(combined['decMean'])]

Important step to solve problems with the default values for the columns. Check later why we have to do that.


In [34]:
for col in ["raMean", "decMean", "raMeanErr", "decMeanErr",
            "raWise", "decWise", "raWiseErr", "decWiseErr",
            "ra", "dec", "raErr", "decErr"]:
    combined[col].fill_value = 1e+20

In [35]:
columns_save = ['AllWISE', 'objID', 'ra', 'dec', 'raErr', 'decErr', 
                'W1mag', 'W1magErr', 'i', 'iErr']

In [36]:
combined[columns_save].write('pw.fits', format="fits")

We can also save a version with all the data


In [37]:
np.array(combined.colnames)


Out[37]:
array(['AllWISE', 'raWise', 'decWise', 'raWiseErr', 'decWiseErr', 'W1mag',
       'W1magErr', 'lr', 'lr_dist', 'lr_panstarrs_index', 'lr_pc',
       'lr_index', 'objID', 'raMean', 'decMean', 'raMeanErr', 'decMeanErr',
       'i', 'iErr', 'ra', 'dec', 'raErr', 'decErr'],
      dtype='<U18')

In [38]:
combined.write('pw_lrdata.fits', format="fits")

In [ ]: