False Positive predictions for single/double-transit Kepler signals

The following analysis was executed using commit e4f54288 of the exosyspop module.

Setup


In [1]:
%matplotlib inline
from __future__ import print_function, division

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Set up plot stuff the DFM way:
%run ../peerless/plot_setup.py

# Retrieve the target catalog
!../scripts/peerless-targets

from peerless.catalogs import TargetCatalog, EBCatalog, KICatalog
targets = TargetCatalog().df
targets.index = targets.kepid

# Some targets have missing masses.  Let's artificially add those in, with M=R
bad = targets.mass.isnull()
targets.loc[bad, 'mass'] = targets.loc[bad, 'radius']

# Calculate galactic latitude of target stars, for later use.
from astropy.coordinates import SkyCoord
c = SkyCoord(targets.ra, targets.dec, unit='deg')
targets['b'] = c.galactic.b.deg

# Download (or load) BG star simulation of the middle of the Kepler field, using TRILEGAL, via vespa
try:
    bgstars = pd.read_hdf('bgstars.h5', 'df')
except:
    from vespa.stars.trilegal import get_trilegal    
    get_trilegal('bgstars.h5', 290.6654895, 44.48410232)
    bgstars = pd.read_hdf('bgstars.h5', 'df')


Saving 39036 targets to /Users/tdm/peerless/catalogs/targets.csv

Calibrate EB population parameters to match Kepler EB catalog


In [2]:
from exosyspop.populations import KeplerPowerLawBinaryPopulation

# Initialize population object
ebpop = KeplerPowerLawBinaryPopulation(targets)

# These parameters have been found to work well
min_period = 20
max_period = 25*365.25
fB = 0.25
beta = -1.0
ebpop.set_params(beta=beta, period_min=min_period, period_max=max_period, fB=fB)

# Query to emulate Kepler EB selection: less than 1000 days;
#  both primary and secondary eclipse found, or greater than two of either;
#  duration less than 2.5 days.
q = 'period > {} and period < 1000 and '.format(min_period) +\
    '((n_pri > 0 and n_sec > 0) or ' +\
     '(n_pri > 2 or n_sec > 2)) and (T14_pri < 2.5 or T14_sec < 2.5)'

kep_ebs = EBCatalog().df.query('{} < period < 1000'.format(min_period))
n_exp = len(kep_ebs) * (len(targets)/len(KICatalog().df))
print(n_exp)
plt.hist(np.log10(kep_ebs.period), 
         histtype='step', lw=3, normed=True, label='Observed');
new_obs = ebpop.observe(new_orbits=True).query(q)
plt.hist(np.log10(new_obs.period), label='Simulated',
        histtype='step', lw=3, normed=True);
plt.legend()
print(len(new_obs))


136.401367662
EB: dmag regressor trained, R2=0.999481037605
EB: qR regressor trained, R2=0.999475105147
145
/Users/tdm/anaconda/lib/python2.7/site-packages/matplotlib/figure.py:1744: UserWarning: This figure includes Axes that are not compatible with tight_layout, so its results might be incorrect.
  warnings.warn("This figure includes Axes that are not "

In [3]:
ebpop.params


Out[3]:
{'beta': -1.0,
 'beta_a': 0.8,
 'beta_b': 2.0,
 'fB': 0.25,
 'gamma': 0.3,
 'period_max': 9131.25,
 'period_min': 20,
 'qmin': 0.1}

OK, the above parameters qualitatively match the period distribution well, what about the total number of detections? Let's do a bunch of these observation simulations and confirm it looks good:


In [4]:
n_obs = [len(ebpop.observe(new_orbits=True).query(q)) for i in range(100)]

plt.hist(n_obs, histtype='step', lw=3, bins=10, color='k');
plt.axvline(n_exp, color='r', ls='--', lw=3);


Define EB and BEB populations with these parameters.

Now that we can roughly match the Kepler EB catalog from 20 to 1000d with these EB population parameters, we want to know what's the typical number of single/double eclipse signals.


In [5]:
# Define a BEB population
from exosyspop.populations import TRILEGAL_BGPowerLawBinaryPopulation

ebpop = KeplerPowerLawBinaryPopulation(targets)
bgpop = TRILEGAL_BGPowerLawBinaryPopulation(targets, bgstars, use_ic=True)

In [6]:
# Set params to match above
pars = dict(period_min=min_period, period_max=max_period, beta=beta, fB=fB)
ebpop.set_params(**pars)
bgpop.set_params(**pars)

## Just 
#ebpop._generate_binaries()
#bgpop._generate_binaries()

In [7]:
# Train the regressions; calling explicitly allows to do this with N=1000 rather than N=10000
ebpop._train_trap(N=1000);
bgpop._train_trap(N=1000);


EB: dmag regressor trained, R2=0.999500036883
EB: qR regressor trained, R2=0.99947245989
EB: Depth trained: R2=0.999702660929
EB: Duration trained: R2=0.998343729032
EB: Slope trained: R2=0.983507073922
BGEB: Depth trained: R2=0.998040263779
BGEB: Duration trained: R2=0.992783188554
BGEB: Slope trained: R2=0.989069626741

Simulate detected populations of EBs and BEBs

Here, we simulate how many EBs and BEBs will be typically detected with

  • only primary OR secondary eclipses showing
  • fewer than 3 eclipses
  • SNR > 15
  • duration < 2.5d

In [8]:
n_sim = 100
dfm_q = 'n_pri < 3 and n_sec < 3 and ' +\
        '(((n_pri==0 or snr_pri < 15) and snr_sec > 15) or ' +\
        '((n_sec==0 or snr_sec < 15)  and snr_pri > 15))'
dfm_q += ' and ((T14_pri < 2.5 and n_pri > 0) or (T14_sec < 2.5 and n_sec > 0))'
nobs_eb = [len(ebpop.observe(new_orbits=True, regr_trap=True).query(dfm_q)) for i in range(n_sim)]
nobs_bg = [len(bgpop.observe(new_orbits=True, regr_trap=True).query(dfm_q)) for i in range(n_sim)]

In [26]:
plt.figure(figsize=(8,6))
plt.hist(nobs_eb, histtype='step', lw=3, label='EB', bins=10)
plt.hist(nobs_bg, histtype='step', lw=3, label='BEB', bins=3)
plt.legend();
plt.xlim(xmin=-0.3)
plt.yticks([])
plt.ylabel('Normalized frequency')
plt.xlabel('Number of observed single/double eclipses');
plt.savefig('N_EBs.pdf', bbox_inches='tight')
(np.mean(nobs_eb), np.std(nobs_eb), (np.mean(nobs_bg), np.std(nobs_bg)))


Out[26]:
(7.2300000000000004,
 2.5054141374231924,
 (0.40999999999999998, 0.58472215624174884))

Simulate detected population of planets

As a good sanity check, let's make sure we reproduce the detected population of planet signals, given the population model constrained in the paper.


In [10]:
# Here, defining population to match what was fit for in the paper
from exosyspop.populations import PlanetPopulation

import astropy.constants as const
RJUP = const.R_jup.cgs
RSUN = const.R_sun.cgs
REARTH = const.R_earth.cgs

class LongPeriodPlanetPopulation(PlanetPopulation):
    """
    Poisson Population in two bins: 0.1-0.4 RJup and 0.4-1.0 RJup
    
    From 2 to 25 years, with beta eccentricity distribution
    """
    min_period = 2*365.25
    max_period = 25*365.25
    
    vol1 = (np.log(0.4) - np.log(0.1))*(np.log(max_period) - np.log(min_period))
    vol2 = (np.log(1) - np.log(0.4))*(np.log(max_period) - np.log(min_period))
    
    param_names = ('Gam1', 'Gam2', 'beta_a', 'beta_b')
    default_params = {'Gam1':0.4, 'Gam2':0.18,
                      'beta_a':1.12, 'beta_b':3.09}
    
    def _sample_period(self, N):
        lo = np.log(self.min_period)
        hi = np.log(self.max_period)
        logp = np.random.random(N)*(hi - lo) + lo
        return np.exp(logp)
    
    def _sample_Np(self, N):
        Gam1, Gam2 = self._get_params(['Gam1', 'Gam2'])
        
        N_pl = Gam1*self.vol1 + Gam2*self.vol2 # Sum of both bins is total rate
        return np.random.poisson(N_pl, size=N)
    
    def _sample_Rp(self, N):
        Gam1, Gam2 = self._get_params(['Gam1', 'Gam2'])
        Nexp1 = Gam1*self.vol1
        Nexp2 = Gam2*self.vol2
        
        # Assign radii uniformly in each bin, with appropriate ratio given to each bin
        f1 = Nexp1 / (Nexp1 + Nexp2)
        u = np.random.random(N)
        in_1 = u < f1
        in_2 = u >= f1
        N1 = in_1.sum()
        N2 = in_2.sum()
        
        Rp = np.ones(N)
        lo1, hi1 = np.log(0.1), np.log(0.4)
        lo2, hi2 = np.log(0.4), np.log(1)
        Rp[in_1] = np.exp(np.random.random(N1)*(hi1 - lo1) + lo1)
        Rp[in_2] = np.exp(np.random.random(N2)*(hi2 - lo2) + lo2)
        return Rp * RJUP/RSUN

In [11]:
plpop = LongPeriodPlanetPopulation(targets)

plpop._train_trap(N=1000);
#_ = plpop.observe() # There was a point where this was necessary to avoid a bug...


Planet: Depth trained: R2=0.999350816606
Planet: Duration trained: R2=0.998843932513
Planet: Slope trained: R2=0.978945299494

OK, now let's load up the candidates, so we'll have something to compare to.


In [12]:
candidates = pd.read_csv('../results/fits.csv', index_col=0)

# add a few columns of interest
candidates['s2n'] = candidates.depth*1000/plpop.get_noise(candidates.index, candidates.duration) * np.sqrt(candidates.n_transits)
candidates['rstar'] = targets.ix[candidates.index, 'radius']

# convert depth from ppt to relative 
for p in ['depth', 'depth_uncert_minus', 'depth_uncert_plus']:
    candidates[p] *= 1e-3

# convert from hours to days
for p in ['duration', 'duration_uncert_minus', 'duration_uncert_plus']:
    candidates[p] /= 24. 

candidates[['depth','duration','period','radius','rstar','impact','n_transits','s2n']]


Out[12]:
depth duration period radius rstar impact n_transits s2n
kicid
11709124 0.008777 0.739623 4.264997 0.828526 0.971 0.512708 1 103.742952
9306307 0.060297 0.354141 4.252540 1.220102 0.915 0.639900 1 424.324242
10321319 0.000374 0.701656 5.522778 0.163172 0.941 0.473659 1 15.994137
3239945 0.018865 0.675076 2.932872 0.875794 0.710 0.206589 2 402.787682
8410697 0.005724 0.823828 2.868810 0.698399 1.003 0.153588 2 148.615011
10287723 0.001458 0.395524 4.878759 0.265570 0.726 0.680744 1 26.992640
8426957 0.005522 1.640239 54.248522 1.039960 1.080 0.889371 1 127.168467
8738735 0.001237 1.143376 9.860707 0.354537 1.044 0.283580 1 19.297674
6551440 0.001072 0.452081 4.020863 0.281950 1.096 0.604324 1 20.423233
10602068 0.087133 0.533501 3.159743 2.001200 0.910 0.602741 1 1097.418488
10187159 0.006272 0.491928 4.888803 0.434498 0.910 0.231216 1 58.340024
8800954 0.002967 0.656744 1.927996 0.386359 0.760 0.181905 2 139.611249
3218908 0.005095 0.893745 6.962198 0.514202 0.753 0.239244 1 66.403938
4754460 0.002888 0.663353 5.889479 0.674383 1.131 0.893378 1 28.051786
10842718 0.005119 1.496501 12.689679 0.742417 1.038 0.257651 1 48.734070
8505215 0.001820 0.835830 9.107808 0.277132 0.707 0.283046 1 48.878405

In [13]:
# Check out the period and radius distributions of the planets in this population

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10,4))
ax1.hist(plpop.period, histtype='step', lw=3);
ax1.set_yticks([])
ax2.hist(plpop.radius_B*RSUN/RJUP, histtype='step', lw=3);
ax2.set_yticks([]);



In [14]:
# Emulate detection criteria: fewer than three transits, SNR > 15, duration < 2.5d
dfm_q_pl = 'n_pri < 3 and snr_pri > 15 and T14_pri < 2.5'
dfm_q_pl_nosnr = 'n_pri < 3 and T14_pri < 2.5 and snr_pri > 0'

nobs_pl = [len(plpop.observe(new_orbits=True, regr_trap=True).query(dfm_q_pl)) for i in range(100)]
nobs_pl2 = [len(plpop.observe(new_orbits=True, regr_trap=True).query(dfm_q_pl_nosnr)) for i in range(100)]

plt.hist(nobs_pl, histtype='step', lw=3, label='SNR $>$ 15');
plt.hist(nobs_pl2, histtype='step', lw=3, color='k', ls=':', label='all')
plt.legend(loc='upper right');



In [15]:
# This matches up pretty well with what was actually detected
np.mean(nobs_pl), len(candidates.query('radius < 1.0'))


Out[15]:
(16.579999999999998, 13)

In [16]:
# Of course, this isn't an exact match because the detection/veto criteria
#  are much more complicated than SNR > 15...

# Simulate a large population of observed single/double transits
N = 1000
pl_obs = plpop.get_N_observed(query=dfm_q_pl, N=N)

plt.hist(np.log10(pl_obs.snr_pri), normed=True);
plt.hist(np.log10(candidates.s2n.values), histtype='step', color='r', lw=3, normed=True);


Let's put all the number-of-detections simulations together on the same plot:


In [17]:
plt.figure(figsize=(8,6))
plt.hist(nobs_eb, histtype='step', lw=3, label='EBs', bins=10)#, bins=np.arange(16));
plt.hist(nobs_bg, histtype='step', lw=3, label='BEBs', bins=3)#, bins=np.arange(16));
plt.hist(nobs_pl, histtype='step', lw=3, label='Planets')
plt.legend();
plt.xlim(xmin=-0.3)
plt.yticks([])
plt.ylabel('Normalized frequency')
plt.xlabel('Number of observed single/double eclipses');
plt.savefig('N_predicted.pdf', bbox_inches='tight')


Shape comparison

Now that we know the typical numbers of signals to expect, let's look at their typical shapes, and compare with the observed signals.


In [18]:
def plot_kde(kde, ax=None, dur_range=(0,2.), depth_range=(-3.5,-0.5), 
            cands=candidates, **kwargs):
    if ax is None:
        fig, ax = plt.subplots(1,1)
        
    dur_grid = np.arange(dur_range[0], dur_range[1], 0.025)
    depth_grid = np.arange(depth_range[0], depth_range[1], 0.025)
    xx, yy = np.meshgrid(dur_grid, depth_grid)
    f = kde(np.vstack([xx.ravel(),yy.ravel()])).reshape(xx.shape)
    ax.imshow(f, origin='lower', extent=[dur_range[0], dur_range[1],
                                         depth_range[0], depth_range[1]], 
               aspect='auto', cmap='binary', interpolation='nearest',
               **kwargs)
    ax.set_xlabel('Duration [days]')
    ax.set_ylabel('log$_{10}$(depth)')
    
    if cands is not None:
        xval = cands.duration
        xerr = [cands.duration_uncert_minus, cands.duration_uncert_plus]

        yval = np.log10(cands.depth)
        yerr = [yval - np.log10(cands.depth - cands.depth_uncert_minus),
                np.log10(cands.depth + cands.depth_uncert_plus) - yval]
        
        ax.plot(xval, yval, 'w.', ms=14)
        ax.plot(xval, yval, 'g.', ms=10)
        #ax.errorbar(xval, yval, xerr=xerr, yerr=yerr,
        #            color='g', marker='.', ms=10, ls='none', capsize=0)

In [19]:
# First, simulate large populations of EBs, BEBs
eb_obs = ebpop.get_N_observed(query=dfm_q, N=2000, new_orbits=True)
bg_obs = bgpop.get_N_observed(query=dfm_q, N=200, use_ic=True, new_orbits=True) #takes longer, so use fewer

In [20]:
from scipy.stats import gaussian_kde

eb_kde = gaussian_kde(np.array([eb_obs.observed.dur_pri, eb_obs.observed.logd_pri]))
bg_kde = gaussian_kde(np.array([bg_obs.observed.dur_pri, bg_obs.observed.logd_pri]))
pl_kde = gaussian_kde(np.array([pl_obs.observed.dur_pri, pl_obs.observed.logd_pri]))

In [21]:
fig, axes = plt.subplots(1,3, figsize=(12,4), sharey=True)
plot_kde(eb_kde, ax=axes[0])
plot_kde(bg_kde, ax=axes[1])
plot_kde(pl_kde, ax=axes[2])
axes[1].set_ylabel('')
axes[2].set_ylabel('')
plt.subplots_adjust(wspace=0)
for ax,title in zip(axes, ['EB','BEB','Planet']):
    ax.set_yticks(np.arange(-3,-0.5,0.5))
    ax.annotate(title, xy=(0.95,0.1), xycoords='axes fraction', fontsize=20, ha='right')
    
plt.savefig('shape_kdes.pdf', bbox_inches='tight')


False positive probabilities.

OK, now we can guess the nature of each individual signal, given our predicted rates and the depth-duration shape information.


In [44]:
pts = [candidates.duration, np.log10(candidates.depth)]
L_eb = eb_kde(pts)
L_bg = bg_kde(pts)
L_pl = pl_kde(pts)

pi_eb = 7.2
pi_bg = 0.41
pi_pl = 12 # for example.

norm = L_eb*pi_eb + L_bg*pi_bg + L_pl*pi_pl

candidates['pr_eb'] = L_eb*pi_eb / norm
candidates['pr_bg'] = L_bg*pi_bg / norm
candidates['pr_pl'] = L_pl*pi_pl / norm

In [49]:
candidates[['pr_eb', 'pr_bg', 'pr_pl']].sum()


Out[49]:
pr_eb     2.726062
pr_bg     0.332295
pr_pl    12.941642
dtype: float64

In [46]:
candidates[['radius','depth', 'duration', 'pr_eb', 'pr_bg', 'pr_pl']]


Out[46]:
radius depth duration pr_eb pr_bg pr_pl
kicid
11709124 0.828526 0.008777 0.739623 0.052284 0.010690 9.370254e-01
9306307 1.220102 0.060297 0.354141 0.995772 0.004219 8.714941e-06
10321319 0.163172 0.000374 0.701656 0.000205 0.000667 9.991277e-01
3239945 0.875794 0.018865 0.675076 0.250492 0.015676 7.338320e-01
8410697 0.698399 0.005724 0.823828 0.031286 0.015073 9.536416e-01
10287723 0.265570 0.001458 0.395524 0.031426 0.023016 9.455589e-01
8426957 1.039960 0.005522 1.640239 0.125445 0.075844 7.987111e-01
8738735 0.354537 0.001237 1.143376 0.003223 0.026579 9.701981e-01
6551440 0.281950 0.001072 0.452081 0.012068 0.014184 9.737478e-01
10602068 2.001200 0.087133 0.533501 0.997658 0.002342 3.893206e-09
10187159 0.434498 0.006272 0.491928 0.072767 0.016390 9.108432e-01
8800954 0.386359 0.002967 0.656744 0.022953 0.023078 9.539691e-01
3218908 0.514202 0.005095 0.893745 0.027878 0.016680 9.554429e-01
4754460 0.674383 0.002888 0.663353 0.022085 0.023038 9.548769e-01
10842718 0.742417 0.005119 1.496501 0.061135 0.039662 8.992025e-01
8505215 0.277132 0.001820 0.835830 0.019386 0.025157 9.554563e-01

In [47]:
candidates[['pr_eb', 'pr_bg', 'pr_pl']].to_csv('../results/fpp.csv')

In [43]:
pi_eb = 2.21# 6.5
pi_bg = 0.22# 0.33
pi_pl = 13.57# 12 # for example.

norm = L_eb*pi_eb + L_bg*pi_bg + L_pl*pi_pl

candidates['pr_eb'] = L_eb*pi_eb / norm
candidates['pr_bg'] = L_bg*pi_bg / norm
candidates['pr_pl'] = L_pl*pi_pl / norm

candidates[['pr_eb', 'pr_bg', 'pr_pl']].sum()


Out[43]:
pr_eb     2.213045
pr_bg     0.177410
pr_pl    13.609545
dtype: float64

In [ ]: