Let's generate a catalog of planets, including binary stars. There are parameters controlling both the planet population and the binary star population:
$$ \theta = (F, \alpha, \beta, f_B, \gamma) $$$F$, $\alpha$ and $\beta$ are the power-law parameters of the true planet population, $f_B$ is the binary fraction, and $\gamma$ is the mass-ratio power law index. We want to generate a mock catalog according these parameters, with the Kepler stars as the starting point.
In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.pyplot as pl
import logging
rootLogger = logging.getLogger()
rootLogger.setLevel(logging.INFO)
In [2]:
# stlr has cuts built in here
from utils import get_catalog, stlr
In [3]:
from isochrones.dartmouth import Dartmouth_Isochrone
dar = Dartmouth_Isochrone()
In [4]:
from __future__ import print_function, division
from scipy.stats import poisson, powerlaw
import pandas as pd
import numpy as np
P_RANGE = (50, 300)
R_RANGE = (0.75, 20)
period_rng = P_RANGE
rp_rng = R_RANGE
def draw_powerlaw(alpha, rng):
if alpha == -1:
alpha = -1.0000001
# Normalization factor
x0, x1 = rng
C = (alpha + 1) / (x1**(alpha + 1) - x0**(alpha + 1))
u = np.random.random()
return ((u * (alpha + 1)) / C + x0**(alpha + 1))**(1./(alpha + 1))
def draw_planet(theta):
"""
Returns radius and period for a planet, given parameters
"""
_, alpha, beta, _, _ = theta
return draw_powerlaw(alpha, R_RANGE), draw_powerlaw(beta, P_RANGE)
def get_companion(theta, star, ic=dar, band='Kepler'):
_, _, _, fB, gamma = theta
# Is there a binary? If not, just return radius
if np.random.random() > fB:
return star, 0.
# Draw the mass of the secondary
M1 = star.mass
qmin = dar.minmass / M1
q = draw_powerlaw(gamma, (qmin, 1))
M2 = q*M1
# Now we need more precise stellar properties
minage, maxage = ic.agerange(M1, star.feh)
maxage = min(maxage, ic.maxage)
minage = max(minage, ic.minage)
minage += 0.05
maxage -= 0.05
if maxage < minage:
raise ValueError('Cannot simulate this: maxage < minage!')
age = np.random.random() * (maxage - minage) + minage
R1 = star.radius # This is WRONG
R2 = ic.radius(M2, age, star.feh)
debug_str = '({}, {}, {}, {}, agerange: ({}, {}))'.format(M1, M2, age, star.feh,
minage, maxage)
if np.isnan(R2):
raise ValueError('R2 is NaN:',debug_str)
if R2 > R1:
R2 = R1 #HACK alert!
dmag = ic.mag[band](M2, age, star.feh) - ic.mag[band](M1, age, star.feh)
flux_ratio = 10**(-0.4 * dmag)
if np.isnan(flux_ratio):
logging.warning('Flux_ratio is nan: {}'.format(debug_str))
newstar = star.copy()
newstar.mass = M2
newstar.radius = R2
return newstar, flux_ratio
def diluted_radius(radius, star, star2, flux_ratio):
"""
Returns diluted radius
"""
if flux_ratio==0:
return radius, star
# calculate radius correction factor;
# depends on which star planet is around
if np.random.random() < 0.5:
# around star 1
Xr = np.sqrt(1 + flux_ratio)
host_star = star.copy()
else:
# around star 2
Xr = star.radius / star2.radius * np.sqrt((1 + flux_ratio) / flux_ratio)
host_star = star2.copy()
return radius / Xr, host_star
#Get useful functions from Dan's code:
from utils import get_duration, get_a, get_delta
from utils import get_mes, get_pdet, get_pwin
from utils import get_pgeom, get_completeness
R_EARTH = 0.009171 #solar units
def generate_planets(theta, stars=stlr, mes_threshold=10):
"""
theta = (lnf0, alpha, beta, fB, gamma)
"""
lnf0, alpha, beta, fB, gamma = theta
planets = pd.DataFrame({'kepid':[], 'koi_prad':[], 'koi_period':[],
'koi_prad_true':[], 'koi_max_mult_ev':[]})
n_skipped = 0
for _, star in stars.iterrows():
if np.isnan(star.radius) or np.isnan(star.mass):
n_skipped += 1
continue
n_planets = poisson(np.exp(lnf0)).rvs()
if n_planets == 0:
continue
try:
star2, flux_ratio = get_companion(theta, star)
except ValueError:
n_skipped += 1
continue
#logging.warning('Skipping {}; cannot simulate binary.'.format(star.kepid))
for i in range(n_planets):
# First, figure out true & observed properties of planet
radius, period = draw_planet(theta)
observed_radius, host_star = diluted_radius(radius, star, star2, flux_ratio)
logging.debug('True: {:.2f}, Observed: {:.2f} ({})'.format(radius,
observed_radius,
flux_ratio))
# Then, is it detected?
# First, geometric:
aor = get_a(period, host_star.mass)
if np.isnan(aor):
raise RuntimeError('aor is nan: P={} M={}'.format(period, host_star.mass))
#print(host_star.mass, aor)
transit_prob = get_pgeom(aor / host_star.radius, 0.) # no ecc.
if np.random.random() > transit_prob:
continue
# Then depth and MES:
depth = get_delta(observed_radius * R_EARTH / star.radius)
tau = get_duration(period, aor, 0.) * 24 # no ecc.
try:
mes = get_mes(star, period, depth, tau)
except ValueError:
n_skipped += 1
#raise RuntimeError('MES is nan! {}, {}, {}'.format(depth, tau))
if mes < mes_threshold:
continue
# Add planet to catalog
planets = planets.append({'kepid':star.kepid,
'koi_prad':observed_radius,
'koi_period':period,
'koi_prad_true':radius,
'koi_max_mult_ev':mes}, ignore_index=True)
print('{} planets generated ({} of {} stars skipped.)'.format(len(planets),
n_skipped, len(stars)))
return planets
In [20]:
#Here's an example of the true population.
theta = [-0.3, -1.5, -0.8, 0.0, 0.3]
rps = [draw_planet(theta)[0] for i in range(500)]
plt.hist(rps, histtype='step');
In [21]:
#generate population with no binaries
theta = [-0.3, -1.5, -0.8, 0.0, 0.3]
df = generate_planets(theta, stlr)
df.to_hdf('synthetic_kois_single.h5','df')
In [22]:
#generate population with binaries
theta = [-0.3, -1.5, -0.8, 0.5, 0.3]
df = generate_planets(theta, stlr)
df.to_hdf('synthetic_kois_binaries.h5','df')
OK, let's try to apply dan's population inference model:
In [8]:
kois = pd.read_hdf('synthetic_kois_single.h5', 'df')
In [59]:
fig, (ax1, ax2) = pl.subplots(1,2, figsize=(8,4))
ax1.hist(kois.koi_prad_true, histtype='step');
ax2.hist(kois.koi_period, histtype='step');
Get "completeness" with standard assumptions. Need to tweak to use straight-up MES=10 threshold.
In [27]:
rp_rng
Out[27]:
In [5]:
period = np.linspace(period_rng[0],
period_rng[1], 45)
rp = np.linspace(rp_rng[0],
rp_rng[1], 101)
period_grid, rp_grid = np.meshgrid(period, rp, indexing="ij")
comp = np.zeros_like(period_grid)
for _, star in stlr.iterrows():
try:
comp += get_completeness(star, period_grid, rp_grid,
0.0, with_geom=True, thresh=10)
except ValueError:
continue
In [39]:
np.savez('completeness', comp=comp, period_grid=period_grid, rp_grid=rp_grid,
inds=stlr.index, thresh=10)
In [6]:
kois = pd.read_hdf('synthetic_kois_single.h5', 'df')
# A double power law model for the population.
def population_model(theta, period, rp):
lnf0, beta, alpha = theta
v = np.exp(lnf0) * np.ones_like(period)
for x, rng, n in zip((period, rp),
(period_rng, rp_rng),
(beta, alpha)):
n1 = n + 1
v *= x**n*n1 / (rng[1]**n1-rng[0]**n1)
return v
# The ln-likelihood function given at the top of this post.
koi_periods = np.array(kois.koi_period)
koi_rps = np.array(kois.koi_prad)
vol = np.diff(period_grid, axis=0)[:, :-1] * np.diff(rp_grid, axis=1)[:-1, :]
def lnlike(theta):
pop = population_model(theta, period_grid, rp_grid) * comp
pop = 0.5 * (pop[:-1, :-1] + pop[1:, 1:])
norm = np.sum(pop * vol)
ll = np.sum(np.log(population_model(theta, koi_periods, koi_rps))) - norm
return ll if np.isfinite(ll) else -np.inf
# The ln-probability function is just propotional to the ln-likelihood
# since we're assuming uniform priors.
bounds = [(-5, 5), (-5, 5), (-5, 5)]
def lnprob(theta):
# Broad uniform priors.
for t, rng in zip(theta, bounds):
if not rng[0] < t < rng[1]:
return -np.inf
return lnlike(theta)
# The negative ln-likelihood is useful for optimization.
# Optimizers want to *minimize* your function.
def nll(theta):
ll = lnlike(theta)
return -ll if np.isfinite(ll) else 1e15
In [7]:
from scipy.optimize import minimize
theta_0 = np.array([-0.3, -1.5, -0.8])
r = minimize(nll, theta_0, method="L-BFGS-B", bounds=bounds)
print(r)
In [8]:
%matplotlib inline
import matplotlib.pyplot as pl
# We'll reuse these functions to plot all of our results.
def make_plot(pop_comp, x0, x, y, ax):
pop = 0.5 * (pop_comp[:, 1:] + pop_comp[:, :-1])
pop = np.sum(pop * np.diff(y)[None, :, None], axis=1)
a, b, c, d, e = np.percentile(pop * np.diff(x)[0], [2.5, 16, 50, 84, 97.5], axis=0)
ax.fill_between(x0, a, e, color="k", alpha=0.1, edgecolor="none")
ax.fill_between(x0, b, d, color="k", alpha=0.3, edgecolor="none")
ax.plot(x0, c, "k", lw=1)
def plot_results(samples):
# Loop through the samples and compute the list of population models.
samples = np.atleast_2d(samples)
pop = np.empty((len(samples), period_grid.shape[0], period_grid.shape[1]))
gamma_earth = np.empty((len(samples)))
for i, p in enumerate(samples):
pop[i] = population_model(p, period_grid, rp_grid)
gamma_earth[i] = population_model(p, 365.25, 1.0) * 365.
fig, axes = pl.subplots(2, 2, figsize=(10, 8))
fig.subplots_adjust(wspace=0.4, hspace=0.4)
# Integrate over period.
dx = 0.25
x = np.arange(rp_rng[0], rp_rng[1] + dx, dx)
n, _ = np.histogram(koi_rps, x)
# Plot the observed radius distribution.
ax = axes[0, 0]
make_plot(pop * comp[None, :, :], rp, x, period, ax)
ax.errorbar(0.5*(x[:-1]+x[1:]), n, yerr=np.sqrt(n), fmt=".k",
capsize=0)
ax.set_xlim(rp_rng[0], rp_rng[1])
ax.set_xlabel("$R_p\,[R_\oplus]$")
ax.set_ylabel("\# of detected planets")
# Plot the true radius distribution.
ax = axes[0, 1]
make_plot(pop, rp, x, period, ax)
ax.set_xlim(rp_rng[0], rp_rng[1])
ax.set_ylim(0, 0.37)
ax.set_xlabel("$R_p\,[R_\oplus]$")
ax.set_ylabel("$\mathrm{d}N / \mathrm{d}R$; $\Delta R = 0.25\,R_\oplus$")
# Integrate over period.
dx = 31.25
x = np.arange(period_rng[0], period_rng[1] + dx, dx)
n, _ = np.histogram(koi_periods, x)
# Plot the observed period distribution.
ax = axes[1, 0]
make_plot(np.swapaxes(pop * comp[None, :, :], 1, 2), period, x, rp, ax)
ax.errorbar(0.5*(x[:-1]+x[1:]), n, yerr=np.sqrt(n), fmt=".k",
capsize=0)
ax.set_xlim(period_rng[0], period_rng[1])
ax.set_ylim(0, 79)
ax.set_xlabel("$P\,[\mathrm{days}]$")
ax.set_ylabel("\# of detected planets")
# Plot the true period distribution.
ax = axes[1, 1]
make_plot(np.swapaxes(pop, 1, 2), period, x, rp, ax)
ax.set_xlim(period_rng[0], period_rng[1])
ax.set_ylim(0, 0.27)
ax.set_xlabel("$P\,[\mathrm{days}]$")
ax.set_ylabel("$\mathrm{d}N / \mathrm{d}P$; $\Delta P = 31.25\,\mathrm{days}$")
return gamma_earth
print(plot_results(r.x));
In [9]:
import emcee
ndim, nwalkers = len(r.x), 16
pos = [r.x + 1e-5 * np.random.randn(ndim) for i in range(nwalkers)]
sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob)
# Burn in.
pos, _, _ = sampler.run_mcmc(pos, 1000)
sampler.reset()
# Production.
pos, _, _ = sampler.run_mcmc(pos, 4000)
import triangle
triangle.corner(sampler.flatchain, labels=[r"$\ln F$", r"$\beta$", r"$\alpha$"],
truths=[-0.3, -0.8, -1.5]);
In [ ]:
#gamma_earth = plot_results(sampler.flatchain) #why does this freak out?
OK, now for binaries!
In [10]:
kois = pd.read_hdf('synthetic_kois_binaries.h5', 'df')
# A double power law model for the population.
def population_model(theta, period, rp):
lnf0, beta, alpha = theta
v = np.exp(lnf0) * np.ones_like(period)
for x, rng, n in zip((period, rp),
(period_rng, rp_rng),
(beta, alpha)):
n1 = n + 1
v *= x**n*n1 / (rng[1]**n1-rng[0]**n1)
return v
# The ln-likelihood function given at the top of this post.
koi_periods = np.array(kois.koi_period)
koi_rps = np.array(kois.koi_prad)
vol = np.diff(period_grid, axis=0)[:, :-1] * np.diff(rp_grid, axis=1)[:-1, :]
def lnlike(theta):
pop = population_model(theta, period_grid, rp_grid) * comp
pop = 0.5 * (pop[:-1, :-1] + pop[1:, 1:])
norm = np.sum(pop * vol)
ll = np.sum(np.log(population_model(theta, koi_periods, koi_rps))) - norm
return ll if np.isfinite(ll) else -np.inf
# The ln-probability function is just propotional to the ln-likelihood
# since we're assuming uniform priors.
bounds = [(-5, 5), (-5, 5), (-5, 5)]
def lnprob(theta):
# Broad uniform priors.
for t, rng in zip(theta, bounds):
if not rng[0] < t < rng[1]:
return -np.inf
return lnlike(theta)
# The negative ln-likelihood is useful for optimization.
# Optimizers want to *minimize* your function.
def nll(theta):
ll = lnlike(theta)
return -ll if np.isfinite(ll) else 1e15
In [ ]:
import emcee
ndim, nwalkers = len(r.x), 16
pos = [r.x + 1e-5 * np.random.randn(ndim) for i in range(nwalkers)]
sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob)
# Burn in.
pos, _, _ = sampler.run_mcmc(pos, 1000)
sampler.reset()
# Production.
pos, _, _ = sampler.run_mcmc(pos, 4000)
In [13]:
import triangle
triangle.corner(sampler.flatchain, labels=[r"$\ln F$", r"$\beta$", r"$\alpha$"],
truths=[-0.3, -0.8, -1.5]);
In [14]:
np.exp(-0.6)/np.exp(-0.3)
Out[14]:
In [15]:
kois.head()
Out[15]:
In [18]:
(kois.koi_prad_true / kois.koi_prad).mean()
Out[18]:
In [29]:
theta = [-0.3, -1.5, -0.8, 0.5, 0.3]
In [30]:
pd.Series(theta).to_hdf('test.h5', 'theta')
In [31]:
theta = list(pd.read_hdf('test.h5', 'theta'))
In [32]:
theta
Out[32]:
In [36]:
import os.path
os.path.splitext('thiet/thie/thiets.df')[0]
Out[36]:
In [38]:
d.keys()
Out[38]:
In [ ]: