In [1]:
%matplotlib inline
from matplotlib import rcParams
rcParams["savefig.dpi"] = 100
In [28]:
from collections import Counter
import os
import time
import tqdm
import corner
import numpy as np
import pandas as pd
from functools import partial
import matplotlib.pyplot as plt
from scipy.stats import ks_2samp, gaussian_kde
from sklearn.decomposition import PCA
In [11]:
from exoabc import Simulator, data
In [13]:
period_range = (50, 300)
prad_range = (0.75, 2.5)
depth_range = (0, 1000)
maxn = 5
In [15]:
prefix = "q1_q16"
stlr = data.get_burke_gk(prefix=prefix).iloc[:5000]
kois = data.get_candidates(stlr=stlr, prefix=prefix)
In [16]:
sim = Simulator(
stlr,
period_range[0], period_range[1], 0.0,
prad_range[0], prad_range[1], -2.0,
-3.0, np.zeros(maxn),
min_period_slope=-5.0, max_period_slope=3.0,
min_radius_slope=-5.0, max_radius_slope=3.0,
min_log_sigma=-5.0, max_log_sigma=np.log(np.radians(90)),
min_log_multi=-5.0, max_log_multi=3.0,
release=prefix,
seed=int(os.getpid() + 1000*time.time()) % 20000,
)
In [22]:
def compute_stats(catalog):
m = (period_range[0] <= catalog.koi_period) & (catalog.koi_period <= period_range[1])
m &= (prad_range[0] <= catalog.koi_prad) & (catalog.koi_prad <= prad_range[1])
m &= (depth_range[0] <= catalog.koi_depth) & (catalog.koi_depth <= depth_range[1])
c = catalog[m]
# Multiplicity
h = Counter(Counter(c.kepid).values())
hist = np.zeros(maxn+1, dtype=int)
for i in range(1, maxn+1):
hist[i] = h.get(i, 0)
hist[0] = len(stlr) - np.sum(hist[1:])
return hist, np.array(c.koi_period), np.array(c.koi_depth), np.array(c.koi_duration)
def compute_distance(ds1, ds2):
norm = max(np.max(ds1[0]), np.max(ds2[0]))
multi_dist = np.sum((np.log(ds1[0]+1) - np.log(ds2[0]+1))**2.0)
period_dist = ks_2samp(ds1[1], ds2[1]).statistic
depth_dist = ks_2samp(ds1[2], ds2[2]).statistic
return multi_dist + period_dist + depth_dist
In [23]:
obs_stats = compute_stats(kois)
In [24]:
obs_stats
Out[24]:
In [47]:
def sample(initial=None):
if initial is None:
lp = sim.sample_parameters()
if not np.isfinite(lp):
return np.inf, sim.get_parameters(), sim.state
else:
lp = sim.set_parameters(initial)
if not np.isfinite(lp):
return np.inf, sim.get_parameters(), sim.state
pars, state = sim.get_parameters(), sim.state
df = sim.sample_population()
if len(df) <= 1:
return np.inf, pars, state
return compute_distance(obs_stats, compute_stats(df)), pars, state
In [97]:
n = 5000
initial_distances, inital_theta, _ = map(np.array, zip(*[sample() for _ in tqdm.tqdm(range(n), total=n)]))
In [99]:
m = np.isfinite(initial_distances)
In [104]:
q = np.percentile(initial_distances[m], 10)
print(q)
In [105]:
plt.hist(initial_distances[m], 50, color="k", histtype="step")
plt.gca().axvline(q, color="g", lw=1.5);
In [106]:
def mh(log_p_func, theta0, niter, sigma=(1e-6, 1e-1)):
ndim = len(theta0)
theta = np.array(theta0)
chain = np.empty((niter, ndim))
lp = log_p_func(theta0)
acc = 0
for i in tqdm.tqdm(range(niter), total=niter):
q = np.array(theta)
ind = np.random.randint(ndim)
q[ind] += np.exp(np.random.uniform(*(np.log(sigma)))) * np.random.randn()
lq = log_p_func(q)
u = np.log(np.random.rand())
if u < lq - lp:
theta = q
lp = lq
acc += 1
chain[i] = theta
return chain, acc / niter
In [107]:
def pseudo_likelihood(S, eps, params):
dist = np.array([sample(params)[0] for _ in range(S)])
return np.logaddexp.reduce(-0.5 * (dist / eps)**2)
In [123]:
pseudo_likelihood_func = partial(pseudo_likelihood, 5, 0.15)
In [124]:
p0 = inital_theta[np.argmin(initial_distances)]
samples, acc = mh(pseudo_likelihood_func, p0, 10000, sigma=(1e-2, 2.0))
print(acc)
In [125]:
plt.plot(samples[:, 4])
Out[125]:
In [126]:
corner.corner(samples);
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [343]:
n = 10
samples = [sample() for _ in tqdm.tqdm(range(n), total=n)]
In [344]:
samples
Out[344]:
In [345]:
distances = np.array([s[0] for s in samples])
m = np.all(np.isfinite(distances), axis=1)
distances = distances[m]
params = np.array([s[1] for s in samples])[m]
states = np.array([s[2] for s in samples])[m]
In [346]:
print(distances)
In [347]:
model = PCA(1)
model.fit(distances)
Out[347]:
In [348]:
model.components_
Out[348]:
In [349]:
w = np.abs(model.components_[0])
w /= np.sum(w)
scalar = np.dot(distances, w)
In [350]:
points = np.vstack([np.sum(distances, axis=1) / 3.0, scalar]).T
corner.corner(points, range=[(0, 1), (0, 1)], labels=["raw", "optimized"]);
In [351]:
inds = np.argsort(scalar)
corner.corner(params[inds[:500]]);
In [386]:
i = 1
# p = np.array(new_params[new_inds[i]])
p = np.array([-0.7357879, -1.11290583, 0.19843469, 6.54973016, 11.39524634])
print(p)
sim.set_parameters(p)
# sim.state = new_states[new_inds[i]]
pop = sim.sample_population()
sim_stats = compute_stats(pop)
sim_stats[0], obs_stats[0]
Out[386]:
In [387]:
plt.hist(obs_stats[1], range=period_range, histtype="step", color="k")
plt.hist(sim_stats[1], range=period_range, histtype="step", color="g");
In [388]:
plt.hist(obs_stats[2], range=depth_range, histtype="step", color="k")
plt.hist(sim_stats[2], range=depth_range, histtype="step", color="g");
In [178]:
kde = gaussian_kde(new_params[new_inds].T)
#kde.factor *= 2
In [179]:
new_n = 50000
new_x = kde.resample(new_n)
new_logpdfs = kde.logpdf(new_x)
In [180]:
new_samples = [sample(x) for x in tqdm.tqdm(new_x.T, total=new_n)]
In [190]:
new_distances = np.array([s[0] for s in new_samples])
m = np.all(np.isfinite(new_distances), axis=1)
new_distances = new_distances[m]
new_params = np.array([s[1] for s in new_samples])[m]
new_states = np.array([s[2] for s in new_samples])[m]
new_model = PCA(1)
new_model.fit(new_distances)
new_w = np.abs(new_model.components_[0])
new_w /= np.sum(new_w)
new_scalar = np.dot(new_distances, new_w)
points = np.vstack([np.sum(new_distances, axis=1) / 3.0, new_scalar]).T
corner.corner(points, range=[(0, 1), (0, 1)], labels=["raw", "optimized"]);
new_inds = np.argsort(new_scalar)[:2000]
weights = new_logpdfs[new_inds]
weights = np.exp(np.min(weights) - weights)
# weights = None
corner.corner(new_params[new_inds], weights=weights);
In [191]:
plt.hist(new_logpdfs, 50); # - np.max(new_logpdfs)
In [192]:
plt.plot(new_x[2], new_x[3], ".k")
x = new_params[new_inds]
plt.plot(x[:, 2], x[:, 3], ".g")
Out[192]:
In [205]:
new_model = PCA(1)
new_model.fit(new_distances)
w = np.abs(new_model.components_[0])
print(w / np.sum(w))
_, ev = np.linalg.eig(np.cov(new_distances, rowvar=False))
w = np.abs(ev[:, 0])
print(w / np.sum(w))
In [206]:
np.linalg.svd(np.cov(new_distances, rowvar=False))
Out[206]:
In [207]:
ev
Out[207]:
In [ ]:
In [ ]:
In [ ]:
In [216]:
def fit_distances(distances):
_, ev = np.linalg.eig(np.cov(distances, rowvar=False))
w = np.abs(ev[:, 0])
w /= np.sum(w)
print(w)
return np.dot(distances, w)
In [217]:
fit_distances(new_distances).max()
Out[217]:
In [222]:
# iteration 1
N = 1000
theta = list(map(sample, tqdm.tqdm((None for _ in range(N)), total=N)))
distances = np.array([s[0] for s in samples])
m = np.all(np.isfinite(distances), axis=1)
distances = distances[m]
params = np.array([s[1] for s in samples])[m]
states = np.array([s[2] for s in samples])[m]
rho = np.sum(distances, axis=1)
In [223]:
rho
Out[223]:
In [252]:
tau = np.sqrt(2*np.var(params, axis=0))
eps = np.percentile(rho, 50)
weights = np.ones(len(params)) / len(params)
In [290]:
def pmc_sample_one(eps, theta0, weights, tau):
rho = np.inf
while rho > eps or not np.isfinite(rho):
theta_star = theta0[np.random.choice(np.arange(len(weights)), p=weights)]
theta_i = theta_star + tau * np.random.randn(len(theta_star))
p, _, state = sample(theta_i)
rho = np.sum(p)
log_prior = sim.log_pdf()
log_weight = np.log(weights) - np.sum(0.5*((theta0 - theta_i)/tau[None, :])**2 + np.log(tau[None, :]), axis=1)
log_weight = log_prior - np.logaddexp.reduce(log_weight)
return theta_i, rho, log_prior, log_weight
In [292]:
new_samples = [pmc_sample_one(eps, params, weights, tau) for _ in tqdm.tqdm(range(N), total=N)]
In [296]:
rho = np.array([s[1] for s in new_samples])
m = np.isfinite(rho)
rho = rho[m]
params = np.array([s[0] for s in new_samples])[m]
log_weight = np.array([s[3] for s in new_samples])[m]
In [297]:
plt.hist(np.exp(log_weight))
Out[297]:
In [326]:
norm = np.sum(weights)
mu = np.sum(params * weights[:, None], axis=0) / norm
np.sqrt(2 * np.sum((params - mu)**2 * weights[:, None], axis=0) / norm), tau
Out[326]:
In [318]:
tau = np.sqrt(2*np.var(params, axis=0))
eps = np.percentile(rho, 50)
weights = np.exp(log_weight - np.logaddexp.reduce(log_weight))
In [319]:
new_samples_2 = [pmc_sample_one(eps, params, weights, tau) for _ in tqdm.tqdm(range(N), total=N)]
In [320]:
rho = np.array([s[1] for s in new_samples_2])
m = np.isfinite(rho)
rho = rho[m]
params = np.array([s[0] for s in new_samples_2])[m]
log_weights = np.array([s[3] for s in new_samples_2])[m]
In [321]:
corner.corner(params, weights=np.exp(log_weight));
In [322]:
plt.hist(rho, 50, histtype="step", color="k")
plt.gca().axvline(eps)
plt.gca().axvline(np.percentile(rho, 50));
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [373]:
def scale_random(pars, u):
n = len(pars)
logu = np.log(u)
norm = 0.0 #-np.inf
value = 0.0 #-np.inf
for i in range(n):
norm = np.logaddexp(norm, pars[i])
for i in range(n):
value = np.logaddexp(value, pars[i])
if (value - norm > logu):
return i
return n-1
In [379]:
x = [scale_random([0.0, 6.54973016, 11.39524634, 5.84320176], np.random.rand()) for _ in range(5000)]
plt.hist(x, 4)
Out[379]:
In [377]:
x
Out[377]:
In [392]:
import h5py
with h5py.File("blah.h5", "w") as f:
f.create_dataset("blah", data=states)
In [ ]: