In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import rc
from scipy.stats import norm, lognorm
import pandas as pd
import numpy as np
import seaborn as sns
import itertools
rc('text', usetex=True)
sns.set_style("whitegrid")
# we will work with datasets of 2000 samples
n = 2000
tmp = pd.read_csv('mock_data.csv')
mass = tmp.mass[0 : n].as_matrix()
z = tmp.z[0 : n].as_matrix()
datasets = []
for seed in xrange(9):
alpha1 = norm(10.709, 0.022).rvs()
alpha2 = norm(0.359, 0.009).rvs()
alpha3 = 2.35e14
alpha4 = norm(1.10, 0.06).rvs()
S = norm(0.155, 0.0009).rvs()
sigma_L = 0.05
mu_li = np.exp(alpha1) * ((mass / alpha3) ** (alpha2))* ((1+z) ** (alpha4))
li = lognorm(S, scale=mu_li).rvs()
observed = lognorm(sigma_L, scale=li).rvs()
d = {
'seed': seed,
'alpha1': alpha1,
'alpha2': alpha2,
'alpha3': alpha3,
'alpha4': alpha4,
'S': S,
'sigma_L': sigma_L,
'lum_mu': mu_li,
'lum': li,
'lum_obs': observed,
'mass': mass,
'z': z,
}
datasets.append(d)
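For reference, the generative model coded above is, for each galaxy $i$ (recall that scipy's lognorm(s, scale=m) means $\ln X \sim \mathcal{N}(\ln m, s^2)$):
$$\mu_{L,i} = e^{\alpha_1} \left(\frac{M_i}{\alpha_3}\right)^{\alpha_2} (1+z_i)^{\alpha_4}, \qquad \ln L_i \sim \mathcal{N}(\ln \mu_{L,i}, S^2), \qquad \ln L_{obs,i} \sim \mathcal{N}(\ln L_i, \sigma_L^2),$$with $\alpha_1, \alpha_2, \alpha_4, S$ drawn once per dataset from the normal distributions above, and $\alpha_3$, $\sigma_L$ held fixed.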
In [2]:
def log10(arr):
return np.log(arr) / np.log(10)
f, axes = plt.subplots(3, 3, figsize=(9, 9), sharex=True, sharey=True)
for i, ax in enumerate(axes.flat):
d = datasets[i]
sns.kdeplot(log10(d['mass']), log10(d['lum']), shade=True, cmap='Blues', ax=ax)
ax.set_xlabel('Mass ($\log_{10} M_{\odot}$)')
ax.set_ylabel('Luminosity ($\log_{10} L_{\odot} / h^2$)')
ax.set_title('Seed %d' % i)
f.tight_layout()
In [4]:
mus = []
stds = []
for i in xrange(9):
d = datasets[i]
mus.append(np.mean(log10(d['lum'])))
stds.append(np.std(log10(d['lum'])))
plt.scatter(mus, stds)
plt.xlabel('Mean Luminosity ($\log_{10} L_{\odot} / h^2$)')
plt.ylabel('Stdev Luminosity ($\log_{10} L_{\odot} / h^2$)')
plt.title('Comparing Different Luminosity Distributions');
In [5]:
for dataset in datasets:
dataset['inv_lum'] = lognorm(dataset['sigma_L'], scale=dataset['lum_obs']).rvs()
In [6]:
f, axes = plt.subplots(3, 3, figsize=(9, 9), sharex=True, sharey=True)
for i, ax in enumerate(axes.flat):
d = datasets[i]
ax.scatter(log10(d['mass']), log10(d['lum']), color='blue', alpha=0.2, label='true')
ax.scatter(log10(d['mass']), log10(d['inv_lum']), color='red', alpha=0.2, label='inverted')
ax.set_xlabel('Mass ($\log_{10} M_{\odot}$)')
ax.set_ylabel('Luminosity ($\log_{10} L_{\odot} / h^2$)')
ax.set_title('Seed %d' % i)
ax.legend()
f.tight_layout()
In [7]:
for d in datasets:
d['inv_mass_mu'] = d['alpha3'] * (d['inv_lum'] / (np.exp(d['alpha1']) * (1 + d['z']) ** d['alpha4'])) ** (1 / d['alpha2'])
d['inv_mass'] = lognorm(d['S'], scale=d['inv_mass_mu']).rvs()
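The inversion above just solves the mean relation for mass:
$$\mu_L = e^{\alpha_1}\left(\frac{M}{\alpha_3}\right)^{\alpha_2}(1+z)^{\alpha_4} \;\Longrightarrow\; M = \alpha_3\left(\frac{\mu_L}{e^{\alpha_1}(1+z)^{\alpha_4}}\right)^{1/\alpha_2},$$with the inverted luminosity standing in for $\mu_L$ and lognormal scatter $S$ applied on top.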
In [8]:
f, axes = plt.subplots(3, 3, figsize=(9, 9), sharex=True, sharey=True)
for i, ax in enumerate(axes.flat):
d = datasets[i]
ax.scatter(log10(d['mass']), log10(d['mass']), color='blue', alpha=0.2)
ax.scatter(log10(d['mass']), log10(d['inv_mass']), color='red', alpha=0.2)
ax.set_xlabel('Mass ($\log_{10} M_{\odot}$)')
ax.set_ylabel('Mass ($\log_{10} M_{\odot}$)')
ax.set_title('Seed %d' % i)
f.tight_layout()
Now we fix the hyperparameters to the ones drawn for seed 0 and compare against the other datasets to see how sensitive our inversions are to the drawn hyperparameters.
In [10]:
for d in datasets:
fixed_seed = 0
alpha1 = datasets[fixed_seed]['alpha1']
alpha2 = datasets[fixed_seed]['alpha2']
alpha3 = datasets[fixed_seed]['alpha3']
alpha4 = datasets[fixed_seed]['alpha4']
S = datasets[fixed_seed]['S']
d['seed_inv_mass_mu'] = alpha3 * (d['inv_lum'] / (np.exp(alpha1) * (1 + d['z']) ** alpha4)) ** (1 / alpha2)
d['seed_inv_mass'] = lognorm(5.6578015811698101 * S, scale=d['seed_inv_mass_mu']).rvs()
f, axes = plt.subplots(3, 3, figsize=(9, 9), sharex=True, sharey=True)
for i, ax in enumerate(axes.flat):
d = datasets[i]
ax.scatter(log10(d['mass']), log10(d['mass']), color='blue', alpha=0.2)
ax.scatter(log10(d['mass']), log10(d['inv_mass']), color='red', alpha=0.2)
ax.scatter(log10(d['mass']), log10(d['seed_inv_mass']), color='gold', alpha=0.2)
ax.set_xlabel('Mass ($\log_{10} M_{\odot}$)')
ax.set_ylabel('Inverted Mass ($\log_{10} M_{\odot}$)')
ax.set_title('Seed %d' % i)
f.tight_layout()
In [11]:
f, axes = plt.subplots(3, 3, figsize=(9, 9), sharex=True, sharey=True)
for i, ax in enumerate(axes.flat):
d = datasets[i]
ax.scatter(log10(d['mass']), log10(d['lum']), color='blue', alpha=0.2)
ax.scatter(log10(d['seed_inv_mass']), log10(d['inv_lum']), color='red', alpha=0.2)
ax.set_xlabel('Mass ($\log_{10} M_{\odot}$)')
ax.set_ylabel('Luminosity ($\log_{10} L_{\odot} / h^2$)')
ax.set_title('Seed %d' % i)
f.tight_layout()
In [13]:
import hmf
import scipy.interpolate as interpolate
class MassPrior():
"""
Class to represent mass prior distribution.
Initialize with mass and probability arrays.
Can both evaluate and sample from distribution.
"""
def __init__(self, mass, prob):
self.mass = mass
self.prob = prob
self.min = mass.min()
self.max = mass.max()
# have to add 0,1 samples for interpolation bounds
cumsum = np.append(np.append(np.array([0]), np.cumsum(prob)), np.array([1]))
masses = np.append(np.append(np.array([self.min-1]), self.mass), np.array([self.max+1]))
self.inv_cdf = interpolate.interp1d(cumsum, masses)
def pdf(self, mass):
ret = self._pdf(mass)
if (len(mass) > 1):
ret = np.prod(ret)
return ret
def logpdf(self, mass):
ret = np.log(self._pdf(mass))
if (len(mass) > 1):
ret = np.sum(ret)
return ret
def _pdf(self, mass):
if np.any(mass < self.min) or np.any(mass > self.max):
raise Exception('out of range')
right_ind = np.searchsorted(self.mass, mass)
left_ind = right_ind - 1
# find where we fall in interval between masses
f = (mass - self.mass[left_ind]) / (self.mass[right_ind] - self.mass[left_ind])
return f * self.prob[right_ind] + (1-f) * self.prob[left_ind]
def rvs(self, size=1):
return self.inv_cdf(np.random.rand(size))
@staticmethod
def default():
h = 0.73
Mmin = 10.2358590918
Mmax = 14.3277327776
mf = hmf.MassFunction(Mmin=Mmin, Mmax=Mmax, cosmo_model=hmf.cosmo.WMAP5, hmf_model=hmf.fitting_functions.Tinker10)
return MassPrior(mf.m*h, mf.dndm / sum(mf.dndm)) # the h normalization is important
mass_prior = MassPrior.default()
samplings = []
for i in xrange(9):
np.random.seed(i)
fixed_seed = 0
alpha1 = datasets[fixed_seed]['alpha1']
alpha2 = datasets[fixed_seed]['alpha2']
alpha3 = datasets[fixed_seed]['alpha3']
alpha4 = datasets[fixed_seed]['alpha4']
S = datasets[fixed_seed]['S']
s = dict()
s['mass'] = mass_prior.rvs(size=n)
s['lum_mu'] = np.exp(alpha1) * ((s['mass'] / alpha3) ** (alpha2))* ((1+z) ** (alpha4))
s['lum'] = lognorm(S, scale=s['lum_mu']).rvs()
s['seed'] = i
samplings.append(s)
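The cell above only exercises rvs. As a quick sanity check of the rest of the MassPrior interface (purely illustrative; interior is just a few grid points safely inside the tabulated mass range):
# minimal usage sketch of the prior defined above
interior = mass_prior.mass[10:13]
print mass_prior._pdf(interior)       # per-point probabilities at those masses
print mass_prior.logpdf(interior)     # joint log-probability of the three masses
print mass_prior.rvs(size=5)          # inverse-CDF draws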
In [30]:
import matplotlib.patches as mpatches
f, axes = plt.subplots(3, 3, figsize=(9, 9), sharex=True, sharey=True)
handles = [mpatches.Patch(color='blue', label='Data'), \
mpatches.Patch(color='red', label='Importance Sampling'), \
mpatches.Patch(color='gold', label='Simple Monte Carlo')]
plt.legend(handles=handles,bbox_to_anchor=(-0.5, 4.0))
for i, ax in enumerate(axes.flat):
d = datasets[i]
s = samplings[i]
ax.scatter(log10(d['mass']), log10(d['lum']), color='blue', alpha=0.2)
ax.scatter(log10(d['seed_inv_mass']), log10(d['inv_lum']), color='red', alpha=0.2)
ax.scatter(log10(s['mass']), log10(s['lum']), color='gold', alpha=0.2)
ax.set_xlabel('Mass ($\log_{10} M_{\odot}$)')
ax.set_ylabel('Luminosity ($\log_{10} L_{\odot} / h^2$)')
ax.set_title('Seed %d' % i)
f.tight_layout()
In [12]:
f, axes = plt.subplots(3, 3, figsize=(9, 9), sharex=True, sharey=True)
for i, ax in enumerate(axes.flat):
d = datasets[i]
s = samplings[i]
ax.scatter(log10(d['mass']), log10(d['lum']), color='blue', alpha=0.005)
ax.scatter(log10(d['seed_inv_mass']), log10(d['inv_lum']), color='red', alpha=0.005)
ax.scatter(log10(s['mass']), log10(s['lum']), color='gold', alpha=0.005)
ax.set_xlabel('Mass ($\log_{10} M_{\odot}$)')
ax.set_ylabel('Luminosity ($\log_{10} L_{\odot} / h^2$)')
ax.set_title('Seed %d' % i)
f.tight_layout()
Looking at the data, that scatter should be about 5 sigma (i.e. $5S$). We fix that from now on.
In [13]:
for d in datasets:
fixed_seed = 0
alpha1 = datasets[fixed_seed]['alpha1']
alpha2 = datasets[fixed_seed]['alpha2']
alpha3 = datasets[fixed_seed]['alpha3']
alpha4 = datasets[fixed_seed]['alpha4']
S = datasets[fixed_seed]['S'] * 5 # here is where we multiply by five now
d['5S_seed_inv_mass_mu'] = alpha3 * (d['inv_lum'] / (np.exp(alpha1) * (1 + d['z']) ** alpha4)) ** (1 / alpha2)
d['5S_seed_inv_mass'] = lognorm(S, scale=d['5S_seed_inv_mass_mu']).rvs()
f, axes = plt.subplots(3, 3, figsize=(9, 9), sharex=True, sharey=True)
for i, ax in enumerate(axes.flat):
d = datasets[i]
s = samplings[i]
ax.scatter(log10(d['mass']), log10(d['lum']), color='blue', alpha=0.2)
ax.scatter(log10(d['5S_seed_inv_mass']), log10(d['inv_lum']), color='red', alpha=0.2)
ax.scatter(log10(s['mass']), log10(s['lum']), color='gold', alpha=0.2)
ax.set_xlabel('Mass ($\log_{10} M_{\odot}$)')
ax.set_ylabel('Luminosity ($\log_{10} L_{\odot} / h^2$)')
ax.set_title('Seed %d' % i)
f.tight_layout()
In [14]:
f, axes = plt.subplots(3, 3, figsize=(9, 9), sharex=True, sharey=True)
for i, ax in enumerate(axes.flat):
d = datasets[i]
s = samplings[i]
ax.scatter(log10(d['mass']), log10(d['lum']), color='blue', alpha=0.005)
ax.scatter(log10(d['5S_seed_inv_mass']), log10(d['inv_lum']), color='red', alpha=0.005)
ax.scatter(log10(s['mass']), log10(s['lum']), color='gold', alpha=0.005)
ax.set_xlabel('Mass ($\log_{10} M_{\odot}$)')
ax.set_ylabel('Luminosity ($\log_{10} L_{\odot} / h^2$)')
ax.set_title('Seed %d' % i)
f.tight_layout()
In [15]:
datasets[0]
Out[15]:
In [16]:
for z in np.linspace(0, 3.5, 10):
h = 0.73
Mmin = 10.2358590918
Mmax = 14.3277327776
mf = hmf.MassFunction(z=z, Mmin=Mmin, Mmax=Mmax, cosmo_model=hmf.cosmo.WMAP5, hmf_model=hmf.fitting_functions.Tinker10)
plt.plot(mf.m, mf.dndm, label='z={}'.format(str(z)[:4]))
plt.ylabel('Density')
plt.xlabel('Mass $M_{\odot}$')
plt.gca().set_yscale('log')
plt.legend()
plt.title('Halo Mass Function At Various Redshifts');
In [17]:
for z in np.linspace(0, 3.5, 10):
h = 0.73
Mmin = 10.2358590918
Mmax = 14.3277327776
mf = hmf.MassFunction(z=z, Mmin=Mmin, Mmax=Mmax, cosmo_model=hmf.cosmo.WMAP5, hmf_model=hmf.fitting_functions.Tinker10)
n = 30
plt.plot(mf.m[:n], mf.dndm[:n], label='z={}'.format(str(z)[:4]))
plt.ylabel('Density')
plt.xlabel('Mass $M_{\odot}$')
plt.legend()
plt.title('Halo Mass Function At Various Redshifts');
In [18]:
import hmf
import scipy.interpolate as interpolate
import pandas as pd
import numpy as np
from scipy.stats import lognorm, norm
class Grid():
"""
Manages redshift bins.
"""
def __init__(self, mmin=0, mmax=3.5, nbins=20):
self.redshifts = np.linspace(mmin, mmax, nbins)
self.nbins = nbins
def snap(self, z):
ind = np.minimum(self.nbins - 1, np.searchsorted(self.redshifts, z))
return self.redshifts[ind]
class MassPrior():
"""
Class to represent mass prior distribution.
Initialize with mass and probability arrays.
Can both evaluate and sample from distribution.
"""
def __init__(self, mass, prob):
self.mass = mass
self.prob = prob
self.min = mass.min()
self.max = mass.max()
# have to add 0,1 samples for interpolation bounds
cumsum = np.append(np.append(np.array([0]), np.cumsum(prob)), np.array([1]))
masses = np.append(np.append(np.array([self.min - 1]), self.mass), np.array([self.max + 1]))
self.inv_cdf = interpolate.interp1d(cumsum, masses)
def pdf(self, mass):
ret = self._pdf(mass)
if (len(mass) > 1):
ret = np.prod(ret)
return ret
def logpdf(self, mass):
ret = np.log(self._pdf(mass))
if (len(mass) > 1):
ret = np.sum(ret)
return ret
def _pdf(self, mass):
if np.any(mass < self.min) or np.any(mass > self.max):
raise Exception('out of range')
right_ind = np.searchsorted(self.mass, mass)
left_ind = right_ind - 1
# find where we fall in interval between masses
f = (mass - self.mass[left_ind]) / (self.mass[right_ind] - self.mass[left_ind])
return f * self.prob[right_ind] + (1 - f) * self.prob[left_ind]
def rvs(self, size=1):
return self.inv_cdf(np.random.rand(size))
class TinkerPrior():
def __init__(self, grid, h = 0.73, Mmin = 10.2358590918, Mmax = 14.3277327776):
self.grid = grid
self.grid_to_prior = dict()
for z in grid.redshifts:
mf = hmf.MassFunction(z=z, Mmin=Mmin, Mmax=Mmax, \
cosmo_model=hmf.cosmo.WMAP5, \
hmf_model=hmf.fitting_functions.Tinker10)
self.grid_to_prior[z] = MassPrior(mf.m * h, mf.dndm / sum(mf.dndm))
def fetch(self, z):
return self.grid_to_prior[self.grid.snap(z)]
def pdf(self, mass, z):
ret = self.fetch(z)._pdf(mass)
if (len(mass) > 1):
ret = np.prod(ret)
return ret
def logpdf(self, mass, z):
ret = np.log(self.fetch(z)._pdf(mass))
if (len(mass) > 1):
ret = np.sum(ret)
return ret
def rvs(self, z, size=1):
return self.fetch(z).inv_cdf(np.random.rand(size))
class DataSamples():
def __init__(self, grid, mass, lum, z):
self.grid = grid
grid_to_masses = dict()
grid_to_luminosities = dict()
snapzs = grid.snap(z)
for snapz in grid.redshifts:
grid_to_masses[snapz] = []
grid_to_luminosities[snapz] = []
for i in xrange(len(mass)):
grid_to_masses[snapzs[i]].append(mass[i])
grid_to_luminosities[snapzs[i]].append(lum[i])
self.grid_to_samples = dict()
for snapz in grid.redshifts:
self.grid_to_samples[snapz] = np.array([grid_to_masses[snapz], \
grid_to_luminosities[snapz]]) \
.transpose()
def get_mass(self, z):
snapz = self.grid.snap(z)
return self.grid_to_samples[snapz][:, 0]
def get_lum(self, z):
snapz = self.grid.snap(z)
return self.grid_to_samples[snapz][:, 1]
class LikelihoodTest():
def __init__(self, grid, prior, samples, lum_obs, z):
self.grid = grid
self.prior = prior
self.samples = samples
self.lum_obs = lum_obs
self.z = z
self.snapz = grid.snap(z)
self.sigma_L = 0.05
def true_log_likelihood(self, alpha1, alpha2, alpha3, alpha4, S):
ret = 0
for i in xrange(len(self.lum_obs)):
if i % 1000 == 0:
print i
# 2d integral (mass, lum)
lum_obs = self.lum_obs[i]
snapz = self.snapz[i]
mass = self.samples.get_mass(snapz)
n = len(mass)
if n == 0:
continue
lum_mu = np.exp(alpha1) * ((mass / alpha3) ** alpha2) * ((1 + snapz) ** alpha4)
lum = lognorm(S, scale=lum_mu).rvs(n)
val = np.sum(lognorm(self.sigma_L, scale=lum).pdf(lum_obs))
ret += np.log(val) - np.log(n)
return ret
def naive_log_likelihood(self, alpha1, alpha2, alpha3, alpha4, S):
raise NotImplementedError()
def importance_sampling_log_likelihood(self, alpha1, alpha2, alpha3, alpha4, S):
raise NotImplementedError()
# test
np.random.seed(0)
alpha1 = norm(10.709, 0.022).rvs()
alpha2 = norm(0.359, 0.009).rvs()
alpha3 = 2.35e14
alpha4 = norm(1.10, 0.06).rvs()
S = norm(0.155, 0.0009).rvs()
data = pd.read_csv('/Users/user/Code/PanglossNotebooks/MassLuminosityProject/mock_data.csv')
grid = Grid()
prior = TinkerPrior(grid)
samples = DataSamples(grid, data.mass.as_matrix(), data.lum.as_matrix(), data.z.as_matrix())
# test = LikelihoodTest(grid, prior, samples, data.lum_obs.as_matrix(), data.z.as_matrix())
# test.true_log_likelihood(alpha1, alpha2, alpha3, alpha4, S)
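A quick illustration of the binning (values shown are illustrative): Grid.snap maps each redshift up to the first grid point at or above it (capped at the last bin), and DataSamples then groups the mock galaxies by that snapped redshift.
print grid.redshifts[:4]                          # first few of the 20 grid points
print grid.snap(np.array([0.0, 0.1, 0.5, 3.4]))   # each z snaps up to a grid point
print samples.get_mass(0.5).shape, samples.get_lum(0.5).shape   # galaxies in the bin containing z=0.5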
In [19]:
keys = np.sort(samples.grid_to_samples.keys())
counts = map(lambda k: samples.grid_to_samples[k].shape[0], keys)
plt.plot(keys, counts, 'o')
plt.xlabel('redshift')
plt.ylabel('sample count')
plt.title('Sample Distribution');
The likelihood under hyperparameters H for a single galaxy is
$$\iint P(L_{obs}|L)\,P(L|M,z,H)\,P(M|z)\,dM\,dL$$We can generate M-L samples from $P(L|M,z,H)\,P(M|z)$. We use the true mass samples at fixed redshift and generate luminosities for each one. Let's call the set of these M-L samples $T(z,H)$, since the samples depend on z and H. Then we use Monte Carlo integration
$$\iint P(L_{obs}|L)\,dP(L,M|z,H) \approx \frac{1}{N_{T(z,H)}} \sum_{M,L,L_{obs} \sim T(z,H)} P(L_{obs}|L)$$Generalizing this to each galaxy in G we have
$$\mathcal{L} \approx \prod_{g \in G} \left[\frac{1}{N_{T(z^g,H)}} \sum_{M^g,L^g,L_{obs}^g \sim T(z^g,H)} P(L_{obs}^g|L^g) \right]$$and taking the log
$$\ell \approx \sum_{g \in G} -\log({N_{T(z^g,H)}}) + \log\left[ \sum_{M^g,L^g,L_{obs}^g \sim T(z^g,H)} P(L_{obs}^g|L^g) \right]$$... but then we never use the 'true' L samples - I guess this is OK.
The only difference is the sampling distribution, specifically how we sample mass. Now we draw mass from the Tinker prior. Let $N(z^g,H)$ denote this sampling. Then we have
$$\ell \approx \sum_{g \in G} -\log({N_{N(z^g,H)}}) + \log\left[ \sum_{M^g,L^g,L_{obs}^g \sim N(z^g,H)} P(L_{obs}^g|L^g) \right]$$Now we draw M-L tuples from a biased proposal distribution whose density $Q(M^g,L^g,L_{obs}^g,z^g,H)$ we must be able to evaluate; call the resulting set of samples $B(z^g,H)$. Then we have
$$\ell \approx \sum_{g \in G} -\log({N_{B(z^g,H)}}) + \log\left[ \sum_{M^g,L^g,z^g,L_{obs}^g \sim B(z^g,H)} \frac{P(L_{obs}^g|L^g)P(L^g|M^g,z^g,H)P(M^g|z^g)}{Q(M^g,L^g,L_{obs}^g,z^g,H)} \right]$$If we compute samples and weights once per redshift bin, the only thing that changes between galaxies in a bin is $P(L^g_{obs}|L^g)$, so we iterate through one redshift bin at a time. This reduces the computation by a factor of 2 (see the experiment below).
For reference, the first version, implemented above as true_log_likelihood, is the simple Monte Carlo estimator over $T(z^g,H)$:
$$\ell \approx \sum_{g \in G} -\log({N_{T(z^g,H)}}) + \log\left[ \sum_{M^g,L^g,L_{obs}^g \sim T(z^g,H)} P(L_{obs}^g|L^g) \right]$$
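The importance-sampling estimator in the last formula is still a stub above (importance_sampling_log_likelihood raises NotImplementedError), so here is a minimal sketch of how the weights would enter. It assumes a hypothetical proposal object with an rvs_and_logpdf(z, size) method returning mass and luminosity draws together with their log-density $\log Q$; lognorm_log_eval and logsumexp are the helpers defined and imported further down in this notebook. This is illustrative only, not the final implementation.
def importance_sampling_log_likelihood_sketch(lum_obs, snapz, proposal, prior,
                                              alpha1, alpha2, alpha3, alpha4, S,
                                              sigma_L=0.05, n_samples=1000):
    # Sketch of the last formula: draw (M, L) from a biased proposal Q for each
    # galaxy and reweight by P(L_obs|L) P(L|M,z,H) P(M|z) / Q, all in log space.
    ret = 0
    for i in xrange(len(lum_obs)):
        z = snapz[i]
        # hypothetical proposal API: draws plus their log-density log Q
        mass, lum, log_q = proposal.rvs_and_logpdf(z, n_samples)
        lum_mu = np.exp(alpha1) * ((mass / alpha3) ** alpha2) * ((1 + z) ** alpha4)
        log_w = (lognorm_log_eval(lum, sigma_L, lum_obs[i])   # log P(L_obs | L)
                 + lognorm_log_eval(lum_mu, S, lum)           # log P(L | M, z, H)
                 + np.log(prior.fetch(z)._pdf(mass))          # log P(M | z)
                 - log_q)                                     # minus log Q
        ret += logsumexp(log_w) - np.log(n_samples)
    return ret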
In [20]:
mass = data.mass
snapz = 10
lum_obs = data.lum_obs
%time lum_mu = np.exp(alpha1) * ((mass / alpha3) ** alpha2) * ((1 + snapz) ** alpha4)
%time lum = lognorm(S, scale=lum_mu).rvs()
%time val = lognorm(0.05, scale=lum).pdf(lum_obs)
In [21]:
%time dist = lognorm(0.05, scale=lum)
%time val = dist.pdf(lum_obs);
In [22]:
%time dist = norm(lum)
%time val = dist.pdf(lum_obs);
In [23]:
%time val = lum_obs * lum_obs
In [29]:
import numpy as np
from numba import jit
a = np.random.rand(10 ** 6)
b = np.random.rand(10 ** 6)
c = np.random.rand(10 ** 6)
@jit
def foo(x):
    # jit-compiled version
    tmp = b + x
    tmp2 = tmp ** 2
    return tmp2

def fool(x):
    # plain NumPy version for comparison
    tmp = b + x
    tmp2 = tmp ** 2
    return tmp2

%time x = fool(c);
%time y = foo(c);
Motivation for using numba:
In [37]:
import numpy as np
from numba import jit
nobs = 10 ** 7
def proc_numpy(x,y,z):
x = x*2 - ( y * 55 ) # these 4 lines represent use cases
y = x + y*2 # where the processing time is mostly
z = x + y + 99 # a function of, say, 50 to 200 lines
z = z * ( z - .88 ) # of fairly simple numerical operations
return z
@jit(nogil=True, nopython=True, cache=True)
def proc_numba(xx,yy,zz):
for j in range(nobs): # as pointed out by Llopis, this for loop
x, y = xx[j], yy[j] # is not needed here. it is here by
# accident because in the original benchmarks
x = x*2 - ( y * 55 ) # I was doing data creation inside the function
y = x + y*2 # instead of passing it in as an array
z = x + y + 99 # in any case, this redundant code seems to
z = z * ( z - .88 ) # have something to do with the code running
# faster. without the redundant code, the
zz[j] = z # numba and numpy functions are exactly the same.
return zz
x = np.random.randn(nobs)
y = np.random.randn(nobs)
z = np.zeros(nobs)
%time res_numpy = proc_numpy(x,y,z)
z = np.zeros(nobs)
%time res_numba = proc_numba(x,y,z)
This is why I am holding off from using logsumexp for now:
In [111]:
from scipy.misc import logsumexp
for i in xrange(8):
r = np.random.rand(10**i)
print i
%time tmp = np.log(np.sum(r))
%time tmp = logsumexp(np.log(r));
%time tmp = np.log(np.sum(np.exp(np.log(r))))
In [121]:
times1 = [68e-6, 50e-6, 38e-6, 87e-6, 34e-6, 123e-6, 645e-6, 14e-3]
times2 = [201e-6, 216e-6, 226e-6, 394e-6, 599e-6, 2.6e-3, 44.7e-3, 406e-3]
times3 = [64e-6, 55e-6, 59e-6, 55e-6, 224e-6, 2.09e-3, 33.6e-3, 234e-3]
size = map(lambda x : 10**x , range(8))
plt.plot(size, times1, label='np.log(np.sum(r))')
plt.plot(size, times2, label='logsumexp(np.log(r))')
plt.plot(size, times3, label='np.log(np.sum(np.exp(np.log(r))))')
plt.gca().set_yscale('log')
plt.gca().set_xscale('log')
plt.xlabel('input size')
plt.ylabel('time (seconds)')
for i in xrange(len(counts)):
if counts[i] < 1:
continue
plt.gca().axvline(counts[i], color='k', alpha=0.4)
plt.legend(loc=2)
plt.title('Scaling logsumexp Alternatives');
Not seeing any performance improvement with the following:
@jit(nogil=True, cache=True)
def true_log_likelihood(self, alpha1, alpha2, alpha3, alpha4, S):
ret = 0
for i in xrange(len(self.lum_obs)):
# if i % 1000 == 0:
# print i
# 2d integral (mass, lum)
lum_obs = self.lum_obs[i]
snapz = self.snapz[i]
mass = self.samples.get_mass(snapz)
n = len(mass)
lum_mu = np.exp(alpha1) * ((mass / alpha3) ** alpha2) * ((1 + snapz) ** alpha4)
lum = lognorm(S, scale=lum_mu).rvs(n)
val = np.sum(lognorm(self.sigma_L, scale=lum).pdf(lum_obs))
ret += np.log(val) - np.log(n)
return ret
This is largely because this block depends on Python objects from NumPy and SciPy, so numba compiles it in object mode rather than the much faster nopython mode.
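A minimal illustration of the distinction (toy functions, names are illustrative, not part of the likelihood code): a plain NumPy expression can be typed by numba and compiled in nopython mode, while a function that builds scipy.stats objects cannot, so it runs in object mode (or raises a typing error in newer numba releases).
from numba import jit
import numpy as np
from scipy.stats import lognorm

@jit(nopython=True)
def pure_numpy(x):
    # NumPy arrays and ufuncs only: numba compiles this in fast nopython mode
    return np.exp(-0.5 * x ** 2)

@jit
def calls_scipy(x):
    # the frozen lognorm distribution is a Python object numba cannot type, so this
    # falls back to object mode (older numba) or errors under nopython compilation
    return lognorm(0.5, scale=x).pdf(1.0)

xs = np.random.rand(1000)
pure_numpy(xs)    # nopython-compiled
calls_scipy(xs)   # object mode at best
Returning to the line profile of true_log_likelihood: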
Timer unit: 1e-06 s
Total time: 9.74685 s
File: /Users/user/Code/Scratch/first_likelihood.py
Function: true_log_likelihood at line 131
Line # Hits Time Per Hit % Time Line Contents
==============================================================
131 @profile
132 def true_log_likelihood(self, alpha1, alpha2, alpha3, alpha4, S):
133 1 2 2.0 0.0 ret = 0
134 4001 3511 0.9 0.0 for i in xrange(4*10**3):#len(self.lum_obs)):
135 4000 6118 1.5 0.1 lum_obs = self.lum_obs[i]
136 4000 3570 0.9 0.0 snapz = self.snapz[i]
137 4000 61314 15.3 0.6 lum = self.samples.get_lum(snapz)
138 4000 9638661 2409.7 98.9 val = np.sum(lognorm(self.sigma_L, scale=lum).pdf(lum_obs))
139 4000 33671 8.4 0.3 ret += np.log(val) - np.log(len(lum))
140 1 0 0.0 0.0 return ret
We are spending most of our time in the lognormal pdf evaluation. Below I show that the lognormal pdf evaluation is already fairly well vectorized: the ratio of the lognormal pdf runtime to the array-addition runtime (about 96) is close to the ratio of the pdf runtime to a single scalar addition (about 80). If the pdf were poorly vectorized, the array ratio would be far larger than the scalar one.
In [81]:
a = np.random.rand(10 ** 7)
%time dist = lognorm(0.5, scale=a)
%time dist.pdf(0.5)
%time a + a
Out[81]:
In [73]:
%time dist = lognorm(0.5, 0.5)
%time dist.pdf(0.5)
%time 0.5 + 0.5
Out[73]:
In [77]:
160/2
Out[77]:
In [76]:
(2.04 / 21.1) * 10 ** 3
Out[76]:
Next I tried significantly lowering the number of samples per redshift bin. I brought them down from their true values (approximately $10^3-10^4$, see the plot above) to 10. When I reran the code I only saw a performance improvement of a factor of $\textbf{2}$! Line profiling confirms that over 98% of the time is still spent in scipy.stats.lognorm.
Line # Hits Time Per Hit % Time Line Contents
==============================================================
131 @profile
132 def true_log_likelihood(self, alpha1, alpha2, alpha3, alpha4, S):
133 1 2 2.0 0.0 ret = 0
134 40001 34984 0.9 0.0 for i in xrange(40*10**3):#len(self.lum_obs)):
135 40000 60362 1.5 0.1 lum_obs = self.lum_obs[i]
136 40000 34534 0.9 0.0 snapz = self.snapz[i]
137 40000 578745 14.5 0.8 lum = self.samples.get_lum(snapz)
138 40000 41626 1.0 0.1 if len(lum) == 0:
139 continue
140 40000 50288 1.3 0.1 lum = lum[:10]
141 40000 70418865 1760.5 98.5 val = np.sum(lognorm(self.sigma_L, scale=lum).pdf(lum_obs))
142 40000 300340 7.5 0.4 ret += np.log(val) - np.log(len(lum))
143 1 1 1.0 0.0 return ret
Next we time lognorm evaluations at different input sizes and plot them below. We also plot the redshift bin sizes to show that we cannot squeeze out much performance by shrinking the bins. This is still consistent with the ratio test above, because there we compared at an input size of $10^7$, where we start to see linear scaling.
In [82]:
for i in xrange(8):
a = np.random.rand(10 ** i)
dist = lognorm(0.5, scale=a)
print i
%time dist.pdf(0.5)
In [102]:
times = [665e-6, 745e-6, 336e-6, 524e-6, 2.34e-3, 52.5e-3, 272e-3, 2.43]
size = map(lambda x : 10**x , range(8))
plt.plot(size, times)
plt.gca().set_yscale('log')
plt.gca().set_xscale('log')
plt.xlabel('input size')
plt.ylabel('time (seconds)')
for i in xrange(len(counts)):
if counts[i] < 1:
continue
plt.gca().axvline(counts[i], color='k', alpha=0.4)
plt.title('Scaling scipy.stats.lognorm');
In [123]:
print 'abridged back of the envelope runtime: {} minutes'.format((745e-6 * len(lum_obs) / 60))
print 'standard back of the envelope runtime: {} minutes'.format((2.34e-3 * len(lum_obs) / 60))
In [155]:
print 'abridged runtime: {} minutes'.format(140.984743834 / 60)
print 'runtime: {} minutes'.format(268.072308064 / 60)
We also end up with -inf answers, suggesting that we may need to use logsumexp despite its poor performance.
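A small illustration of why the -inf appears and how the logsumexp trick avoids it (toy numbers, not from the data): when every term underflows, np.log(np.sum(np.exp(...))) returns -inf, while shifting by the maximum log-term keeps the result finite.
log_p = np.array([-800.0, -805.0, -810.0])    # toy log-pdf values
print np.log(np.sum(np.exp(log_p)))           # exp underflows to zero -> -inf
m = log_p.max()
print m + np.log(np.sum(np.exp(log_p - m)))   # logsumexp trick: finite, about -800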
In [150]:
class LogNorm():
def __init__(self, mu, sigma):
self.mu = mu
self.sigma = sigma
def pdf(self, x):
return (1 / (x * self.sigma * np.sqrt(2 * np.pi))) * np.exp(-0.5 * ((np.log(x) - np.log(self.mu)) ** 2 / self.sigma ** 2))
In [131]:
for i in xrange(8):
a = np.random.rand(10 ** i)
dist = LogNorm(0.5, a)
%time dist.pdf(0.5)
In [166]:
times1 = [665e-6, 745e-6, 336e-6, 524e-6, 2.34e-3, 52.5e-3, 272e-3, 2.43]
times2 = [115e-6, 86e-6, 131e-6, 60e-6, 418e-6, 2.34e-3, 43e-3, 372e-3]
size = map(lambda x : 10**x , range(8))
plt.plot(size, times1, label='scipy')
plt.plot(size, times2, label='raw')
plt.gca().set_yscale('log')
plt.gca().set_xscale('log')
plt.xlabel('input size')
plt.ylabel('time (seconds)')
for i in xrange(len(counts)):
if counts[i] < 1:
continue
plt.gca().axvline(counts[i], color='k', alpha=0.4)
plt.legend(loc=2)
plt.title('Scaling scipy.stats.lognorm');
In [153]:
#confirm same
print LogNorm(10,1).pdf(10)
print lognorm(1,scale=10).pdf(10)
After making this change the runtime drops dramatically.
In [157]:
print 'new abridged runtime: {} minutes'.format(5.14367508888 / 60)
print 'new runtime: {} minutes'.format(26.4348368645 / 60)
New profile below:
Total time: 27.1258 s
File: /Users/user/Code/Scratch/first_likelihood.py
Function: true_log_likelihood at line 135
Line # Hits Time Per Hit % Time Line Contents
==============================================================
135 @profile
136 def true_log_likelihood(self, alpha1, alpha2, alpha3, alpha4, S):
137 1 3 3.0 0.0 ret = 0
138 115920 75309 0.6 0.3 for i in xrange(len(self.lum_obs)):
139 115919 118972 1.0 0.4 lum_obs = self.lum_obs[i]
140 115919 86693 0.7 0.3 snapz = self.snapz[i]
141 115919 1133916 9.8 4.2 lum = self.samples.get_lum(snapz)
142 115919 105045 0.9 0.4 if len(lum) == 0:
143 continue
144 115919 24918347 215.0 91.9 val = np.sum(lognorm_eval(lum, self.sigma_L, lum_obs))
145 115919 687488 5.9 2.5 ret += np.log(val) - np.log(len(lum))
146 1 0 0.0 0.0 return ret
Now we try scipy.misc.logsumexp together with lognorm_log_eval. The result is exactly the same (-1096670.13887), but the logsumexp version is a bit slower.
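For reference, the two versions agree exactly because
$$\log\left[\frac{1}{N}\sum_{i=1}^{N} p_i\right] = \operatorname{logsumexp}(\log p_1, \ldots, \log p_N) - \log N,$$so moving to log space only changes how the sum is evaluated, not its value.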
Total time: 40.8048 s
File: /Users/user/Code/Scratch/first_likelihood.py
Function: true_log_likelihood at line 141
Line # Hits Time Per Hit % Time Line Contents
==============================================================
141 @profile
142 def true_log_likelihood(self, alpha1, alpha2, alpha3, alpha4, S):
143 1 3 3.0 0.0 ret = 0
144 115920 94799 0.8 0.2 for i in xrange(len(self.lum_obs)):
145 115919 152595 1.3 0.4 lum_obs = self.lum_obs[i]
146 115919 99825 0.9 0.2 snapz = self.snapz[i]
147 115919 1496449 12.9 3.7 lum = self.samples.get_lum(snapz)
148 115919 116669 1.0 0.3 if len(lum) == 0:
149 continue
150 115919 38199989 329.5 93.6 val = logsumexp(lognorm_log_eval(lum, self.sigma_L, lum_obs))
151 115919 644478 5.6 1.6 ret += val - np.log(len(lum))
152 1 1 1.0 0.0 return ret
In [171]:
import hmf
import scipy.interpolate as interpolate
import pandas as pd
import numpy as np
from scipy.stats import lognorm, norm
from scipy.misc import logsumexp
class Grid():
"""
Manages redshift bins.
"""
def __init__(self, mmin=0, mmax=3.5, nbins=20):
self.redshifts = np.linspace(mmin, mmax, nbins)
self.nbins = nbins
def snap(self, z):
ind = np.minimum(self.nbins - 1, np.searchsorted(self.redshifts, z))
return self.redshifts[ind]
class MassPrior():
"""
Class to represent mass prior distribution.
Initialize with mass and probability arrays.
Can both evaluate and sample from distribution.
"""
def __init__(self, mass, prob):
self.mass = mass
self.prob = prob
self.min = mass.min()
self.max = mass.max()
# have to add 0,1 samples for interpolation bounds
cumsum = np.append(np.append(np.array([0]), np.cumsum(prob)), np.array([1]))
masses = np.append(np.append(np.array([self.min - 1]), self.mass), np.array([self.max + 1]))
self.inv_cdf = interpolate.interp1d(cumsum, masses)
def pdf(self, mass):
ret = self._pdf(mass)
if (len(mass) > 1):
ret = np.prod(ret)
return ret
def logpdf(self, mass):
ret = np.log(self._pdf(mass))
if (len(mass) > 1):
ret = np.sum(ret)
return ret
def _pdf(self, mass):
if np.any(mass < self.min) or np.any(mass > self.max):
raise Exception('out of range')
right_ind = np.searchsorted(self.mass, mass)
left_ind = right_ind - 1
# find where we fall in interval between masses
f = (mass - self.mass[left_ind]) / (self.mass[right_ind] - self.mass[left_ind])
return f * self.prob[right_ind] + (1 - f) * self.prob[left_ind]
def rvs(self, size=1):
return self.inv_cdf(np.random.rand(size))
class TinkerPrior():
def __init__(self, grid, h = 0.73, Mmin = 10.2358590918, Mmax = 14.3277327776):
self.grid = grid
self.grid_to_prior = dict()
for z in grid.redshifts:
mf = hmf.MassFunction(z=z, Mmin=Mmin, Mmax=Mmax, \
cosmo_model=hmf.cosmo.WMAP5, \
hmf_model=hmf.fitting_functions.Tinker10)
self.grid_to_prior[z] = MassPrior(mf.m * h, mf.dndm / sum(mf.dndm))
def fetch(self, z):
return self.grid_to_prior[self.grid.snap(z)]
def pdf(self, mass, z):
ret = self.fetch(z)._pdf(mass)
if (len(mass) > 1):
ret = np.prod(ret)
return ret
def logpdf(self, mass, z):
ret = np.log(self.fetch(z)._pdf(mass))
if (len(mass) > 1):
ret = np.sum(ret)
return ret
def rvs(self, z, size=1):
return self.fetch(z).inv_cdf(np.random.rand(size))
class DataSamples():
def __init__(self, grid, mass, lum, z):
self.grid = grid
grid_to_masses = dict()
grid_to_luminosities = dict()
snapzs = grid.snap(z)
for snapz in grid.redshifts:
grid_to_masses[snapz] = []
grid_to_luminosities[snapz] = []
for i in xrange(len(mass)):
grid_to_masses[snapzs[i]].append(mass[i])
grid_to_luminosities[snapzs[i]].append(lum[i])
self.grid_to_samples = dict()
for snapz in grid.redshifts:
self.grid_to_samples[snapz] = np.array([grid_to_masses[snapz], \
grid_to_luminosities[snapz]]) \
.transpose()
def get_mass(self, z):
snapz = self.grid.snap(z)
return self.grid_to_samples[snapz][:, 0]
def get_lum(self, z):
snapz = self.grid.snap(z)
return self.grid_to_samples[snapz][:, 1]
def lognorm_eval(mu, sigma, x):
return (1 / (x * sigma * np.sqrt(2 * np.pi))) * np.exp(
-0.5 * ((np.log(x) - np.log(mu)) ** 2 / sigma ** 2))
def lognorm_log_eval(mu, sigma, x):
return -np.log(x * sigma * np.sqrt(2 * np.pi)) \
-0.5 * ((np.log(x) - np.log(mu)) ** 2 / sigma ** 2)
class LikelihoodTest(object):
def __init__(self, grid, prior, samples, lum_obs, z):
self.grid = grid
self.prior = prior
self.samples = samples
self.lum_obs = lum_obs
self.z = z
self.snapz = grid.snap(z)
self.sigma_L = 0.05
def true_log_likelihood(self):
ret = 0
for i in xrange(len(self.lum_obs)):
lum_obs = self.lum_obs[i]
snapz = self.snapz[i]
lum = self.samples.get_lum(snapz)
if len(lum) == 0:
continue
val = logsumexp(lognorm_log_eval(lum, self.sigma_L, lum_obs))
ret += val - np.log(len(lum))
return ret
def naive_log_likelihood(self):
raise NotImplementedError()
def biased_log_likelihood(self, a1, a2, a3, a4, S):
ret = 0
for i in xrange(len(self.lum_obs)):
lum_obs = self.lum_obs[i]
snapz = self.snapz[i]
lum = self.samples.get_lum(snapz)
if len(lum) == 0:
continue
val = np.sum(lognorm(self.sigma_L, scale=lum).pdf(lum_obs))
ret += np.log(val) - np.log(len(lum))
return ret
In [172]:
data = pd.read_csv('/Users/user/Code/PanglossNotebooks/MassLuminosityProject/mock_data.csv')
grid = Grid()
prior = TinkerPrior(grid)
z = data.z.as_matrix()
lum_obs = data.lum_obs.as_matrix()
mass = data.mass.as_matrix()
for seed in xrange(6):
np.random.seed(seed)
alpha1 = norm(10.709, 0.022).rvs()
alpha2 = norm(0.359, 0.009).rvs()
alpha3 = 2.35e14
alpha4 = norm(1.10, 0.06).rvs()
S = norm(0.155, 0.0009).rvs()
S_M = 5.6578015811698101 * S
sigma_L = 0.05
mu_lum = np.exp(alpha1) * ((mass / alpha3) ** (alpha2))* ((1+z) ** (alpha4))
lum = lognorm(S, scale=mu_lum).rvs()
samples = DataSamples(grid, mass, lum, z)
test = LikelihoodTest(grid, prior, samples, lum_obs, z)
log_like = test.true_log_likelihood()
print 'seed: {}, log-likelihood: {}'.format(seed, log_like)
In [183]:
size = 10**6
sigma = .1
mean = 10
%time a = np.random.lognormal(mean=np.log(mean), sigma=sigma, size=size)
%time b = lognorm.rvs(sigma, scale=mean, size=size)
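These two parametrizations coincide: np.random.lognormal(mean=np.log(m), sigma=s) and lognorm(s, scale=m) both draw $X$ with $\ln X \sim \mathcal{N}(\ln m, s^2)$, which is why the histograms and means compared below should agree.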
In [184]:
plt.hist(a, alpha=0.2)
plt.hist(b, alpha=0.2, color='red')
Out[184]:
In [185]:
a.mean()
Out[185]:
In [186]:
b.mean()
Out[186]: