The PGM

For more an introduction to PGMS see Daphne Koller's Probabilistic Graphical Models. Below is the PGM that we will explore in this notebook.



In [1]:

%matplotlib inline

from matplotlib import rc
rc("font", family="serif", size=14)
rc("text", usetex=True)

import daft

pgm = daft.PGM([7, 6], origin=[0, 0])

#background nodes
pgm.add_plate(daft.Plate([0.5, 3.0, 5, 2], label=r"foreground galaxy $i$",
shift=-0.1))
pgm.add_node(daft.Node("theta", r"$\theta$", 3.5, 5.5, fixed=True))
pgm.add_node(daft.Node("alpha", r"$\alpha$", 1.5, 5.5, fixed=True))
pgm.add_node(daft.Node("halo_mass", r"$M_i$", 3.5, 4, scale=2))
pgm.add_node(daft.Node("background_z", r"$z_i$", 2, 4, fixed=True))
pgm.add_node(daft.Node("concentration", r"$c_i$", 1.5, 3.5, fixed=True))
pgm.add_node(daft.Node("background_x", r"$x_i$", 1.0, 3.5, fixed=True))

#foreground nodes
pgm.add_plate(daft.Plate([0.5, 0.5, 5, 2], label=r"background galaxy $j$",
shift=-0.1))
pgm.add_node(daft.Node("reduced_shear", r"$g_j$", 2.0, 1.5, fixed=True))
pgm.add_node(daft.Node("reduced_shear", r"$g_j$", 2.0, 1.5, fixed=True))
pgm.add_node(daft.Node("foreground_z", r"$z_j$", 1.0, 1.5, fixed=True))
pgm.add_node(daft.Node("foreground_x", r"$x_j$", 1.0, 1.0, fixed=True))
pgm.add_node(daft.Node("ellipticities", r"$\epsilon_j^{obs}$", 4.5, 1.5, observed=True, scale=2))

#outer nodes
pgm.add_node(daft.Node("sigma_obs", r"$\sigma_{\epsilon_j}^{obs}$", 3.0, 2.0, fixed=True))
pgm.add_node(daft.Node("sigma_int", r"$\sigma_{\epsilon}^{int}$", 6.0, 1.5, fixed=True))

#edges
pgm.render()




Out[1]:

<matplotlib.axes._axes.Axes at 0x112aaf3d0>



We have sets of foregrounds and backgrounds along with the variables

• $\alpha$: parameters in the concentration function (which is a function of $z_i,M_i$)
• $\theta$: prior distribution of halo masses
• $z_i$: foreground galaxy redshift
• $x_i$: foreground galaxy angular coordinates
• $z_j$: background galaxy redshift
• $x_j$: background galaxy angular coordinates
• $g_j$: reduced shear
• $\sigma_{\epsilon_j}^{obs}$: noise from our ellipticity measurement process
• $\sigma_{\epsilon}^{int}$: intrinsic variance in ellipticities
• $\epsilon_j^{obs}$: intrinsic variance in ellipticities

Stellar Mass Threshold



In [2]:

from pangloss import GUO_FILE

m_h = 'M_Subhalo[M_sol/h]'
m_s = 'M_Stellar[M_sol/h]'

nonzero_guo_data= guo_data[guo_data[m_h] > 0]




In [5]:

import matplotlib.pyplot as plt

stellar_mass_threshold = 5.883920e+10
plt.scatter(nonzero_guo_data[m_h], nonzero_guo_data[m_s], alpha=0.05)
plt.axhline(y=stellar_mass_threshold, color='red')
plt.xlabel('Halo Mass')
plt.ylabel('Stellar Mass')
plt.title('SMHM Scatter')
plt.xscale('log')
plt.yscale('log')







In [7]:

from math import log
import numpy as np

start = log(nonzero_guo_data[m_s].min(), 10)
stop = log(nonzero_guo_data[m_s].max(), 10)

m_logspace = np.logspace(start, stop, num=20, base=10)[:-1]

m_corrs = []
thin_data = nonzero_guo_data[[m_s, m_h]]
for cutoff in m_logspace:
tmp = thin_data[nonzero_guo_data[m_s] > cutoff]
m_corrs.append(tmp.corr()[m_s][1])

plt.plot(m_logspace, m_corrs, label='correlation')
plt.axvline(x=stellar_mass_threshold, color='red', label='threshold')
plt.xscale('log')
plt.legend(loc=2)
plt.xlabel('Stellar Mass')
plt.ylabel('Stellar Mass - Halo Mass Correlation')
plt.title('SMHM Correlation')




Out[7]:

<matplotlib.text.Text at 0x1116e76d0>




In [8]:

plt.rcParams['figure.figsize'] = (10, 6)
# plt.plot(hist[1][:-1], hist[0], label='correlation')
plt.hist(nonzero_guo_data[m_s], bins=m_logspace, alpha=0.4, normed=False, label='dataset')
plt.axvline(x=stellar_mass_threshold, color='red', label='threshold')
plt.xscale('log')
plt.legend(loc=2)
plt.xlabel('Stellar Mass')
plt.ylabel('Number of Samples')
plt.title('Stellar Mass Distribution')




Out[8]:



Results



In [60]:




In [61]:

start = min([res[res[c] > 0][c].min() for c in res.columns[1:-1]])
stop = res.max().max()




In [62]:

base = 10
start = log(start, base)
end = log(stop, base)
res_logspace = np.logspace(start, end, num=10, base=base)




In [63]:

plt.rcParams['figure.figsize'] = (20, 12)

for i,val in enumerate(tru.columns[1:]):
plt.subplot(int('91' + str(i+1)))
x = res[val][res[val] > 0]
weights = np.exp(res['log-likelihood'][res[val] > 0])
t = tru[val].loc[0]
plt.hist(x, bins=res_logspace, alpha=0.4, normed=True, label='prior')
plt.hist(x, bins=res_logspace, weights=weights, alpha=0.4, normed=True, label='posterior')
plt.axvline(x=t, color='red', label='truth', linewidth=1)
plt.xscale('log')
plt.legend()
plt.ylabel('PDF')
plt.xlabel('Halo Mass (log-scale)')
plt.title('Halo ID ' + val)

plt.show()







In [52]:

res.columns




Out[52]:

Index([u'Unnamed: 0', u'112009306000027', u'37000046000024', u'37014522009504',
u'419000237000015', u'436017617000017', u'456000089000011',
u'456000089002589', u'65000614000013', u'74004925000017',
u'82020319006828', u'97000066000025', u'log-likelihood'],
dtype='object')




In [57]:

res[['112009306000027', 'log-likelihood']].sort('log-likelihood')




/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:1: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
if __name__ == '__main__':

Out[57]:

112009306000027
log-likelihood

977
8.576450e+12
-178.423575

986
1.501850e+13
-178.183221

985
-1.000000e+00
-178.124869

975
1.488250e+13
-178.044443

987
1.779580e+13
-178.024270

978
-1.000000e+00
-178.020655

980
1.150700e+13
-177.748564

974
9.686690e+12
-177.729791

976
-1.000000e+00
-177.704476

988
7.191650e+12
-177.587342

982
1.783880e+13
-177.492953

979
5.383410e+12
-177.490239

981
2.985270e+13
-177.484051

968
4.130290e+12
-177.359943

992
-1.000000e+00
-177.136031

983
5.783620e+12
-177.124976

989
8.802800e+12
-176.969482

991
1.322140e+13
-176.937045

984
2.709090e+13
-176.839741

997
1.605640e+13
-176.761415

990
6.811240e+12
-176.725611

969
5.423860e+12
-176.690448

993
7.386160e+12
-176.629576

973
4.645830e+12
-176.530020

971
1.037690e+13
-176.519389

994
4.182790e+12
-176.397161

999
2.786810e+12
-176.328150

998
3.411640e+12
-176.198665

996
2.347530e+13
-176.182316

967
2.577670e+12
-176.108291

...
...
...

29
2.736890e+12
-58.037736

23
4.658740e+12
-57.606550

24
8.422390e+12
-56.916415

27
2.633610e+12
-56.572011

28
4.976320e+12
-55.627443

26
2.391770e+12
-55.161438

20
4.558560e+13
-54.104697

25
1.276870e+13
-53.493563

21
5.824670e+13
-53.163409

22
1.514250e+14
-52.878466

19
-1.000000e+00
-52.073758

18
1.175400e+13
-49.463063

17
3.510880e+13
-46.778380

16
-1.000000e+00
-46.161299

15
9.756410e+12
-42.602203

14
5.296310e+13
-40.795944

9
1.147690e+13
-40.209745

11
-1.000000e+00
-38.806833

10
-1.000000e+00
-38.801589

12
2.993370e+12
-38.623613

8
4.419470e+12
-38.259628

13
8.964600e+12
-37.579700

7
-1.000000e+00
-33.722924

6
-1.000000e+00
-28.659743

5
2.347530e+13
-27.809463

4
1.865390e+13
-22.064582

3
3.173240e+13
-16.791430

2
-1.000000e+00
-13.621775

1
1.015400e+13
-8.590054

0
2.489620e+13
-0.755797

1000 rows × 2 columns



Next Steps

Move forward with major software upgrades

• Flying blind, guessing what will be important in future
• Reduce duplication
• Parallelize
• Slowly transition code base to better software practices
• Eliminate prototype code that is not core to module
• At some point it becomes cheaper to rewrite code with high parallelization and good software practices instead of hacking in extra functinality to code that is poorly suited to our goal

Pivot to new question/milestone

• Shrink foreground
• See if results improve with 'unphysical', dense background
• Hit the literature to see what questions have been answered and get ideas for cool project directions
• Thanks Warren for pointing me to resources for learning weak lensing


In [ ]: