The PGM

For more an introduction to PGMS see Daphne Koller's Probabilistic Graphical Models. Below is the PGM that we will explore in this notebook.



In [1]:

    
%matplotlib inline

from matplotlib import rc
rc("font", family="serif", size=14)
rc("text", usetex=True)

import daft

pgm = daft.PGM([7, 6], origin=[0, 0])

#background nodes
pgm.add_plate(daft.Plate([0.5, 3.0, 5, 2], label=r"foreground galaxy $i$",
    shift=-0.1))
pgm.add_node(daft.Node("theta", r"$\theta$", 3.5, 5.5, fixed=True))
pgm.add_node(daft.Node("alpha", r"$\alpha$", 1.5, 5.5, fixed=True))
pgm.add_node(daft.Node("halo_mass", r"$M_i$", 3.5, 4, scale=2))
pgm.add_node(daft.Node("background_z", r"$z_i$", 2, 4, fixed=True))
pgm.add_node(daft.Node("concentration", r"$c_i$", 1.5, 3.5, fixed=True))
pgm.add_node(daft.Node("background_x", r"$x_i$", 1.0, 3.5, fixed=True))

#foreground nodes
pgm.add_plate(daft.Plate([0.5, 0.5, 5, 2], label=r"background galaxy $j$",
    shift=-0.1))
pgm.add_node(daft.Node("reduced_shear", r"$g_j$", 2.0, 1.5, fixed=True))
pgm.add_node(daft.Node("reduced_shear", r"$g_j$", 2.0, 1.5, fixed=True))
pgm.add_node(daft.Node("foreground_z", r"$z_j$", 1.0, 1.5, fixed=True))
pgm.add_node(daft.Node("foreground_x", r"$x_j$", 1.0, 1.0, fixed=True))
pgm.add_node(daft.Node("ellipticities", r"$\epsilon_j^{obs}$", 4.5, 1.5, observed=True, scale=2))

#outer nodes
pgm.add_node(daft.Node("sigma_obs", r"$\sigma_{\epsilon_j}^{obs}$", 3.0, 2.0, fixed=True))
pgm.add_node(daft.Node("sigma_int", r"$\sigma_{\epsilon}^{int}$", 6.0, 1.5, fixed=True))

#edges
pgm.add_edge("foreground_z", "reduced_shear")
pgm.add_edge("foreground_x", "reduced_shear")
pgm.add_edge("reduced_shear", "ellipticities")
pgm.add_edge("sigma_obs", "ellipticities")
pgm.add_edge("sigma_int", "ellipticities")
pgm.add_edge("concentration", "reduced_shear")
pgm.add_edge("halo_mass", "concentration")
pgm.add_edge("background_z", "concentration")
pgm.add_edge("background_x", "reduced_shear")
pgm.add_edge("alpha", "concentration")
pgm.add_edge("theta", "halo_mass")
pgm.render()









    Out[1]:





<matplotlib.axes._axes.Axes at 0x112aaf3d0>

We have sets of foregrounds and backgrounds along with the variables

$\alpha$: parameters in the concentration function (which is a function of $z_i,M_i$)
$\theta$: prior distribution of halo masses
$z_i$: foreground galaxy redshift
$x_i$: foreground galaxy angular coordinates
$z_j$: background galaxy redshift
$x_j$: background galaxy angular coordinates
$g_j$: reduced shear
$\sigma_{\epsilon_j}^{obs}$: noise from our ellipticity measurement process
$\sigma_{\epsilon}^{int}$: intrinsic variance in ellipticities
$\epsilon_j^{obs}$: intrinsic variance in ellipticities

Stellar Mass Threshold



In [2]:

    
from pandas import read_table
from pangloss import GUO_FILE

m_h = 'M_Subhalo[M_sol/h]'
m_s = 'M_Stellar[M_sol/h]'

guo_data = read_table(GUO_FILE)
nonzero_guo_data= guo_data[guo_data[m_h] > 0]



In [5]:

    
import matplotlib.pyplot as plt

stellar_mass_threshold = 5.883920e+10
plt.scatter(nonzero_guo_data[m_h], nonzero_guo_data[m_s], alpha=0.05)
plt.axhline(y=stellar_mass_threshold, color='red')
plt.xlabel('Halo Mass')
plt.ylabel('Stellar Mass')
plt.title('SMHM Scatter')
plt.xscale('log')
plt.yscale('log')



In [7]:

    
from math import log
import numpy as np

start = log(nonzero_guo_data[m_s].min(), 10)
stop = log(nonzero_guo_data[m_s].max(), 10)

m_logspace = np.logspace(start, stop, num=20, base=10)[:-1]

m_corrs = []
thin_data = nonzero_guo_data[[m_s, m_h]]
for cutoff in m_logspace:
    tmp = thin_data[nonzero_guo_data[m_s] > cutoff]
    m_corrs.append(tmp.corr()[m_s][1])
    
plt.plot(m_logspace, m_corrs, label='correlation')
plt.axvline(x=stellar_mass_threshold, color='red', label='threshold')
plt.xscale('log')
plt.legend(loc=2)
plt.xlabel('Stellar Mass')
plt.ylabel('Stellar Mass - Halo Mass Correlation')
plt.title('SMHM Correlation')









    Out[7]:





<matplotlib.text.Text at 0x1116e76d0>



In [8]:

    
plt.rcParams['figure.figsize'] = (10, 6)
# plt.plot(hist[1][:-1], hist[0], label='correlation')
plt.hist(nonzero_guo_data[m_s], bins=m_logspace, alpha=0.4, normed=False, label='dataset')
plt.axvline(x=stellar_mass_threshold, color='red', label='threshold')
plt.xscale('log')
plt.legend(loc=2)
plt.xlabel('Stellar Mass')
plt.ylabel('Number of Samples')
plt.title('Stellar Mass Distribution')









    Out[8]:





<matplotlib.text.Text at 0x117168ad0>

Results



In [60]:

    
from pandas import read_csv

res = read_csv('data3.csv')
tru = read_csv('true3.csv')



In [61]:

    
start = min([res[res[c] > 0][c].min() for c in res.columns[1:-1]])
stop = res.max().max()



In [62]:

    
base = 10
start = log(start, base)
end = log(stop, base)
res_logspace = np.logspace(start, end, num=10, base=base)



In [63]:

    
plt.rcParams['figure.figsize'] = (20, 12)

for i,val in enumerate(tru.columns[1:]):
    plt.subplot(int('91' + str(i+1)))
    x = res[val][res[val] > 0]
    weights = np.exp(res['log-likelihood'][res[val] > 0])
    t = tru[val].loc[0]
    plt.hist(x, bins=res_logspace, alpha=0.4, normed=True, label='prior')
    plt.hist(x, bins=res_logspace, weights=weights, alpha=0.4, normed=True, label='posterior')
    plt.axvline(x=t, color='red', label='truth', linewidth=1)
    plt.xscale('log')
    plt.legend()
    plt.ylabel('PDF')
    plt.xlabel('Halo Mass (log-scale)')
    plt.title('Halo ID ' + val)
    
plt.show()



In [52]:

    
res.columns









    Out[52]:





Index([u'Unnamed: 0', u'112009306000027', u'37000046000024', u'37014522009504',
       u'419000237000015', u'436017617000017', u'456000089000011',
       u'456000089002589', u'65000614000013', u'74004925000017',
       u'82020319006828', u'97000066000025', u'log-likelihood'],
      dtype='object')



In [57]:

    
res[['112009306000027', 'log-likelihood']].sort('log-likelihood')









    



/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:1: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  if __name__ == '__main__':






    Out[57]:






  
    
      
      112009306000027
      log-likelihood
    
  
  
    
      977
      8.576450e+12
      -178.423575
    
    
      986
      1.501850e+13
      -178.183221
    
    
      985
      -1.000000e+00
      -178.124869
    
    
      975
      1.488250e+13
      -178.044443
    
    
      987
      1.779580e+13
      -178.024270
    
    
      978
      -1.000000e+00
      -178.020655
    
    
      980
      1.150700e+13
      -177.748564
    
    
      974
      9.686690e+12
      -177.729791
    
    
      976
      -1.000000e+00
      -177.704476
    
    
      988
      7.191650e+12
      -177.587342
    
    
      982
      1.783880e+13
      -177.492953
    
    
      979
      5.383410e+12
      -177.490239
    
    
      981
      2.985270e+13
      -177.484051
    
    
      968
      4.130290e+12
      -177.359943
    
    
      992
      -1.000000e+00
      -177.136031
    
    
      983
      5.783620e+12
      -177.124976
    
    
      989
      8.802800e+12
      -176.969482
    
    
      991
      1.322140e+13
      -176.937045
    
    
      984
      2.709090e+13
      -176.839741
    
    
      997
      1.605640e+13
      -176.761415
    
    
      990
      6.811240e+12
      -176.725611
    
    
      969
      5.423860e+12
      -176.690448
    
    
      993
      7.386160e+12
      -176.629576
    
    
      973
      4.645830e+12
      -176.530020
    
    
      971
      1.037690e+13
      -176.519389
    
    
      994
      4.182790e+12
      -176.397161
    
    
      999
      2.786810e+12
      -176.328150
    
    
      998
      3.411640e+12
      -176.198665
    
    
      996
      2.347530e+13
      -176.182316
    
    
      967
      2.577670e+12
      -176.108291
    
    
      ...
      ...
      ...
    
    
      29
      2.736890e+12
      -58.037736
    
    
      23
      4.658740e+12
      -57.606550
    
    
      24
      8.422390e+12
      -56.916415
    
    
      27
      2.633610e+12
      -56.572011
    
    
      28
      4.976320e+12
      -55.627443
    
    
      26
      2.391770e+12
      -55.161438
    
    
      20
      4.558560e+13
      -54.104697
    
    
      25
      1.276870e+13
      -53.493563
    
    
      21
      5.824670e+13
      -53.163409
    
    
      22
      1.514250e+14
      -52.878466
    
    
      19
      -1.000000e+00
      -52.073758
    
    
      18
      1.175400e+13
      -49.463063
    
    
      17
      3.510880e+13
      -46.778380
    
    
      16
      -1.000000e+00
      -46.161299
    
    
      15
      9.756410e+12
      -42.602203
    
    
      14
      5.296310e+13
      -40.795944
    
    
      9
      1.147690e+13
      -40.209745
    
    
      11
      -1.000000e+00
      -38.806833
    
    
      10
      -1.000000e+00
      -38.801589
    
    
      12
      2.993370e+12
      -38.623613
    
    
      8
      4.419470e+12
      -38.259628
    
    
      13
      8.964600e+12
      -37.579700
    
    
      7
      -1.000000e+00
      -33.722924
    
    
      6
      -1.000000e+00
      -28.659743
    
    
      5
      2.347530e+13
      -27.809463
    
    
      4
      1.865390e+13
      -22.064582
    
    
      3
      3.173240e+13
      -16.791430
    
    
      2
      -1.000000e+00
      -13.621775
    
    
      1
      1.015400e+13
      -8.590054
    
    
      0
      2.489620e+13
      -0.755797
    
  

1000 rows × 2 columns

Next Steps

Move forward with major software upgrades

Flying blind, guessing what will be important in future
Add importance sampling
Reduce duplication
Parallelize
Slowly transition code base to better software practices
Eliminate prototype code that is not core to module
At some point it becomes cheaper to rewrite code with high parallelization and good software practices instead of hacking in extra functinality to code that is poorly suited to our goal

Pivot to new question/milestone

Shrink foreground
See if results improve with 'unphysical', dense background
Hit the literature to see what questions have been answered and get ideas for cool project directions
Thanks Warren for pointing me to resources for learning weak lensing



In [ ]:

	112009306000027	log-likelihood
977	8.576450e+12	-178.423575
986	1.501850e+13	-178.183221
985	-1.000000e+00	-178.124869
975	1.488250e+13	-178.044443
987	1.779580e+13	-178.024270
978	-1.000000e+00	-178.020655
980	1.150700e+13	-177.748564
974	9.686690e+12	-177.729791
976	-1.000000e+00	-177.704476
988	7.191650e+12	-177.587342
982	1.783880e+13	-177.492953
979	5.383410e+12	-177.490239
981	2.985270e+13	-177.484051
968	4.130290e+12	-177.359943
992	-1.000000e+00	-177.136031
983	5.783620e+12	-177.124976
989	8.802800e+12	-176.969482
991	1.322140e+13	-176.937045
984	2.709090e+13	-176.839741
997	1.605640e+13	-176.761415
990	6.811240e+12	-176.725611
969	5.423860e+12	-176.690448
993	7.386160e+12	-176.629576
973	4.645830e+12	-176.530020
971	1.037690e+13	-176.519389
994	4.182790e+12	-176.397161
999	2.786810e+12	-176.328150
998	3.411640e+12	-176.198665
996	2.347530e+13	-176.182316
967	2.577670e+12	-176.108291
...	...	...
29	2.736890e+12	-58.037736
23	4.658740e+12	-57.606550
24	8.422390e+12	-56.916415
27	2.633610e+12	-56.572011
28	4.976320e+12	-55.627443
26	2.391770e+12	-55.161438
20	4.558560e+13	-54.104697
25	1.276870e+13	-53.493563
21	5.824670e+13	-53.163409
22	1.514250e+14	-52.878466
19	-1.000000e+00	-52.073758
18	1.175400e+13	-49.463063
17	3.510880e+13	-46.778380
16	-1.000000e+00	-46.161299
15	9.756410e+12	-42.602203
14	5.296310e+13	-40.795944
9	1.147690e+13	-40.209745
11	-1.000000e+00	-38.806833
10	-1.000000e+00	-38.801589
12	2.993370e+12	-38.623613
8	4.419470e+12	-38.259628
13	8.964600e+12	-37.579700
7	-1.000000e+00	-33.722924
6	-1.000000e+00	-28.659743
5	2.347530e+13	-27.809463
4	1.865390e+13	-22.064582
3	3.173240e+13	-16.791430
2	-1.000000e+00	-13.621775
1	1.015400e+13	-8.590054
0	2.489620e+13	-0.755797