In [1]:

    
import utils
import core
import os
import math
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib as plt
from utils.graphing import graph
import theano
from sampled import sampled
import pymc3 as pm
import theano.tensor as tt



In [2]:

    
%load_ext autoreload
%autoreload 2
%matplotlib inline



In [3]:

    
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:,.2f}'.format
plt.rcParams['figure.figsize'] = (16,12)



In [4]:

    
filepath = os.getcwd() + "/Data/07to1Redshift.csv"
test = core.FilteredFrame(filepath, percent_agn = ['BPT'], composite = False, include_upperlimit = True)
df = test.dataframe









    



Rows were removed based on health of: ['O_III', 'H_Beta', 'N_II', 'H_Alpha']
Data Frame info:
___________________
From file: /Users/christopherstewart/Documents/Research/Trump/AGNClassification/Data/07to1Redshift.csv 

Data Frame Size: (42239, 99)
___________________



In [6]:

    
columns = ['BPT:P(AGN)',
           'O_III','O_III_Sigma','O_III_Health',
           'N_II','N_II_Sigma','N_II_Health',
           'H_Beta','H_Beta_Sigma','H_Beta_Health',
           'H_Alpha','H_Alpha_Sigma','H_Alpha_Health',
          ]
lim_columns = ['BPT:P(AGN)','O_III_Health','N_II_Health', 'H_Beta_Health', 'H_Alpha_Health']
d_limits = {}
dd_limits = {}
oiii_ratio_lowLim = df['O_III_Health'].isin(['Limit']) & df['H_Beta_Health'].isin(['Healthy']) 
oiii_ratio_upLim = df['O_III_Health'].isin(['Healthy']) & df['H_Beta_Health'].isin(['Limit'])
nii_ratio_lowLim = df['N_II_Health'].isin(['Limit']) & df['H_Alpha_Health'].isin(['Healthy'])
nii_ratio_upLim = df['N_II_Health'].isin(['Healthy']) & df['H_Alpha_Health'].isin(['Limit'])
oiii_healthy = df['O_III_Health'].isin(['Healthy']) & df['H_Beta_Health'].isin(['Healthy']) 
nii_healthy = df['N_II_Health'].isin(['Healthy']) & df['H_Alpha_Health'].isin(['Healthy']) 

oiii_limit = df[(nii_healthy & oiii_ratio_lowLim)]
nii_limit = df[(oiii_healthy & nii_ratio_lowLim)]
beta_limit = df[(nii_healthy & oiii_ratio_upLim)]
alpha_limit = df[(oiii_healthy & nii_ratio_upLim)]
d_limits['OIII'] = oiii_limit.shape[0]
d_limits['NII'] = nii_limit.shape[0]
d_limits['HBeta'] = beta_limit.shape[0]
d_limits['HAlpha'] = alpha_limit.shape[0]

o_n_limit = df[oiii_ratio_lowLim & nii_ratio_lowLim]
o_a_limit = df[oiii_ratio_lowLim & nii_ratio_upLim]
b_n_limit = df[oiii_ratio_upLim & nii_ratio_lowLim]
b_a_limit = df[oiii_ratio_upLim & nii_ratio_upLim]
dd_limits['OIII_NII'] = o_n_limit.shape[0]
dd_limits['OIII_Alpha'] = o_a_limit.shape[0]
dd_limits['Beta_NII'] = b_n_limit.shape[0]
dd_limits['Beta_Alpha'] = b_a_limit.shape[0]
print(d_limits)
print(dd_limits)









    



{'OIII': 1152, 'NII': 468, 'HBeta': 1360, 'HAlpha': 76}
{'OIII_NII': 199, 'OIII_Alpha': 29, 'Beta_NII': 204, 'Beta_Alpha': 49}

Bayesian Inference of Censored SDSS data

By: Christopher Stewart

Overview

Introduce the problem
- Data Selection
- Issues with traditional approach
- Introduce the BPT
- Limit Cases
Handling Limits
- Creating the model
- Example Case
- Results
What next

Intro - Data Selection

Primary SQL quarry

Selected from redshift range of 0.07 to 0.1
error above zero for following lines
- $\text{N}_{[\text{II}]}$
- $\text{O}_{[\text{III}]}$
- $\text{He}_{[\text{II}]}$
- $\text{H}_\alpha$
- $\text{H}_\beta$

Total Objects: 42239

Sigma Filtering

Objects were removed from the survey following the condition:

$0 <\text{flux_err} < 1\times e^{-15}$

Intro - Issues with traditional approach



In [7]:

    
graph.BPT(df)

No limits in any ratio



In [8]:

    
no_limits = df[(oiii_healthy & nii_healthy)]
graph.BPT(no_limits)

How many object we lose if we only take well defined objects



In [9]:

    
print('Number of Objects\n-------------\nAll Data: {}\nWell Def: {}'.format(df.shape[0],no_limits.shape[0]))
print('Loss Percentage: {0:.3f}%'.format((1-no_limits.shape[0]/df.shape[0])*100))









    



Number of Objects
-------------
All Data: 42239
Well Def: 38191
Loss Percentage: 9.584%

Lose around 9.6% of our objects

Introduce the BPT

Each object on the BPT hase two ratios which are made up of two fluxes with individual sigma values.

For example a single object has the following data:

$\log_{10}(\frac{\text{O}_{\text{III}}}{\text{H}_{\beta}}) \rightarrow \left[ \begin{array}{ll} \text{O}_{\text{flux}} & \text{O}_{{\sigma}} \\ \text{H}\beta_{\text{flux}} & \text{H}\beta_{{\sigma}} \\ \end{array} \right]$

$\log_{10}(\frac{\text{N}_{\text{II}}}{\text{H}_{\alpha}}) \rightarrow \left[ \begin{array}{ll} \text{N}_{\text{flux}} & \text{N}_{{\sigma}} \\ \text{H}\alpha_{\text{flux}} & \text{H}\alpha_{{\sigma}} \\ \end{array} \right]$



In [10]:

    
print('OIII upLim {}\nOIII lowLim: {}\nNII upLim: {}\nNII lowLim: {}'.format(df[oiii_ratio_upLim & nii_healthy].shape[0],
                                                                            df[oiii_ratio_lowLim & nii_healthy].shape[0],
                                                                            df[nii_ratio_upLim & oiii_healthy].shape[0],
                                                                            df[nii_ratio_lowLim & oiii_healthy].shape[0]))









    



OIII upLim 1360
OIII lowLim: 1152
NII upLim: 76
NII lowLim: 468

Limit cases

Limit:

$\text{flux} < \text{flux error}$

Lower Limit:

Let $f = \frac{A}{B} \text{ where } B < \sigma_B \text{, so } f = \frac{A}{\sigma_B} \text{ is an lower limit.}$

Upper Limit:

Let $f = \frac{A}{B} \text{ where } A < \sigma_A \text{, so } f = \frac{\sigma_A}{B} \text{ is an upper limit.}$

1D Limit

Is a limit in only one ratio. For example a lower limit in x would have the following shape,

2D Limit

Is a limit in each ratio. For example a lower limit in x and y would have the following shape,

Number of Limit cases

1D Limits:



In [11]:

    
print(d_limits)









    



{'OIII': 1152, 'NII': 468, 'HBeta': 1360, 'HAlpha': 76}

2D Limits:



In [12]:

    
print(dd_limits)









    



{'OIII_NII': 199, 'OIII_Alpha': 29, 'Beta_NII': 204, 'Beta_Alpha': 49}

Handling Limits

Creating the model

Example Case:

Object has a limit in OIII so this is an upper limit in our y



In [54]:

    
test_obj = df.iloc[[21226]]
test_obj[['Log(O_III/H_Beta)','Log(O_III/H_Beta)_Sigma','Log(N_II/H_Alpha)','Log(N_II/H_Alpha)_Sigma', 'O_III/H_Beta', 'H_Beta', 'H_Beta_Sigma', 'O_III']]









    Out[54]:







  
    
      
      Log(O_III/H_Beta)
      Log(O_III/H_Beta)_Sigma
      Log(N_II/H_Alpha)
      Log(N_II/H_Alpha)_Sigma
      O_III/H_Beta
      H_Beta
      H_Beta_Sigma
      O_III
    
  
  
    
      21226
      -0.37
      0.14
      -0.31
      0.03
      0.43
      4.05
      1.86
      1.74



In [15]:

    
graph.BPT(test_obj)
test_obj[lim_columns]









    












    Out[15]:







  
    
      
      BPT:P(AGN)
      O_III_Health
      N_II_Health
      H_Beta_Health
      H_Alpha_Health
    
  
  
    
      21226
      0.76
      Limit
      Healthy
      Healthy
      Healthy

Creating model in pymc3



In [16]:

    
x_test_data = [test_obj.loc[21226,'Log(N_II/H_Alpha)'],test_obj.loc[21226,'Log(N_II/H_Alpha)_Sigma'],
               test_obj.loc[21226,'N_II'],test_obj.loc[21226,'N_II_Sigma'],
               test_obj.loc[21226,'H_Alpha'],test_obj.loc[21226,'H_Alpha_Sigma']
              ]
y_test_data = [test_obj.loc[21226,'Log(O_III/H_Beta)'],test_obj.loc[21226,'Log(O_III/H_Beta)_Sigma'],
               test_obj.loc[21226,'O_III'],test_obj.loc[21226,'O_III_Sigma'],
               test_obj.loc[21226,'H_Beta'],test_obj.loc[21226,'H_Beta_Sigma']
              ]



In [17]:

    
@sampled
def oiii_limit(xData,yData):
    x,x_sigma,x_a,x_a_sigma,x_b,x_b_sigma = xData
    y,y_sigma,y_a,y_a_sigma,y_b,y_b_sigma = yData
    
    model_x_a = pm.Normal('N_II', mu=x_a, sd = x_a_sigma)
    model_x_b = pm.Normal('H_Alpha', mu=x_b, sd = x_b_sigma)
    model_y_b = pm.Normal('H_Beta', mu=y_b, sd = y_b_sigma)
    #the upper bound of 0.94 comes from solving ratio OIII/HBeta
    model_y_a_limit = pm.Uniform('OIII_Uniform',lower =0 ,upper=0.9417)
    model_y_a_gauss = pm.Normal('OIII_Normal',mu = 0.9417, sd = y_a_sigma)
    y_use_limit = pm.Binomial('OIII_swap', n=1,p=0.5)
    model_y_a = pm.Normal('OIII', mu = tt.switch(y_use_limit,model_y_a_limit,model_y_a_gauss), sd =1)
    
    model_NII_HAlpha = pm.Deterministic('NII/HAlpha', model_x_a/model_x_b)
    model_OIII_HBeta = pm.Deterministic('OIII/HBeta', model_y_a/model_y_b)

Run model



In [38]:

    
with oiii_limit(xData = x_test_data,yData =y_test_data):
    base_trace = pm.sample(10000)









    



Assigned NUTS to N_II
Assigned NUTS to H_Alpha
Assigned NUTS to H_Beta
Assigned NUTS to OIII_Uniform_interval__
Assigned NUTS to OIII_Normal
Assigned Metropolis to OIII_swap
Assigned NUTS to OIII
100%|██████████| 10500/10500 [00:52<00:00, 201.58it/s]

OIII in depth



In [37]:

    
pm.traceplot(base_trace, varnames = ['OIII_Normal','OIII_Uniform'])









    Out[37]:





array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1c1cf67780>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1c1cef31d0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1c1cedf6d8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1c1ceca0b8>]], dtype=object)



In [26]:

    
def plot_jointkde(x_pdf,y_pdf):
    def StarForming(x): return (1.3 + 0.61 / (x - 0.04))
    g = sns.jointplot(np.log10(x_pdf),np.log10(y_pdf),kind = 'kde',size=10)
    xSpan = np.linspace(-0.6, -0.1, num=200)
    g.ax_joint.plot(xSpan, StarForming(xSpan), 'k--',-0.31,-0.37, 'bo')
    g.set_axis_labels('Log(N_II/H_Aplha)','Log(O[III]/H_Beta)')

Our 2D KDE

With our original point plotted as a blue dot



In [29]:

    
plot_jointkde(x_pdf= base_trace['NII/HAlpha'],y_pdf =base_trace['OIII/HBeta'])



In [34]:

    
pm.summary(base_trace)









    Out[34]:







  
    
      
      mean
      sd
      mc_error
      hpd_2.5
      hpd_97.5
      n_eff
      Rhat
    
  
  
    
      N_II
      12.03
      1.57
      0.01
      8.95
      15.04
      20,000.00
      1.00
    
    
      H_Alpha
      24.53
      1.71
      0.01
      21.17
      27.81
      20,000.00
      1.00
    
    
      H_Beta
      4.05
      1.87
      0.01
      0.28
      7.63
      20,000.00
      1.00
    
    
      OIII_Normal
      0.92
      1.87
      0.01
      -2.67
      4.68
      18,819.00
      1.00
    
    
      OIII_swap
      0.49
      0.50
      0.01
      0.00
      1.00
      2,593.00
      1.00
    
    
      OIII
      0.70
      1.70
      0.01
      -2.50
      4.56
      13,655.00
      1.00
    
    
      OIII_Uniform
      0.47
      0.27
      0.00
      0.01
      0.90
      20,000.00
      1.00
    
    
      NII/HAlpha
      0.49
      0.07
      0.00
      0.35
      0.64
      20,000.00
      1.00
    
    
      OIII/HBeta
      0.21
      4.26
      0.03
      -1.24
      1.82
      20,000.00
      1.00

Point location Result

Old:

$$\log_{10}(\frac{\text{O}_{\text{III}}}{\text{H}_{\beta}}) = -0.37$$

$$\log_{10}(\frac{\text{N}_{\text{II}}}{\text{H}_{\alpha}}) = -0.31 $$

New:

$$\log_{10}(\frac{\text{O}_{\text{III}}}{\text{H}_{\beta}}) = -0.67$$

$$\log_{10}(\frac{\text{N}_{\text{II}}}{\text{H}_{\alpha}}) = -0.31$$



In [51]:

    
def test_percentAGN(x,x_sig,y,y_sig):
    def AGNLine(x): return 1.19 + 0.61 / (x - 0.47)
    def SFLine(x): return 1.3 + 0.61 / (x - 0.04)
    lineCorrectionAGN = 0.2
    lineCorrectionSF = -0.2

    x0_array = np.float64(np.ones((100,)) * -x)
    y0_array = np.float64(np.ones((100,)) * -y)
    xLineSpace = np.linspace((x - x_sig), (x + x_sig), 100)
    yLineSpace = np.linspace((y - y_sig), (y + y_sig),
                             100)[:, None]
    ellipse = ((xLineSpace - x) / x_sig)**2 + ((yLineSpace - y) / y_sig)**2 <= 1
    boolean_AGN_line = np.subtract(
        yLineSpace, AGNLine(xLineSpace)) > 0  # AGN fit line
    boolean_AGN_Correction = (np.subtract(xLineSpace, np.ones(
        (100, 100)) * lineCorrectionAGN) > 0)  # Fixes problem with the fit line
    # combines fit line and correction into single boolean array
    boolean_AGN_Mask = np.where(np.logical_or(
        boolean_AGN_line, boolean_AGN_Correction), True, False)
    
    # plt.imshow(boolean_AGN_Mask, origin = 'lower') #for debugging

    # Starforming Fit Lines
    boolean_SF_line = np.subtract(yLineSpace, SFLine(
        xLineSpace)) < 0  # Starforming fit line
    boolean_SF_Correction = (np.subtract(xLineSpace, np.ones(
        (100, 100)) * lineCorrectionSF) < 0)  # Fixes problem with the fit line
    # combines fit line and correction into single boolean array
    boolean_SF_Mask = np.where(np.logical_and(
        boolean_SF_line, boolean_SF_Correction), True, False)

    # Neither mask indicates areas that are neither AGN or SF
    boolean_Neither_Mask = np.where(np.logical_or(
        boolean_SF_Mask, boolean_AGN_Mask), False, True)
    # plt.imshow(boolean_SF_Mask, origin = 'lower') #for debugging

    # Creat probability distribution function based on array index locations
    xWeight = (np.exp(-(np.add(xLineSpace, x0_array)**2) *
                      (2 * x_sig**2)) * (2 * math.pi * x_sig**2))**2
    yWeight = (np.exp(-(np.add(yLineSpace, y0_array)**2) *
                      (2 * y_sig**2)) * (2 * math.pi * y_sig**2))**2
    weight_Matrix = np.sqrt(xWeight + yWeight)

    # AGN total weight is the sum of weight matrix elements that are true in both the ellipse boolean mask and agn boolean mask
    AGN_Weight = np.sum(
        weight_Matrix[np.logical_and(ellipse, boolean_AGN_Mask)])
    SF_Weight = np.sum(weight_Matrix[np.logical_and(ellipse, boolean_SF_Mask)])
    Neither_Weight = np.sum(
        weight_Matrix[np.logical_and(ellipse, boolean_Neither_Mask)])
    # total weight of whole ellipse
    Total_Weight = np.sum(weight_Matrix[ellipse])

    # Percenbt AGN or starforming
    AGN_Percent = AGN_Weight / Total_Weight
    SF_Percent = SF_Weight / Total_Weight
    Neither_Percent = Neither_Weight / Total_Weight
    AGN_Percent += Neither_Percent
    
    return AGN_Percent

P(AGN) Result

Old:



In [60]:

    
test_obj[['BPT:P(AGN)']]

New:



In [56]:

    
test_percentAGN(x=-0.31,x_sig=0.03,y=-0.67,y_sig=np.log10(4.26))









    Out[56]:





0.27108921259943347

What next

Figure out 2D limit case
Automate this process
Filter out simple cases



In [ ]:

	mean	sd	mc_error	hpd_2.5	hpd_97.5	n_eff	Rhat
N_II	12.03	1.57	0.01	8.95	15.04	20,000.00	1.00
H_Alpha	24.53	1.71	0.01	21.17	27.81	20,000.00	1.00
H_Beta	4.05	1.87	0.01	0.28	7.63	20,000.00	1.00
OIII_Normal	0.92	1.87	0.01	-2.67	4.68	18,819.00	1.00
OIII_swap	0.49	0.50	0.01	0.00	1.00	2,593.00	1.00
OIII	0.70	1.70	0.01	-2.50	4.56	13,655.00	1.00
OIII_Uniform	0.47	0.27	0.00	0.01	0.90	20,000.00	1.00
NII/HAlpha	0.49	0.07	0.00	0.35	0.64	20,000.00	1.00
OIII/HBeta	0.21	4.26	0.03	-1.24	1.82	20,000.00	1.00