In [1]:
import utils
import core
import os
import math
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib as plt
from utils.graphing import graph
import theano
from sampled import sampled
import pymc3 as pm
import theano.tensor as tt

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:,.2f}'.format
plt.rcParams['figure.figsize'] = (16,12)

In [4]:
filepath = os.getcwd() + "/Data/07to1Redshift.csv"
test = core.FilteredFrame(filepath, percent_agn = ['BPT'], composite = False, include_upperlimit = True)
df = test.dataframe


Rows were removed based on health of: ['O_III', 'H_Beta', 'N_II', 'H_Alpha']
Data Frame info:
___________________
From file: /Users/christopherstewart/Documents/Research/Trump/AGNClassification/Data/07to1Redshift.csv 

Data Frame Size: (42239, 99)
___________________

In [6]:
columns = ['BPT:P(AGN)',
           'O_III','O_III_Sigma','O_III_Health',
           'N_II','N_II_Sigma','N_II_Health',
           'H_Beta','H_Beta_Sigma','H_Beta_Health',
           'H_Alpha','H_Alpha_Sigma','H_Alpha_Health',
          ]
lim_columns = ['BPT:P(AGN)','O_III_Health','N_II_Health', 'H_Beta_Health', 'H_Alpha_Health']
d_limits = {}
dd_limits = {}
oiii_ratio_lowLim = df['O_III_Health'].isin(['Limit']) & df['H_Beta_Health'].isin(['Healthy']) 
oiii_ratio_upLim = df['O_III_Health'].isin(['Healthy']) & df['H_Beta_Health'].isin(['Limit'])
nii_ratio_lowLim = df['N_II_Health'].isin(['Limit']) & df['H_Alpha_Health'].isin(['Healthy'])
nii_ratio_upLim = df['N_II_Health'].isin(['Healthy']) & df['H_Alpha_Health'].isin(['Limit'])
oiii_healthy = df['O_III_Health'].isin(['Healthy']) & df['H_Beta_Health'].isin(['Healthy']) 
nii_healthy = df['N_II_Health'].isin(['Healthy']) & df['H_Alpha_Health'].isin(['Healthy']) 

oiii_limit = df[(nii_healthy & oiii_ratio_lowLim)]
nii_limit = df[(oiii_healthy & nii_ratio_lowLim)]
beta_limit = df[(nii_healthy & oiii_ratio_upLim)]
alpha_limit = df[(oiii_healthy & nii_ratio_upLim)]
d_limits['OIII'] = oiii_limit.shape[0]
d_limits['NII'] = nii_limit.shape[0]
d_limits['HBeta'] = beta_limit.shape[0]
d_limits['HAlpha'] = alpha_limit.shape[0]

o_n_limit = df[oiii_ratio_lowLim & nii_ratio_lowLim]
o_a_limit = df[oiii_ratio_lowLim & nii_ratio_upLim]
b_n_limit = df[oiii_ratio_upLim & nii_ratio_lowLim]
b_a_limit = df[oiii_ratio_upLim & nii_ratio_upLim]
dd_limits['OIII_NII'] = o_n_limit.shape[0]
dd_limits['OIII_Alpha'] = o_a_limit.shape[0]
dd_limits['Beta_NII'] = b_n_limit.shape[0]
dd_limits['Beta_Alpha'] = b_a_limit.shape[0]
print(d_limits)
print(dd_limits)


{'OIII': 1152, 'NII': 468, 'HBeta': 1360, 'HAlpha': 76}
{'OIII_NII': 199, 'OIII_Alpha': 29, 'Beta_NII': 204, 'Beta_Alpha': 49}

Bayesian Inference of Censored SDSS data


By: Christopher Stewart

Overview


  1. Introduce the problem
    • Data Selection
    • Issues with traditional approach
    • Introduce the BPT
    • Limit Cases
  2. Handling Limits
    • Creating the model
    • Example Case
    • Results
  3. What next

Intro - Data Selection


Primary SQL quarry

  • Selected from redshift range of 0.07 to 0.1
  • error above zero for following lines
    • $\text{N}_{[\text{II}]}$
    • $\text{O}_{[\text{III}]}$
    • $\text{He}_{[\text{II}]}$
    • $\text{H}_\alpha$
    • $\text{H}_\beta$

Total Objects: 42239

Sigma Filtering

Objects were removed from the survey following the condition:

$0 <\text{flux_err} < 1\times e^{-15}$

Intro - Issues with traditional approach



In [7]:
graph.BPT(df)


No limits in any ratio


In [8]:
no_limits = df[(oiii_healthy & nii_healthy)]
graph.BPT(no_limits)


How many object we lose if we only take well defined objects


In [9]:
print('Number of Objects\n-------------\nAll Data: {}\nWell Def: {}'.format(df.shape[0],no_limits.shape[0]))
print('Loss Percentage: {0:.3f}%'.format((1-no_limits.shape[0]/df.shape[0])*100))


Number of Objects
-------------
All Data: 42239
Well Def: 38191
Loss Percentage: 9.584%

Lose around 9.6% of our objects

Introduce the BPT


Each object on the BPT hase two ratios which are made up of two fluxes with individual sigma values.

For example a single object has the following data:

$\log_{10}(\frac{\text{O}_{\text{III}}}{\text{H}_{\beta}}) \rightarrow \left[ \begin{array}{ll} \text{O}_{\text{flux}} & \text{O}_{{\sigma}} \\ \text{H}\beta_{\text{flux}} & \text{H}\beta_{{\sigma}} \\ \end{array} \right]$

$\log_{10}(\frac{\text{N}_{\text{II}}}{\text{H}_{\alpha}}) \rightarrow \left[ \begin{array}{ll} \text{N}_{\text{flux}} & \text{N}_{{\sigma}} \\ \text{H}\alpha_{\text{flux}} & \text{H}\alpha_{{\sigma}} \\ \end{array} \right]$


In [10]:
print('OIII upLim {}\nOIII lowLim: {}\nNII upLim: {}\nNII lowLim: {}'.format(df[oiii_ratio_upLim & nii_healthy].shape[0],
                                                                            df[oiii_ratio_lowLim & nii_healthy].shape[0],
                                                                            df[nii_ratio_upLim & oiii_healthy].shape[0],
                                                                            df[nii_ratio_lowLim & oiii_healthy].shape[0]))


OIII upLim 1360
OIII lowLim: 1152
NII upLim: 76
NII lowLim: 468

Limit cases


Limit:

$\text{flux} < \text{flux error}$

Lower Limit:

Let $f = \frac{A}{B} \text{ where } B < \sigma_B \text{, so } f = \frac{A}{\sigma_B} \text{ is an lower limit.}$

Upper Limit:

Let $f = \frac{A}{B} \text{ where } A < \sigma_A \text{, so } f = \frac{\sigma_A}{B} \text{ is an upper limit.}$

1D Limit

Is a limit in only one ratio. For example a lower limit in x would have the following shape,

2D Limit

Is a limit in each ratio. For example a lower limit in x and y would have the following shape,

Number of Limit cases


1D Limits:


In [11]:
print(d_limits)


{'OIII': 1152, 'NII': 468, 'HBeta': 1360, 'HAlpha': 76}

2D Limits:


In [12]:
print(dd_limits)


{'OIII_NII': 199, 'OIII_Alpha': 29, 'Beta_NII': 204, 'Beta_Alpha': 49}

Handling Limits


Creating the model

Example Case:

Object has a limit in OIII so this is an upper limit in our y


In [54]:
test_obj = df.iloc[[21226]]
test_obj[['Log(O_III/H_Beta)','Log(O_III/H_Beta)_Sigma','Log(N_II/H_Alpha)','Log(N_II/H_Alpha)_Sigma', 'O_III/H_Beta', 'H_Beta', 'H_Beta_Sigma', 'O_III']]


Out[54]:
Log(O_III/H_Beta) Log(O_III/H_Beta)_Sigma Log(N_II/H_Alpha) Log(N_II/H_Alpha)_Sigma O_III/H_Beta H_Beta H_Beta_Sigma O_III
21226 -0.37 0.14 -0.31 0.03 0.43 4.05 1.86 1.74

In [15]:
graph.BPT(test_obj)
test_obj[lim_columns]


Out[15]:
BPT:P(AGN) O_III_Health N_II_Health H_Beta_Health H_Alpha_Health
21226 0.76 Limit Healthy Healthy Healthy

Creating model in pymc3


In [16]:
x_test_data = [test_obj.loc[21226,'Log(N_II/H_Alpha)'],test_obj.loc[21226,'Log(N_II/H_Alpha)_Sigma'],
               test_obj.loc[21226,'N_II'],test_obj.loc[21226,'N_II_Sigma'],
               test_obj.loc[21226,'H_Alpha'],test_obj.loc[21226,'H_Alpha_Sigma']
              ]
y_test_data = [test_obj.loc[21226,'Log(O_III/H_Beta)'],test_obj.loc[21226,'Log(O_III/H_Beta)_Sigma'],
               test_obj.loc[21226,'O_III'],test_obj.loc[21226,'O_III_Sigma'],
               test_obj.loc[21226,'H_Beta'],test_obj.loc[21226,'H_Beta_Sigma']
              ]

In [17]:
@sampled
def oiii_limit(xData,yData):
    x,x_sigma,x_a,x_a_sigma,x_b,x_b_sigma = xData
    y,y_sigma,y_a,y_a_sigma,y_b,y_b_sigma = yData
    
    model_x_a = pm.Normal('N_II', mu=x_a, sd = x_a_sigma)
    model_x_b = pm.Normal('H_Alpha', mu=x_b, sd = x_b_sigma)
    model_y_b = pm.Normal('H_Beta', mu=y_b, sd = y_b_sigma)
    #the upper bound of 0.94 comes from solving ratio OIII/HBeta
    model_y_a_limit = pm.Uniform('OIII_Uniform',lower =0 ,upper=0.9417)
    model_y_a_gauss = pm.Normal('OIII_Normal',mu = 0.9417, sd = y_a_sigma)
    y_use_limit = pm.Binomial('OIII_swap', n=1,p=0.5)
    model_y_a = pm.Normal('OIII', mu = tt.switch(y_use_limit,model_y_a_limit,model_y_a_gauss), sd =1)
    
    model_NII_HAlpha = pm.Deterministic('NII/HAlpha', model_x_a/model_x_b)
    model_OIII_HBeta = pm.Deterministic('OIII/HBeta', model_y_a/model_y_b)

Run model


In [38]:
with oiii_limit(xData = x_test_data,yData =y_test_data):
    base_trace = pm.sample(10000)


Assigned NUTS to N_II
Assigned NUTS to H_Alpha
Assigned NUTS to H_Beta
Assigned NUTS to OIII_Uniform_interval__
Assigned NUTS to OIII_Normal
Assigned Metropolis to OIII_swap
Assigned NUTS to OIII
100%|██████████| 10500/10500 [00:52<00:00, 201.58it/s]

OIII in depth


In [37]:
pm.traceplot(base_trace, varnames = ['OIII_Normal','OIII_Uniform'])


Out[37]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1c1cf67780>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1c1cef31d0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1c1cedf6d8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1c1ceca0b8>]], dtype=object)

In [26]:
def plot_jointkde(x_pdf,y_pdf):
    def StarForming(x): return (1.3 + 0.61 / (x - 0.04))
    g = sns.jointplot(np.log10(x_pdf),np.log10(y_pdf),kind = 'kde',size=10)
    xSpan = np.linspace(-0.6, -0.1, num=200)
    g.ax_joint.plot(xSpan, StarForming(xSpan), 'k--',-0.31,-0.37, 'bo')
    g.set_axis_labels('Log(N_II/H_Aplha)','Log(O[III]/H_Beta)')

Our 2D KDE

With our original point plotted as a blue dot


In [29]:
plot_jointkde(x_pdf= base_trace['NII/HAlpha'],y_pdf =base_trace['OIII/HBeta'])



In [34]:
pm.summary(base_trace)


Out[34]:
mean sd mc_error hpd_2.5 hpd_97.5 n_eff Rhat
N_II 12.03 1.57 0.01 8.95 15.04 20,000.00 1.00
H_Alpha 24.53 1.71 0.01 21.17 27.81 20,000.00 1.00
H_Beta 4.05 1.87 0.01 0.28 7.63 20,000.00 1.00
OIII_Normal 0.92 1.87 0.01 -2.67 4.68 18,819.00 1.00
OIII_swap 0.49 0.50 0.01 0.00 1.00 2,593.00 1.00
OIII 0.70 1.70 0.01 -2.50 4.56 13,655.00 1.00
OIII_Uniform 0.47 0.27 0.00 0.01 0.90 20,000.00 1.00
NII/HAlpha 0.49 0.07 0.00 0.35 0.64 20,000.00 1.00
OIII/HBeta 0.21 4.26 0.03 -1.24 1.82 20,000.00 1.00

Point location Result


Old:

$$\log_{10}(\frac{\text{O}_{\text{III}}}{\text{H}_{\beta}}) = -0.37$$


$$\log_{10}(\frac{\text{N}_{\text{II}}}{\text{H}_{\alpha}}) = -0.31 $$

New:

$$\log_{10}(\frac{\text{O}_{\text{III}}}{\text{H}_{\beta}}) = -0.67$$

$$\log_{10}(\frac{\text{N}_{\text{II}}}{\text{H}_{\alpha}}) = -0.31$$


In [51]:
def test_percentAGN(x,x_sig,y,y_sig):
    def AGNLine(x): return 1.19 + 0.61 / (x - 0.47)
    def SFLine(x): return 1.3 + 0.61 / (x - 0.04)
    lineCorrectionAGN = 0.2
    lineCorrectionSF = -0.2

    x0_array = np.float64(np.ones((100,)) * -x)
    y0_array = np.float64(np.ones((100,)) * -y)
    xLineSpace = np.linspace((x - x_sig), (x + x_sig), 100)
    yLineSpace = np.linspace((y - y_sig), (y + y_sig),
                             100)[:, None]
    ellipse = ((xLineSpace - x) / x_sig)**2 + ((yLineSpace - y) / y_sig)**2 <= 1
    boolean_AGN_line = np.subtract(
        yLineSpace, AGNLine(xLineSpace)) > 0  # AGN fit line
    boolean_AGN_Correction = (np.subtract(xLineSpace, np.ones(
        (100, 100)) * lineCorrectionAGN) > 0)  # Fixes problem with the fit line
    # combines fit line and correction into single boolean array
    boolean_AGN_Mask = np.where(np.logical_or(
        boolean_AGN_line, boolean_AGN_Correction), True, False)
    
    # plt.imshow(boolean_AGN_Mask, origin = 'lower') #for debugging

    # Starforming Fit Lines
    boolean_SF_line = np.subtract(yLineSpace, SFLine(
        xLineSpace)) < 0  # Starforming fit line
    boolean_SF_Correction = (np.subtract(xLineSpace, np.ones(
        (100, 100)) * lineCorrectionSF) < 0)  # Fixes problem with the fit line
    # combines fit line and correction into single boolean array
    boolean_SF_Mask = np.where(np.logical_and(
        boolean_SF_line, boolean_SF_Correction), True, False)

    # Neither mask indicates areas that are neither AGN or SF
    boolean_Neither_Mask = np.where(np.logical_or(
        boolean_SF_Mask, boolean_AGN_Mask), False, True)
    # plt.imshow(boolean_SF_Mask, origin = 'lower') #for debugging

    # Creat probability distribution function based on array index locations
    xWeight = (np.exp(-(np.add(xLineSpace, x0_array)**2) *
                      (2 * x_sig**2)) * (2 * math.pi * x_sig**2))**2
    yWeight = (np.exp(-(np.add(yLineSpace, y0_array)**2) *
                      (2 * y_sig**2)) * (2 * math.pi * y_sig**2))**2
    weight_Matrix = np.sqrt(xWeight + yWeight)

    # AGN total weight is the sum of weight matrix elements that are true in both the ellipse boolean mask and agn boolean mask
    AGN_Weight = np.sum(
        weight_Matrix[np.logical_and(ellipse, boolean_AGN_Mask)])
    SF_Weight = np.sum(weight_Matrix[np.logical_and(ellipse, boolean_SF_Mask)])
    Neither_Weight = np.sum(
        weight_Matrix[np.logical_and(ellipse, boolean_Neither_Mask)])
    # total weight of whole ellipse
    Total_Weight = np.sum(weight_Matrix[ellipse])

    # Percenbt AGN or starforming
    AGN_Percent = AGN_Weight / Total_Weight
    SF_Percent = SF_Weight / Total_Weight
    Neither_Percent = Neither_Weight / Total_Weight
    AGN_Percent += Neither_Percent
    
    return AGN_Percent

P(AGN) Result


Old:


In [60]:
test_obj[['BPT:P(AGN)']]


Out[60]:
BPT:P(AGN)
21226 0.76

New:


In [56]:
test_percentAGN(x=-0.31,x_sig=0.03,y=-0.67,y_sig=np.log10(4.26))


Out[56]:
0.27108921259943347

What next


  • Figure out 2D limit case
  • Automate this process
  • Filter out simple cases

In [ ]: