In [1]:
%load_ext watermark
%watermark -u -d -v -p numpy,scipy,pandas,sklearn,mlxtend
In [2]:
%load_ext autoreload
%autoreload 2
In [3]:
from __future__ import division, print_function
import os
import sys
from numbers import Number
import numpy as np
import pandas as pd
import healpy as hp
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
import seaborn as sns
import dask
from dask import delayed, compute
from dask.diagnostics import ProgressBar
import dask.array as da
import pyunfold
import comptools as comp
import sky_anisotropy as sa
color_dict = comp.color_dict
sns.set_context(context='paper', font_scale=1.5)
%matplotlib inline
In [4]:
# Analysis configuration: detector season and number of composition groups
# (2 groups -> light / heavy split used throughout the notebook).
config = 'IC86.2012'
num_groups = 2
comp_list = comp.get_comp_list(num_groups=num_groups)
analysis_bins = comp.get_bins(config=config, num_groups=num_groups)
energybins = comp.get_energybins(config)
num_ebins = len(energybins.log_energy_midpoints)
# Directory holding unfolding inputs/outputs for this detector configuration.
unfolding_dir = os.path.join(comp.paths.comp_data_dir,
                             config,
                             'unfolding')
In [5]:
# Feature columns (and their plot labels) used by both trained models below.
feature_list, feature_labels = comp.get_training_features()
In [6]:
# Energy regression model; name suggests a linear-regression pipeline
# (an 'RF' alternative is available — see commented-out name).
energy_pipeline_name = 'linearregression'
# energy_pipeline_name = 'RF'
energy_pipeline = comp.load_trained_model('{}_energy_{}'.format(energy_pipeline_name, config))
In [7]:
# Composition classifier (xgboost variant; an SGD variant also exists).
# pipeline_str = 'SGD_comp_{}_{}-groups'.format(config, num_groups)
pipeline_str = 'xgboost_comp_{}_{}-groups'.format(config, num_groups)
comp_pipeline = comp.load_trained_model(pipeline_str)
In [8]:
# print('Loading data into memory...')
# df_data = comp.load_data(config=config,
# energy_reco=True,
# log_energy_min=6.1,
# log_energy_max=8.0,
# columns=feature_list + ['lap_ra', 'lap_dec'],
# n_jobs=20,
# verbose=True)
In [9]:
# df_data.to_hdf('data_dataframe.hdf', 'dataframe', format='table')
In [10]:
# Load the cached DataFrame written by the commented-out cells above
# (original source: comp.load_data with lap_ra/lap_dec columns).
df_data = pd.read_hdf('data_dataframe.hdf', 'dataframe', mode='r')
In [11]:
# Reconstruct log10(energy) for every event with the trained regressor,
# then add the linear-energy column as well.
print('Running energy reconstruction...')
df_data['reco_log_energy'] = energy_pipeline.predict(df_data[feature_list].values)
df_data['reco_energy'] = 10**df_data['reco_log_energy']
In [12]:
import dask.array as da  # NOTE(review): duplicate — already imported in the setup cell
# Wrap the feature matrix as a chunked dask array so predictions can be
# computed lazily / in parallel.
X_data = da.from_array(df_data[feature_list].values, chunks=int(1e4))
X_data
Out[12]:
In [13]:
from dask_ml.wrappers import ParallelPostFit
from dask.diagnostics import ProgressBar  # NOTE(review): duplicate of the top-level import
# Lazy (delayed) predictions; nothing is computed until .compute() is called.
pred_comp_target = ParallelPostFit(comp_pipeline).predict(X_data)
reco_log_energy = ParallelPostFit(energy_pipeline).predict(X_data)  # NOTE(review): never computed/used below
In [14]:
import warnings
# Materialize the composition predictions, silencing library warnings and
# showing a progress bar while the dask graph executes.
with ProgressBar() as _, warnings.catch_warnings() as _:
    warnings.simplefilter("ignore")  # Want to ignore xgboost DeprecationWarning
    print('Running composition classifications...')
    # df_data['pred_comp_target'] = pred_comp_target.compute(scheduler='threads',
    #                                                        num_workers=20)
    df_data['pred_comp_target'] = pred_comp_target.compute(scheduler='sync', num_workers=1)
In [15]:
import sky_anisotropy as sa  # NOTE(review): duplicate — already imported in the setup cell
from scipy import stats
from scipy.special import erfcinv
In [16]:
# HEALPix resolution for all sky maps: npix = 12 * nside**2 = 49152 pixels.
nside = 64
npix = hp.nside2npix(nside)
In [19]:
def unfolding_func(counts, composition='total', energy_range=(6.4, 7.8),
                   log_energy_midpoints=None):
    """Sum binned counts over composition groups and restrict to the
    unfolding energy range.

    Used as the ``hist_func`` passed to ``sa.on_off_chi_squared``.

    Parameters
    ----------
    counts : array_like, shape (n_energy_bins, n_comp_groups)
        Binned counts per energy bin (rows) and composition group (columns).
    composition : str, optional
        Unused; kept for interface compatibility (default: 'total').
    energy_range : tuple of float, optional
        Inclusive (low, high) range of log10-energy bin midpoints to keep
        (default: (6.4, 7.8), the range used for unfolding).
    log_energy_midpoints : array_like or None, optional
        log10-energy bin midpoints, one per row of ``counts``. If None,
        falls back to the notebook-level ``energybins.log_energy_midpoints``.

    Returns
    -------
    counts_total : numpy.ndarray
        Counts summed over composition groups, restricted to energy_range.
    counts_err_total : numpy.ndarray
        Poisson errors added in quadrature over composition groups,
        restricted to energy_range.
    """
    if log_energy_midpoints is None:
        # Backward-compatible fallback to the notebook-global binning.
        log_energy_midpoints = energybins.log_energy_midpoints
    counts = np.asarray(counts)
    # Poisson error per (energy, composition) bin.
    counts_err = np.sqrt(counts)
    # Sum over the composition axis; errors add in quadrature.
    counts_total = counts.sum(axis=1)
    counts_err_total = np.sqrt(np.sum(counts_err**2, axis=1))
    # Keep only the energy bins inside the unfolding range.
    # (Fix: removed the unused `original_shape` local from the original.)
    low, high = energy_range
    energy_mask = np.logical_and(log_energy_midpoints >= low,
                                 log_energy_midpoints <= high)
    return counts_total[energy_mask], counts_err_total[energy_mask]
In [20]:
import pyprind
In [22]:
# Scrambled-sky trials: for each trial the events' RA values are shuffled
# (declinations kept fixed), the on/off chi-squared sky scan is repeated,
# and the maximum local significance is recorded — a null distribution for
# the maximum significance.
# NOTE(review): this cell duplicates the logic of `calculate_local_sigma`
# defined below; it also mutates df_data['pix'] each iteration, so its
# result depends on in-order execution.
sig_max = []
n_samples = 20
for idx in pyprind.prog_bar(range(n_samples)):
    # Per-trial seed for the RA scramble.
    random_state = idx
    ra = df_data.loc[:, 'lap_ra'].sample(frac=1.0, random_state=random_state).values
    dec = df_data.loc[:, 'lap_dec'].values
    theta, phi = comp.equatorial_to_healpy(ra, dec)
    pix_array = hp.ang2pix(nside, theta, phi)
    df_data['pix'] = pix_array
    # Coordinates of every map pixel (used to restrict the scan region);
    # note `theta`, `phi`, `ra`, `dec` are re-bound from event-level to
    # map-pixel-level values here.
    theta, phi = hp.pix2ang(nside, list(range(npix)))
    ra, dec = sa.healpy_to_equatorial(theta, phi)
    # Scan configuration (loop-invariant; could be hoisted out of the loop).
    dec_max_deg = -65
    size = np.deg2rad(5)
    on_region = 'disc'
    off_region = 'theta_band'
    with_unfolding = False
    # Keep pixels below dec_max_deg; for a theta-band off-region also drop
    # pixels whose on-disc would extend past the south pole.
    has_data = dec < np.deg2rad(dec_max_deg)
    if off_region == 'theta_band':
        has_data = has_data & (dec > np.deg2rad(-90) + size)
    pix_disc = np.arange(npix)[has_data]
    data = df_data.loc[:, ['reco_log_energy', 'pred_comp_target']].values
    pix = df_data.loc[:, 'pix'].values
    binned_skymaps = sa.binned_skymaps(data=data,
                                       pix=pix,
                                       bins=analysis_bins,
                                       nside=nside)
    with dask.config.set(scheduler='sync', num_workers=1):
        results = sa.on_off_chi_squared(binned_maps=binned_skymaps,
                                        pix_center=pix_disc,
                                        on_region=on_region,
                                        size=size,
                                        off_region=off_region,
                                        nside=nside,
                                        hist_func=unfolding_func,
                                        )
    # Convert chi-squared values to one-sided Gaussian significances.
    dof = 13  # NOTE(review): hard-coded; presumably tied to the number of kept energy bins — confirm
    pval = stats.chi2.sf(results['chi2'], dof)
    sig = erfcinv(2 * pval) * np.sqrt(2)
    sig_max.append(sig.max())
In [23]:
sig_max
Out[23]:
In [24]:
# Convert the per-trial maxima (Python list) to a numpy array.
sig_max = np.array(sig_max)
In [25]:
sig_max
Out[25]:
In [77]:
# Output directory for results.
# NOTE(review): `with_unfolding` leaks out of the trial-loop cell above —
# this cell fails on a fresh kernel unless that loop cell has already run.
outdir = os.path.join(os.getcwd(),
                      'results',
                      'unfolded' if with_unfolding else 'pre-unfolding')
print('outdir = {}'.format(outdir))
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [33]:
def calculate_local_sigma(df, nside=64, bins=None, random_state=None,
                          dec_max_deg=-65, size=np.deg2rad(5),
                          on_region='disc', off_region='theta_band', dof=13):
    """Maximum local significance of an on/off chi-squared sky scan.

    Parameters
    ----------
    df : pandas.DataFrame
        Event-level data; must contain 'lap_ra', 'lap_dec',
        'reco_log_energy', and 'pred_comp_target' columns.
    nside : int, optional
        HEALPix nside resolution parameter (default: 64).
    bins : optional
        Analysis bins forwarded to ``sa.binned_skymaps``. Required.
    random_state : int or None, optional
        If given, RA values are shuffled with this seed (scrambled-sky
        trial); if None, the unscrambled RA values are used.
    dec_max_deg : float, optional
        Only map pixels with declination below this value (degrees) are
        scanned (default: -65).
    size : float, optional
        On-region disc radius in radians (default: 5 degrees).
    on_region, off_region : str, optional
        Region definitions forwarded to ``sa.on_off_chi_squared``
        (defaults: 'disc' / 'theta_band').
    dof : int, optional
        Degrees of freedom for the chi-squared p-value (default: 13).

    Returns
    -------
    float
        Maximum one-sided Gaussian significance over all scanned pixels.

    Raises
    ------
    ValueError
        If ``bins`` is None.
    """
    if bins is None:
        raise ValueError('bins cannot be None')
    # Optionally scramble RA (declinations kept fixed) for a null trial.
    if random_state is None:
        ra = df.loc[:, 'lap_ra'].values
    else:
        ra = df.loc[:, 'lap_ra'].sample(frac=1.0, random_state=random_state).values
    dec = df.loc[:, 'lap_dec'].values
    theta, phi = comp.equatorial_to_healpy(ra, dec)
    # Event -> pixel assignment. Fix: kept local instead of writing a 'pix'
    # column into the caller's DataFrame (the column was only read back here).
    pix = hp.ang2pix(nside, theta, phi)

    # Coordinates of every map pixel, used to restrict the scan region.
    npix = hp.nside2npix(nside)
    map_pix = np.arange(npix)
    map_theta, map_phi = hp.pix2ang(nside, map_pix)
    _, map_dec = sa.healpy_to_equatorial(map_theta, map_phi)

    # Keep pixels below dec_max_deg; for a theta-band off-region also drop
    # pixels whose on-disc would extend past the south pole.
    has_data = map_dec < np.deg2rad(dec_max_deg)
    if off_region == 'theta_band':
        has_data = has_data & (map_dec > np.deg2rad(-90) + size)
    pix_disc = map_pix[has_data]

    data = df.loc[:, ['reco_log_energy', 'pred_comp_target']].values
    binned_skymaps = sa.binned_skymaps(data=data,
                                       pix=pix,
                                       bins=bins,
                                       nside=nside)
    with dask.config.set(scheduler='sync', num_workers=1):
        results = sa.on_off_chi_squared(binned_maps=binned_skymaps,
                                        pix_center=pix_disc,
                                        on_region=on_region,
                                        size=size,
                                        off_region=off_region,
                                        nside=nside,
                                        hist_func=unfolding_func,
                                        )
    # Convert chi-squared values to one-sided Gaussian significances.
    pval = stats.chi2.sf(results['chi2'], dof)
    sig = erfcinv(2 * pval) * np.sqrt(2)
    return sig.max()
In [34]:
# Single trial with a fixed seed (sanity check of the helper function).
calculate_local_sigma(df=df_data, nside=nside, bins=analysis_bins, random_state=3)
Out[34]:
In [31]:
# Scrambled trials via the helper; mirrors the earlier explicit loop.
sig_max = np.array([calculate_local_sigma(df=df_data, nside=nside, bins=analysis_bins, random_state=i) for i in range(2)])
In [32]:
sig_max
Out[32]:
In [ ]: