In [1]:
    
# Record environment/package versions for reproducibility
%load_ext watermark
%watermark -u -d -v -p numpy,scipy,pandas,sklearn,mlxtend
    
    
In [2]:
    
# Auto-reload edited local modules (e.g. comptools, sky_anisotropy) without a kernel restart
%load_ext autoreload
%autoreload 2
    
In [3]:
    
from __future__ import division, print_function
import os
import sys
from numbers import Number
import numpy as np
import pandas as pd
import healpy as hp
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
import seaborn as sns
import dask
from dask import delayed, compute
from dask.diagnostics import ProgressBar
import dask.array as da
import pyunfold
import comptools as comp
import sky_anisotropy as sa
color_dict = comp.color_dict
sns.set_context(context='paper', font_scale=1.5)
%matplotlib inline
    
In [4]:
    
# Analysis configuration: detector season and number of composition groups.
# These globals are used throughout the rest of the notebook.
config = 'IC86.2012'
num_groups = 2
comp_list = comp.get_comp_list(num_groups=num_groups)
analysis_bins = comp.get_bins(config=config, num_groups=num_groups)
energybins = comp.get_energybins(config)
num_ebins = len(energybins.log_energy_midpoints)
# Directory with unfolding inputs/outputs for this season
unfolding_dir  = os.path.join(comp.paths.comp_data_dir,
                              config,
                              'unfolding')
    
In [5]:
    
# Model input column names (and, presumably, their display labels) — see comptools
feature_list, feature_labels = comp.get_training_features()
    
In [6]:
    
# Load the pre-trained energy-reconstruction regressor for this season.
# The 'RF' (random forest) variant is available but left disabled.
energy_pipeline_name = 'linearregression'
# energy_pipeline_name = 'RF'
energy_pipeline = comp.load_trained_model('{}_energy_{}'.format(energy_pipeline_name, config))
    
In [7]:
    
# Load the pre-trained composition classifier (xgboost variant; SGD alternative disabled)
# pipeline_str = 'SGD_comp_{}_{}-groups'.format(config, num_groups)
pipeline_str = 'xgboost_comp_{}_{}-groups'.format(config, num_groups)
comp_pipeline = comp.load_trained_model(pipeline_str)
    
In [8]:
    
# print('Loading data into memory...')
# df_data = comp.load_data(config=config,
#                          energy_reco=True,
#                          log_energy_min=6.1,
#                          log_energy_max=8.0,
#                          columns=feature_list + ['lap_ra', 'lap_dec'],
#                          n_jobs=20,
#                          verbose=True)
    
In [9]:
    
# df_data.to_hdf('data_dataframe.hdf', 'dataframe', format='table')
    
In [10]:
    
# Load the cached event-level DataFrame (written once by the commented-out cells above)
df_data = pd.read_hdf('data_dataframe.hdf', 'dataframe', mode='r')
    
In [11]:
    
# Predict log10(energy) for every event with the pre-trained energy regressor
print('Running energy reconstruction...')
df_data['reco_log_energy'] = energy_pipeline.predict(df_data[feature_list].values)
df_data['reco_energy'] = 10**df_data['reco_log_energy']
    
    
In [12]:
    
import dask.array as da  # NOTE(review): duplicate of the import in the setup cell above
# Wrap the feature matrix as a chunked dask array (10k rows per chunk) for parallel prediction
X_data = da.from_array(df_data[feature_list].values, chunks=int(1e4))
X_data
    
    Out[12]:
In [13]:
    
from dask_ml.wrappers import ParallelPostFit
from dask.diagnostics import ProgressBar  # NOTE(review): already imported in the setup cell
# Wrap the fitted sklearn pipelines so .predict is evaluated lazily, chunk-by-chunk,
# on the dask array; nothing is computed until .compute() is called.
pred_comp_target = ParallelPostFit(comp_pipeline).predict(X_data)
# NOTE(review): reco_log_energy is never computed or used later in the visible
# notebook — energy was already predicted eagerly above. Consider removing.
reco_log_energy = ParallelPostFit(energy_pipeline).predict(X_data)
    
In [14]:
    
import warnings
# ProgressBar shows dask progress; catch_warnings scopes the filter change to this block
with ProgressBar() as _, warnings.catch_warnings() as _:
    warnings.simplefilter("ignore") # Want to ignore xgboost DeprecationWarning
    print('Running composition classifications...')
    # Threaded alternative kept for reference; sync scheduler used for reproducible runs
#     df_data['pred_comp_target'] = pred_comp_target.compute(scheduler='threads', 
#                                                            num_workers=20)
    df_data['pred_comp_target'] = pred_comp_target.compute(scheduler='sync', num_workers=1)
    
    
In [15]:
    
import sky_anisotropy as sa
from scipy import stats
from scipy.special import erfcinv
    
In [16]:
    
nside = 64  # HEALPix resolution parameter for all sky maps below
npix = hp.nside2npix(nside)  # total number of sky pixels at this resolution
    
In [19]:
    
def unfolding_func(counts, composition='total', log_energy_min=6.4,
                   log_energy_max=7.8, log_energy_midpoints=None):
    """Collapse a binned (energy, composition) counts histogram to total
    counts per energy bin, restricted to the unfolding energy range.

    Used as ``hist_func`` for ``sa.on_off_chi_squared``.

    Parameters
    ----------
    counts : numpy.ndarray
        2D histogram of shape (num_energy_bins, num_composition_groups).
    composition : str, optional
        Unused; kept for backward compatibility with existing callers
        (default: 'total').
    log_energy_min, log_energy_max : float, optional
        Inclusive log-energy range of bin midpoints to keep
        (defaults 6.4 and 7.8, the range used in this analysis).
    log_energy_midpoints : array-like, optional
        Energy-bin midpoints the range cut is applied to. If None (default),
        the notebook-global ``energybins.log_energy_midpoints`` is used.

    Returns
    -------
    counts_total, counts_err_total : numpy.ndarray
        Per-energy-bin totals and their Poisson errors inside the range.
    """
    counts = np.asarray(counts)
    # Sum over composition groups. Adding the per-group Poisson errors in
    # quadrature simplifies exactly: sqrt(sum(sqrt(n)**2)) == sqrt(sum(n)).
    counts_total = counts.sum(axis=1)
    counts_err_total = np.sqrt(counts_total)
    if log_energy_midpoints is None:
        # Fall back to the notebook-global binning (original behavior)
        log_energy_midpoints = energybins.log_energy_midpoints
    energy_mask = np.logical_and(log_energy_midpoints >= log_energy_min,
                                 log_energy_midpoints <= log_energy_max)
    return counts_total[energy_mask], counts_err_total[energy_mask]
    
In [20]:
    
import pyprind
    
In [22]:
    
# Scrambled-sky trials: shuffle RA against fixed dec to destroy any real
# anisotropy, then record the maximum local significance per trial.
# NOTE(review): this loop is duplicated by calculate_local_sigma() defined in a
# later cell — consider replacing the loop body with a call to that function.
sig_max = []
n_samples = 20  # number of scrambled trials
for idx in pyprind.prog_bar(range(n_samples)):
    random_state = idx  # one deterministic seed per trial
    # Shuffle RA values (dec unchanged) to build an isotropized sky
    ra = df_data.loc[:, 'lap_ra'].sample(frac=1.0, random_state=random_state).values
    dec = df_data.loc[:, 'lap_dec'].values
    theta, phi = comp.equatorial_to_healpy(ra, dec)
    pix_array = hp.ang2pix(nside, theta, phi)
    # NOTE: mutates the shared df_data in place each iteration
    df_data['pix'] = pix_array
    # Pixel-center coordinates for every pixel in the map
    theta, phi = hp.pix2ang(nside, list(range(npix)))
    ra, dec = sa.healpy_to_equatorial(theta, phi)
    dec_max_deg = -65        # only consider pixels below this declination
    size = np.deg2rad(5)     # on-region disc radius [rad]
    on_region = 'disc'
    off_region = 'theta_band'
    with_unfolding = False   # NOTE(review): unused inside the loop; read by a later cell
    has_data = dec < np.deg2rad(dec_max_deg)
    if off_region == 'theta_band':
        # Exclude pixels too close to the pole for a full disc inside the band
        has_data = has_data & (dec > np.deg2rad(-90) + size)
    pix_disc = np.arange(npix)[has_data]
    data = df_data.loc[:, ['reco_log_energy', 'pred_comp_target']].values
    pix = df_data.loc[:, 'pix'].values
    # Per-pixel 2D histograms of (energy, composition)
    binned_skymaps = sa.binned_skymaps(data=data,
                                       pix=pix,
                                       bins=analysis_bins,
                                       nside=nside)
    # Synchronous scheduler for deterministic, debuggable execution
    with dask.config.set(scheduler='sync', num_workers=1):   
        results = sa.on_off_chi_squared(binned_maps=binned_skymaps,
                                        pix_center=pix_disc,
                                        on_region=on_region,
                                        size=size,
                                        off_region=off_region,
                                        nside=nside,
                                        hist_func=unfolding_func,
                                        )
    
    # Convert chi-squared to a one-sided Gaussian significance
    dof = 13  # degrees of freedom — presumably tied to the number of energy bins kept; verify
    pval = stats.chi2.sf(results['chi2'], dof)
    sig = erfcinv(2 * pval) * np.sqrt(2)
    sig_max.append(sig.max())
    
    
In [23]:
    
sig_max  # per-trial maximum significances from the scrambling loop above
    
    Out[23]:
In [24]:
    
sig_max = np.array(sig_max)  # list -> ndarray for vectorized downstream use
    
In [25]:
    
sig_max  # display the converted array
    
    Out[25]:
In [77]:
    
# NOTE(review): `with_unfolding` leaks from inside the scrambling-loop cell above —
# this cell fails on a fresh kernel unless that loop ran first. Consider defining
# it in the configuration cell instead.
outdir = os.path.join(os.getcwd(),
                      'results',
                      'unfolded' if with_unfolding else 'pre-unfolding')
print('outdir = {}'.format(outdir))
    
    
In [ ]:
    
    
In [ ]:
    
    
In [ ]:
    
    
In [ ]:
    
    
In [ ]:
    
    
In [33]:
    
def calculate_local_sigma(df, nside=64, bins=None, random_state=None,
                          dec_max_deg=-65, size=np.deg2rad(5),
                          on_region='disc', off_region='theta_band', dof=13):
    """Compute the maximum local significance over the analyzed sky region.

    Bins events into HEALPix sky maps, runs the on/off chi-squared comparison
    for each candidate pixel, and returns the largest resulting significance.

    Parameters
    ----------
    df : pandas.DataFrame
        Event-level data with 'lap_ra', 'lap_dec', 'reco_log_energy' and
        'pred_comp_target' columns. NOTE: a 'pix' column is added to ``df``
        in place (kept for backward compatibility).
    nside : int, optional
        HEALPix resolution parameter (default: 64).
    bins : optional
        Analysis binning passed to ``sa.binned_skymaps``; required.
    random_state : int or None, optional
        If given, RA values are shuffled with this seed (scrambled-sky trial);
        if None (default), the true sky is used.
    dec_max_deg : float, optional
        Only pixels with declination below this value [deg] are tested
        (default: -65).
    size : float, optional
        On-region disc radius in radians (default: 5 degrees).
    on_region, off_region : str, optional
        Region definitions forwarded to ``sa.on_off_chi_squared``
        (defaults: 'disc' and 'theta_band').
    dof : int, optional
        Chi-squared degrees of freedom for the p-value (default: 13 —
        presumably tied to the number of unfolded energy bins; verify).

    Returns
    -------
    float
        Maximum significance over all tested pixels.

    Raises
    ------
    ValueError
        If ``bins`` is None.
    """
    if bins is None:
        raise ValueError('bins cannot be None')

    # Scrambled-RA trial: shuffling RA against fixed dec destroys any real
    # anisotropy, giving an isotropic reference sky.
    if random_state is None:
        ra = df.loc[:, 'lap_ra'].values
    else:
        ra = df.loc[:, 'lap_ra'].sample(frac=1.0, random_state=random_state).values
    dec = df.loc[:, 'lap_dec'].values
    theta, phi = comp.equatorial_to_healpy(ra, dec)
    pix_array = hp.ang2pix(nside, theta, phi)
    # NOTE: mutates the caller's DataFrame (adds/overwrites 'pix'); preserved
    # from the original implementation for backward compatibility.
    df['pix'] = pix_array

    # Candidate on-region centers: pixels below the declination cut, and (for
    # theta-band off-regions) far enough from the pole to fit a full disc.
    npix = hp.nside2npix(nside)
    map_pix = np.arange(npix)
    theta, phi = hp.pix2ang(nside, map_pix)
    ra_pix, dec_pix = sa.healpy_to_equatorial(theta, phi)
    has_data = dec_pix < np.deg2rad(dec_max_deg)
    if off_region == 'theta_band':
        has_data = has_data & (dec_pix > np.deg2rad(-90) + size)
    pix_disc = map_pix[has_data]

    # Per-pixel 2D histograms of (energy, composition)
    data = df.loc[:, ['reco_log_energy', 'pred_comp_target']].values
    binned_skymaps = sa.binned_skymaps(data=data,
                                       pix=pix_array,
                                       bins=bins,
                                       nside=nside)
    # Synchronous scheduler for deterministic, debuggable execution
    with dask.config.set(scheduler='sync', num_workers=1):
        results = sa.on_off_chi_squared(binned_maps=binned_skymaps,
                                        pix_center=pix_disc,
                                        on_region=on_region,
                                        size=size,
                                        off_region=off_region,
                                        nside=nside,
                                        hist_func=unfolding_func,
                                        )

    # Convert chi-squared to a one-sided Gaussian significance
    pval = stats.chi2.sf(results['chi2'], dof)
    sig = erfcinv(2 * pval) * np.sqrt(2)

    return sig.max()
    
In [34]:
    
# Single scrambled trial (seed 3) to sanity-check the function
calculate_local_sigma(df=df_data, nside=nside, bins=analysis_bins, random_state=3)
    
    Out[34]:
In [31]:
    
# NOTE(review): overwrites the 20-sample `sig_max` computed by the loop cell above
sig_max = np.array([calculate_local_sigma(df=df_data, nside=nside, bins=analysis_bins, random_state=i) for i in range(2)])
    
In [32]:
    
sig_max  # maximum significances from the two scrambled trials
    
    Out[32]:
In [ ]: