Cosmic-ray flux vs. sky position


In [1]:
%load_ext watermark
%watermark -u -d -v -p numpy,scipy,pandas,sklearn,mlxtend


last updated: 2018-08-22 

CPython 2.7.13
IPython 5.7.0

numpy 1.14.5
scipy 1.1.0
pandas 0.23.1
sklearn 0.19.1
mlxtend 0.12.0

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from __future__ import division, print_function
import os
import sys
from numbers import Number
import numpy as np
import pandas as pd
import healpy as hp
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
import seaborn as sns
import dask
from dask import delayed, compute
from dask.diagnostics import ProgressBar
import dask.array as da
import pyunfold

import comptools as comp
import sky_anisotropy as sa

color_dict = comp.color_dict

sns.set_context(context='paper', font_scale=1.5)

%matplotlib inline

In [4]:
config = 'IC86.2012'
num_groups = 2

comp_list = comp.get_comp_list(num_groups=num_groups)
analysis_bins = comp.get_bins(config=config, num_groups=num_groups)
energybins = comp.get_energybins(config)
num_ebins = len(energybins.log_energy_midpoints)

unfolding_dir = os.path.join(comp.paths.comp_data_dir,
                             config,
                             'unfolding')

In [5]:
feature_list, feature_labels = comp.get_training_features()

In [6]:
energy_pipeline_name = 'linearregression'
# energy_pipeline_name = 'RF'
energy_pipeline = comp.load_trained_model('{}_energy_{}'.format(energy_pipeline_name, config))

In [7]:
# pipeline_str = 'SGD_comp_{}_{}-groups'.format(config, num_groups)
pipeline_str = 'xgboost_comp_{}_{}-groups'.format(config, num_groups)
comp_pipeline = comp.load_trained_model(pipeline_str)

In [8]:
# print('Loading data into memory...')
# df_data = comp.load_data(config=config,
#                          energy_reco=True,
#                          log_energy_min=6.1,
#                          log_energy_max=8.0,
#                          columns=feature_list + ['lap_ra', 'lap_dec'],
#                          n_jobs=20,
#                          verbose=True)

In [9]:
# df_data.to_hdf('data_dataframe.hdf', 'dataframe', format='table')

In [10]:
df_data = pd.read_hdf('data_dataframe.hdf', 'dataframe', mode='r')

In [11]:
print('Running energy reconstruction...')
df_data['reco_log_energy'] = energy_pipeline.predict(df_data[feature_list].values)
df_data['reco_energy'] = 10**df_data['reco_log_energy']


Running energy reconstruction...

In [12]:
import dask.array as da

X_data = da.from_array(df_data[feature_list].values, chunks=int(1e4))
X_data


Out[12]:
dask.array<array, shape=(6960257, 3), dtype=float64, chunksize=(10000, 3)>

In [13]:
from dask_ml.wrappers import ParallelPostFit
from dask.diagnostics import ProgressBar
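# ParallelPostFit wraps the already-fitted scikit-learn pipelines so that
# predict() evaluates lazily, chunk by chunk, over the dask array built above.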

pred_comp_target = ParallelPostFit(comp_pipeline).predict(X_data)
reco_log_energy = ParallelPostFit(energy_pipeline).predict(X_data)

In [14]:
import warnings

with ProgressBar() as _, warnings.catch_warnings() as _:
    warnings.simplefilter("ignore")  # Want to ignore the xgboost DeprecationWarning
    print('Running composition classifications...')
#     df_data['pred_comp_target'] = pred_comp_target.compute(scheduler='threads', 
#                                                            num_workers=20)
    df_data['pred_comp_target'] = pred_comp_target.compute(scheduler='sync', num_workers=1)


Running composition classifications...
[########################################] | 100% Completed | 28.3s
[########################################] | 100% Completed |  0.1s

Cosmic-ray flux vs. sky position

Sample on/off regions


In [15]:
import sky_anisotropy as sa
from scipy import stats
from scipy.special import erfcinv

In [16]:
nside = 64
npix = hp.nside2npix(nside)
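# With nside = 64 this gives npix = 12 * nside**2 = 49152 pixels,
# each covering roughly 0.84 square degrees.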

Spectrum anisotropy skymap


In [19]:
def unfolding_func(counts, composition='total'):
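    # hist_func passed to sa.on_off_chi_squared below: sum the binned
    # (energy, composition) counts over the composition axis and return the
    # totals and their Poisson errors restricted to 6.4 <= log10(E) <= 7.8.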
    original_shape = counts.shape
    counts_err = np.sqrt(counts)

    counts_total = counts.sum(axis=1)
    counts_err_total = np.sqrt(np.sum(counts_err**2, axis=1))

    unfolding_energy_range_mask = np.logical_and(energybins.log_energy_midpoints >= 6.4,
                                                 energybins.log_energy_midpoints <= 7.8)


    return counts_total[unfolding_energy_range_mask], counts_err_total[unfolding_energy_range_mask]
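
As a quick interface check, unfolding_func can be exercised on a toy histogram. This is only an illustrative sketch, assuming the counts array passed in has shape (energy bin, composition group) with its energy axis matching energybins.log_energy_midpoints:

toy_counts = np.random.poisson(100, size=(num_ebins, num_groups))
toy_total, toy_err = unfolding_func(toy_counts)
print(toy_total.shape, toy_err.shape)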

In [20]:
import pyprind

In [22]:
sig_max = []
n_samples = 20
for idx in pyprind.prog_bar(range(n_samples)):
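    # Scramble the sky: shuffle the right-ascension values while keeping the
    # declinations fixed, so that any genuine anisotropy is destroyed and the
    # maximum local significance reflects background fluctuations only.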
    random_state = idx
    ra = df_data.loc[:, 'lap_ra'].sample(frac=1.0, random_state=random_state).values
    dec = df_data.loc[:, 'lap_dec'].values

    theta, phi = comp.equatorial_to_healpy(ra, dec)

    pix_array = hp.ang2pix(nside, theta, phi)
    df_data['pix'] = pix_array

    theta, phi = hp.pix2ang(nside, list(range(npix)))
    ra, dec = sa.healpy_to_equatorial(theta, phi)
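    # Candidate on-region centers: HEALPix pixels with dec < -65 deg. Each
    # on-region is a 5 deg disc; with off_region='theta_band' the off counts
    # presumably come from the rest of the declination band, so pixels within
    # 5 deg of the pole are excluded as disc centers.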

    dec_max_deg = -65
    size = np.deg2rad(5)
    on_region = 'disc'
    off_region = 'theta_band'
    with_unfolding = False

    has_data = dec < np.deg2rad(dec_max_deg)
    if off_region == 'theta_band':
        has_data = has_data & (dec > np.deg2rad(-90) + size)

    pix_disc = np.arange(npix)[has_data]

    data = df_data.loc[:, ['reco_log_energy', 'pred_comp_target']].values
    pix = df_data.loc[:, 'pix'].values

    binned_skymaps = sa.binned_skymaps(data=data,
                                       pix=pix,
                                       bins=analysis_bins,
                                       nside=nside)

    with dask.config.set(scheduler='sync', num_workers=1):   
        results = sa.on_off_chi_squared(binned_maps=binned_skymaps,
                                        pix_center=pix_disc,
                                        on_region=on_region,
                                        size=size,
                                        off_region=off_region,
                                        nside=nside,
                                        hist_func=unfolding_func,
                                        )
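    # Convert each on/off chi-squared value into a one-sided Gaussian
    # significance: p = P(chi2_dof > observed), sigma = sqrt(2) * erfcinv(2p).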
    
    dof = 13
    pval = stats.chi2.sf(results['chi2'], dof)
    sig = erfcinv(2 * pval) * np.sqrt(2)
    sig_max.append(sig.max())


0% [####################] 100% | ETA: 00:00:00
Total time elapsed: 00:06:53

In [23]:
sig_max


Out[23]:
[3.829216883459315,
 3.2139315434755837,
 4.795585355272075,
 3.575490052790196,
 4.112490934464494,
 3.6194334010442084,
 4.019609610687242,
 3.4147847006173735,
 4.1031648482459415,
 3.280313229534489,
 4.017651885414452,
 3.7290561287029416,
 3.630777053945339,
 4.5771151205854155,
 6.244553706498697,
 3.76678918664466,
 3.695129605106175,
 3.2147816760650576,
 3.769250686723711,
 3.7661486902705428]

In [24]:
sig_max = np.array(sig_max)

In [25]:
sig_max


Out[25]:
array([3.82921688, 3.21393154, 4.79558536, 3.57549005, 4.11249093,
       3.6194334 , 4.01960961, 3.4147847 , 4.10316485, 3.28031323,
       4.01765189, 3.72905613, 3.63077705, 4.57711512, 6.24455371,
       3.76678919, 3.69512961, 3.21478168, 3.76925069, 3.76614869])
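
These scrambled-sky maxima form a reference (background-only) distribution. A hypothetical observed hotspot significance could be compared against it with a simple empirical, trial-corrected p-value; sig_obs below is a placeholder, not a measurement from this notebook:

sig_obs = 4.0  # placeholder value for illustration only
p_post_trial = np.mean(sig_max >= sig_obs)
print('Fraction of scrambled skies with a larger maximum: {:.2f}'.format(p_post_trial))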

In [77]:
outdir = os.path.join(os.getcwd(),
                      'results',
                      'unfolded' if with_unfolding else 'pre-unfolding')
print('outdir = {}'.format(outdir))


outdir = /home/jbourbeau/cr-composition/spectrum-anisotropy/results/pre-unfolding

In [33]:
def calculate_local_sigma(df, nside=64, bins=None, random_state=None):
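    """Return the maximum local on/off significance for a single sky map.

    If random_state is None the measured right ascensions are used as-is;
    otherwise the RA column is shuffled with that seed to produce a
    scrambled, background-only sky (as in the loop above).
    """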
    
    if bins is None:
        raise ValueError('bins cannot be None')
        
    if random_state is None:
        ra = df.loc[:, 'lap_ra'].values
    else:
        ra = df.loc[:, 'lap_ra'].sample(frac=1.0, random_state=random_state).values
    dec = df.loc[:, 'lap_dec'].values

    theta, phi = comp.equatorial_to_healpy(ra, dec)
    pix_array = hp.ang2pix(nside, theta, phi)
    df['pix'] = pix_array
    
    npix = hp.nside2npix(nside)
    map_pix = np.arange(npix)
    theta, phi = hp.pix2ang(nside, map_pix)
    ra, dec = sa.healpy_to_equatorial(theta, phi)

    dec_max_deg = -65
    size = np.deg2rad(5)
    on_region = 'disc'
    off_region = 'theta_band'
    with_unfolding = False

    has_data = dec < np.deg2rad(dec_max_deg)
    if off_region == 'theta_band':
        has_data = has_data & (dec > np.deg2rad(-90) + size)

    pix_disc = map_pix[has_data]

    data = df.loc[:, ['reco_log_energy', 'pred_comp_target']].values
    pix = df.loc[:, 'pix'].values

    binned_skymaps = sa.binned_skymaps(data=data,
                                       pix=pix,
                                       bins=bins,
                                       nside=nside)

    with dask.config.set(scheduler='sync', num_workers=1):   
        results = sa.on_off_chi_squared(binned_maps=binned_skymaps,
                                        pix_center=pix_disc,
                                        on_region=on_region,
                                        size=size,
                                        off_region=off_region,
                                        nside=nside,
                                        hist_func=unfolding_func,
                                        )
    
    dof = 13
    pval = stats.chi2.sf(results['chi2'], dof)
    sig = erfcinv(2 * pval) * np.sqrt(2)
    
    return sig.max()

In [34]:
calculate_local_sigma(df=df_data, nside=nside, bins=analysis_bins, random_state=3)


Out[34]:
3.575490052790196

In [31]:
sig_max = np.array([calculate_local_sigma(df=df_data, nside=nside,
                                          bins=analysis_bins, random_state=i)
                    for i in range(2)])

In [32]:
sig_max


Out[32]:
array([3.82921688, 3.21393154])
