In [1]:
%load_ext autoreload
In [2]:
%autoreload 2
In [3]:
from __future__ import division
from collections import defaultdict
import os
import argparse
import numpy as np
import pandas as pd
# import xarray as xr
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import comptools as comp
color_dict = comp.get_color_dict()
%matplotlib inline
In [4]:
config = 'IC86.2012'
sims = comp.simfunctions.config_to_sim(config)
sigmoid = 'slant'
num_groups = 2
comp_list = comp.get_comp_list(num_groups=num_groups)
In [5]:
energybins = comp.get_energybins(config=config)
energy_comp_bins = comp.get_bins(config=config, num_groups=num_groups)
energy_comp_bins
Out[5]:
In [6]:
# Want to include energy bins for energies below the normal analysis energy
# range so we can get a better estimate of how the detector efficiencies turn on
low_energy_bins = np.arange(5.0, energybins.log_energy_min, 0.1)
# Combined bin edges: low-energy extension followed by the standard analysis bins
ebins = np.concatenate((low_energy_bins, energybins.log_energy_bins))
# Bin centers — used as x-values for the efficiency fits and plots below
ebin_midpoints = (ebins[1:] + ebins[:-1]) / 2
In [7]:
comp_bins = np.arange(num_groups + 1)
comp_bins
Out[7]:
In [8]:
bins = [ebins, comp_bins]
bins
Out[8]:
In [9]:
# Load the full simulation DataFrame. test_size=0 means no train/test split,
# and the None energy limits keep the low-energy turn-on region in the sample.
df_sim = comp.load_sim(config=config,
                       test_size=0,
                       log_energy_min=None,
                       log_energy_max=None,
                       verbose=True)
In [10]:
# Thrown areas are different for different energy bins
thrown_radii = comp.simfunctions.get_sim_thrown_radius(ebin_midpoints)
thrown_areas = np.pi * thrown_radii**2
# Maximum thrown area — used below to normalize per-bin thrown areas
thrown_areas_max = thrown_areas.max()
thrown_areas_max
Out[10]:
In [11]:
# 2D histogram of events that passed all cuts, binned in
# (true log energy, numeric composition-group target)
columns = ['MC_log_energy', 'comp_target_{}'.format(num_groups)]
passed_showers, _ = np.histogramdd(df_sim.loc[:, columns].values, bins=bins)
In [13]:
# coords = [ebin_midpoints, np.arange(num_groups)]
# dims = ['MC_log_energy', 'composition']
In [14]:
# passed_showers = xr.DataArray(passed_showers, coords=coords, dims=dims)
# passed_showers
In [12]:
def thrown_showers_per_ebin(sim_list, log_energy_bins=None, n_resamples=100):
    """Calculate the number of thrown showers in each energy bin

    Parameters
    ----------
    sim_list : int or array_like
        Simulation dataset number, or sequence of dataset numbers.
    log_energy_bins : array_like or None, optional
        Log energy bin edges to use (defaults to np.arange(5, 8.1, 0.1)).
    n_resamples : int, optional
        Number of showers thrown per simulation file (default: 100).
        Previously hard-coded; exposed as a parameter so other
        resampling configurations can be handled.

    Returns
    -------
    thrown_showers : np.ndarray
        Array containing the number of thrown showers in each energy bin.
    """
    if isinstance(sim_list, int):
        sim_list = [sim_list]
    # Each simulation file corresponds to one run; the run number determines
    # which generation energy bin that file's showers were thrown in.
    e_bins = []
    for sim in sim_list:
        file_iter = comp.simfunctions.get_level3_sim_files_iterator(sim)
        runs = (sim_file_to_run(f) for f in file_iter)
        for run in runs:
            e_bin = comp.simfunctions.run_to_energy_bin(run, sim)
            e_bins.append(e_bin)
    if log_energy_bins is None:
        log_energy_bins = np.arange(5, 8.1, 0.1)
    # Count files per energy bin, then scale by showers thrown per file
    vals, _ = np.histogram(e_bins, bins=log_energy_bins)
    thrown_showers = vals * n_resamples
    return thrown_showers
In [13]:
def sim_file_to_run(file):
    """Extracts run number from a simulation file path

    Parameters
    ----------
    file : str
        Simulation file path containing a ``Run<number>`` component.

    Returns
    -------
    run : int
        Run number for simulation file

    Raises
    ------
    ValueError
        If no ``Run<number>`` pattern is found in ``file``.

    Examples
    --------
    >>> file = '/data/ana/CosmicRay/IceTop_level3/sim/IC79/7241/Level3_IC79_7241_Run005347.i3.gz'
    >>> sim_file_to_run(file)
    5347
    """
    import re
    # The previous implementation sliced a fixed 6-character window after
    # 'Run' (file[idx+3:idx+9]); when 'Run' was absent, str.find() returned
    # -1 and the slice silently produced garbage (or a misleading ValueError
    # from int()). A regex makes the contract explicit and also supports run
    # numbers that are not exactly 6 digits wide.
    match = re.search(r'Run(\d+)', file)
    if match is None:
        raise ValueError('Could not extract a run number from {!r}'.format(file))
    run = int(match.group(1))
    return run
In [14]:
# Calculate efficiencies and effective areas for each composition group
thrown_showers = np.zeros_like(passed_showers)
for idx, composition in enumerate(comp_list):
    # 'comp_group_*' holds string composition labels, unlike the numeric
    # 'comp_target_*' column used for the passed-shower histogram above
    compositions = df_sim['comp_group_{}'.format(num_groups)]
    # Need list of simulation sets for composition to get number of thrown showers
    comp_mask = compositions == composition
    sim_list = df_sim.loc[comp_mask, 'sim'].unique()
    thrown_showers[:, idx] = thrown_showers_per_ebin(sim_list, log_energy_bins=ebins)
thrown_showers
Out[14]:
In [15]:
# thrown_showers = xr.DataArray(thrown_showers, coords=coords, dims=dims)
# thrown_showers
In [16]:
thrown_radius_factor = thrown_areas / thrown_areas_max
thrown_radius_factor
Out[16]:
In [17]:
# Per-bin efficiency = passed / thrown, with uncertainties propagated from
# Poisson (sqrt(N)) errors on both counts; nan_to_num handles empty bins
efficiency, efficiency_err = comp.ratio_error(num=passed_showers,
                                              num_err=np.sqrt(passed_showers),
                                              den=thrown_showers,
                                              den_err=np.sqrt(thrown_showers),
                                              nan_to_num=True)
# Scale by the relative thrown area so every energy bin refers to the same
# (maximum) thrown area; reshape(-1, 1) broadcasts over the composition axis
efficiency = efficiency * thrown_radius_factor.reshape(-1, 1)
efficiency_err = efficiency_err * thrown_radius_factor.reshape(-1, 1)
In [18]:
efficiency
Out[18]:
In [19]:
efficiency_err
Out[19]:
In [20]:
# Plot the per-bin efficiency (with error bars) for each composition group
fig, ax = plt.subplots()
for idx, composition in enumerate(comp_list):
    eff = efficiency[:, idx]
    eff_err = efficiency_err[:, idx]
    ax.errorbar(ebin_midpoints, eff, yerr=eff_err, color=color_dict[composition], label=composition)
# Raw string so the LaTeX backslashes are not treated as (invalid) escape sequences
ax.set_xlabel(r'$\mathrm{\log_{10}(E_{true}/GeV)}$')
# The quantity plotted is the dimensionless (area-scaled) efficiency,
# not an effective area in m^2 — previous label was misleading
ax.set_ylabel('Efficiency')
ax.grid()
# label= was set in errorbar() but never rendered without a legend
ax.legend()
plt.show()
In [21]:
def sigmoid_flat(log_energy, p0, p1, p2):
    """Sigmoid with a flat (constant) plateau, for efficiency vs. energy fits.

    Parameters
    ----------
    log_energy : numpy.ndarray
        Log energy values.
    p0 : float
        Plateau height (asymptotic value at high energy).
    p1 : float
        Steepness of the turn-on.
    p2 : float
        Offset controlling the turn-on location (midpoint at log_energy = p2 / p1).

    Returns
    -------
    numpy.ndarray
        Sigmoid evaluated at ``log_energy``.
    """
    return p0 / (1 + np.exp(-p1*log_energy + p2))
def sigmoid_slant(log_energy, p0, p1, p2, p3):
    '''Fit function for effective area vs. energy

    Sigmoid with a linearly slanted (rather than flat) plateau.

    Parameters
    ----------
    log_energy : numpy.ndarray
        Log energy values
    p0 : float
        Plateau intercept (plateau value at log_energy = 0).
    p1 : float
        Steepness of the turn-on.
    p2 : float
        Offset controlling the turn-on location (midpoint at log_energy = p2 / p1).
    p3 : float
        Slope of the plateau in log_energy.

    Returns
    -------
    numpy.ndarray
        Slanted sigmoid evaluated at ``log_energy``.
    '''
    return (p0 + p3*log_energy) / (1 + np.exp(-p1*log_energy + p2))
In [22]:
# Fit sigmoid function to efficiency vs. energy distribution
fit_func = sigmoid_flat if sigmoid == 'flat' else sigmoid_slant
# Initial parameter guesses for curve_fit: (p0, p1, p2) or (p0, p1, p2, p3)
p0 = [7e4, 8.0, 50.0] if sigmoid == 'flat' else [7e4, 8.5, 50.0, 800]
In [23]:
# Restrict the fit to bins above the 5.8 turn-on threshold and below the
# upper edge of the analysis energy range
energy_min_fit, energy_max_fit = 5.8, energybins.log_energy_max
fit_energy_range_mask = (ebin_midpoints < energy_max_fit) & (ebin_midpoints > energy_min_fit)
fit_energy_range_mask
Out[23]:
In [24]:
# # Fit sigmoid function to efficiency vs. energy distribution
# fit_func = sigmoid_flat if sigmoid == 'flat' else sigmoid_slant
# p0 = [7e4, 8.0, 50.0] if sigmoid == 'flat' else [7e4, 8.5, 50.0, 800]
# efficiencies_fit = {}
# energy_min_fit, energy_max_fit = 5.8, energybins.log_energy_max
# emidpoints_fitmask = np.logical_and(ebin_midpoints > energy_min_fit,
# ebin_midpoints < energy_max_fit)
# ebin_midpoints_fit = ebin_midpoints[emidpoints_fitmask]
# ebin_midpoints_fit
In [25]:
# Find best-fit sigmoid function for each composition group
efficiency_fit = np.empty_like(efficiency[fit_energy_range_mask])
for idx, composition in enumerate(comp_list):
    # Weighted least-squares fit of the sigmoid to the measured efficiencies
    popt, pcov = curve_fit(fit_func,
                           ebin_midpoints[fit_energy_range_mask],
                           efficiency[fit_energy_range_mask, idx],
                           sigma=efficiency_err[fit_energy_range_mask, idx],
                           p0=p0)
    eff_fit = fit_func(ebin_midpoints[fit_energy_range_mask], *popt)
    efficiency_fit[:, idx] = eff_fit
    # Goodness-of-fit diagnostics (computed but currently not printed)
    chi2 = np.nansum((efficiency[fit_energy_range_mask, idx] - eff_fit)**2 / (efficiency_err[fit_energy_range_mask, idx]) ** 2)
    ndof = len(eff_fit) - len(p0)
efficiency_fit
Out[25]:
In [50]:
# Perform many fits to random statistical fluctuations of the best fit efficiency
# This will be used to estimate the uncertainty in the best fit efficiency
# NOTE(review): np.random is not seeded, so these results vary between runs —
# consider seeding for reproducibility
eff_fit = np.empty_like(efficiency_fit)
eff_fit_err = np.empty_like(eff_fit)
n_samples = 1000
for comp_idx, composition in enumerate(comp_list):
    eff_fit_samples = []
    for _ in range(n_samples):
        # Get new random sample to fit: fluctuate the best-fit curve by the
        # measured per-bin efficiency uncertainties
        eff_sample = np.random.normal(efficiency_fit[:, comp_idx],
                                      efficiency_err[fit_energy_range_mask, comp_idx])
        # Fit with error bars
        popt, pcov = curve_fit(fit_func,
                               ebin_midpoints[fit_energy_range_mask],
                               eff_sample,
                               p0=p0,
                               sigma=efficiency_err[fit_energy_range_mask, comp_idx])
        eff_fit_samples.append(fit_func(ebin_midpoints[fit_energy_range_mask], *popt))
    # Mean and spread of the sampled fits give the fit value and its uncertainty
    eff_fit[:, comp_idx] = np.mean(eff_fit_samples, axis=0)
    eff_fit_err[:, comp_idx] = np.std(eff_fit_samples, axis=0)
In [51]:
eff_fit
Out[51]:
In [52]:
eff_fit_err
Out[52]:
In [53]:
# # Calculate median and error of efficiency fits
# eff_fit = pd.DataFrame()
# # eff_fit = xr.zeros_like(efficiency_fit)
# for composition in comp_list:
# # comp_mask = eff_fit.coords['composition'] == 1
# # fit_median, fit_err_low, fit_err_high = np.percentile(efficiencies_fit_samples[composition],
# # (50, 16, 84),
# # axis=0)
# fit_mean = np.mean(efficiencies_fit_samples[composition], axis=0)
# fit_err = np.std(efficiencies_fit_samples[composition], axis=0)
# # fit_err_low = np.abs(fit_err_low - fit_median)
# # fit_err_high = np.abs(fit_err_high - fit_median)
# eff_fit['eff_mean_{}'.format(composition)] = fit_mean
# eff_fit['eff_err_{}'.format(composition)] = fit_err
# # eff_fit['eff_median_{}'.format(composition)] = fit_median
# # eff_fit['eff_err_low_{}'.format(composition)] = fit_err_low
# # eff_fit['eff_err_high_{}'.format(composition)] = fit_err_high
# eff_fit['ebin_midpoints'] = ebin_midpoints[fit_energy_range_mask]
In [54]:
# Plot data points with error bars plus the 1-sigma band from the sampled fits
fig, ax = plt.subplots()
for idx, composition in enumerate(comp_list):
    eff = efficiency[:, idx]
    eff_err = efficiency_err[:, idx]
    # Measured per-bin efficiencies with statistical uncertainties
    ax.errorbar(ebin_midpoints, eff, yerr=eff_err, ls='None', color=color_dict[composition], label=composition)
    # +/- 1 sigma band of the bootstrapped sigmoid fits (fit energy range only)
    ax.fill_between(ebin_midpoints[fit_energy_range_mask],
                    eff_fit[:, idx] + eff_fit_err[:, idx],
                    eff_fit[:, idx] - eff_fit_err[:, idx],
                    alpha=0.7,
                    color=color_dict[composition])
# Raw string so the LaTeX backslashes are not treated as (invalid) escape sequences
ax.set_xlabel(r'$\mathrm{\log_{10}(E_{true}/GeV)}$')
# The quantity plotted is the dimensionless (area-scaled) efficiency,
# not an effective area in m^2 — previous label was misleading
ax.set_ylabel('Efficiency')
ax.grid()
# label= was set in errorbar() but never rendered without a legend
ax.legend()
plt.show()
In [55]:
# Mask selecting, within the fitted bins, only those inside the unfolding
# (analysis) energy range [log_energy_min, log_energy_max]
unfolding_energy_range_mask = np.logical_and(ebin_midpoints[fit_energy_range_mask] >= energybins.log_energy_min,
                                             ebin_midpoints[fit_energy_range_mask] <= energybins.log_energy_max)
unfolding_energy_range_mask
Out[55]:
In [56]:
eff_fit[unfolding_energy_range_mask]
Out[56]:
In [57]:
eff_fit_err[unfolding_energy_range_mask]
Out[57]:
In [58]:
eff_fit_err[unfolding_energy_range_mask].reshape(-1).shape
Out[58]:
In [59]:
# Save fitted efficiencies (and their uncertainties) for the unfolding range.
# NOTE(review): 'efficienies' looks like a typo for 'efficiencies' — left as-is
# because downstream code may load these exact filenames; confirm before renaming.
outfile = os.path.join(os.getcwd(), 'efficienies.npy')
comp.check_output_dir(outfile)
np.save(outfile, eff_fit[unfolding_energy_range_mask])
outfile = os.path.join(os.getcwd(), 'efficienies_err.npy')
comp.check_output_dir(outfile)
np.save(outfile, eff_fit_err[unfolding_energy_range_mask])
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [58]:
# .reset_index(drop=True).to_hdf(eff_outfile, 'dataframe')
In [61]:
import xarray as xr
In [88]:
# Exploratory: wrap the efficiency array in a labeled xarray DataArray.
# NOTE(review): the energy axis comes from the MC (true) energy histogram above
# but is labeled 'reco_log_energy' here — confirm which is intended.
efficiency_xr = xr.DataArray(efficiency,
                             coords=[ebin_midpoints, np.arange(num_groups)], dims=['reco_log_energy', 'composition'],
                             name='efficiency',
                             attrs={'units': 'log10(E/GeV)'})
efficiency_xr
Out[88]:
In [89]:
efficiency_xr.values
Out[89]:
In [90]:
efficiency_xr.name
Out[90]:
In [91]:
efficiency_xr[:10, :]
Out[91]:
In [94]:
efficiency_xr.coords['reco_log_energy'] > 5.8
Out[94]:
In [96]:
efficiency_xr[efficiency_xr.coords['reco_log_energy'] > 5.8]
Out[96]:
In [ ]:
In [ ]: