In [1]:
%load_ext watermark
%watermark -u -d -v -p numpy,matplotlib,scipy,pandas,sklearn,mlxtend


last updated: 2017-03-14 

CPython 2.7.10
IPython 5.3.0

numpy 1.12.0
matplotlib 2.0.0
scipy 0.15.1
pandas 0.19.2
sklearn 0.18.1
mlxtend 0.5.1

In [21]:
%matplotlib inline
from __future__ import division, print_function
from collections import defaultdict
import numpy as np
from scipy.sparse import block_diag
import pandas as pd
import matplotlib.pyplot as plt
import seaborn.apionly as sns
import json

from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, classification_report
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, KFold, StratifiedKFold

import composition as comp
import composition.analysis.plotting as plotting
    
color_dict = {'light': 'C0', 'heavy': 'C1', 'total': 'C2',
             'P': 'C0', 'He': 'C1', 'O': 'C3', 'Fe':'C4'}

Define analysis free parameters

[ back to top ]

Whether or not to train on 'light' and 'heavy' composition classes, or the individual compositions


In [4]:
comp_class = True
comp_list = ['light', 'heavy'] if comp_class else ['P', 'He', 'O', 'Fe']

Get composition classifier pipeline


In [5]:
pipeline_str = 'GBDT'
pipeline = comp.get_pipeline(pipeline_str)

Define energy binning for this analysis


In [6]:
energybins = comp.analysis.get_energybins()

Data preprocessing

[ back to top ]

  1. Load simulation/data dataframe and apply specified quality cuts
  2. Extract desired features from dataframe
  3. Get separate testing and training datasets
  4. Feature transformation

In [7]:
sim_train, sim_test = comp.preprocess_sim(comp_class=comp_class, return_energy=True)


sim quality cut event flow:
             IceTopQualityCuts:    1.0    1.0
         lap_InIce_containment:  0.776  0.776
              InIceQualityCuts:  0.786   0.75
                 num_hits_1_60:  0.999   0.75


Selecting the following features:
	$\cos(\theta_{\mathrm{Lap}})$
	$\log_{10}(S_{\mathrm{125}})$
	$\log_{10}$(dE/dX)
	
Number training events = 208926
Number testing events = 89540

In [8]:
# Compute the correlation matrix
df_sim = comp.load_dataframe(datatype='sim', config='IC79')
feature_list, feature_labels = comp.analysis.get_training_features()
corr = df_sim[feature_list].corr()
# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

fig, ax = plt.subplots()
sns.heatmap(corr, mask=mask, cmap='RdBu_r', center=0,
            square=True, xticklabels=feature_labels, yticklabels=feature_labels,
            linewidths=.5, cbar_kws={'label': 'Covariance'}, annot=True, ax=ax)
# outfile = args.outdir + '/feature_covariance.png'
# plt.savefig(outfile)
plt.show()


sim quality cut event flow:
             IceTopQualityCuts:    1.0    1.0
         lap_InIce_containment:  0.776  0.776
              InIceQualityCuts:  0.786   0.75
                 num_hits_1_60:  0.999   0.75


/home/jbourbeau/.local/lib/python2.7/site-packages/seaborn/matrix.py:143: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if xticklabels == []:
/home/jbourbeau/.local/lib/python2.7/site-packages/seaborn/matrix.py:151: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if yticklabels == []:
/home/jbourbeau/.local/lib/python2.7/site-packages/matplotlib/figure.py:1742: UserWarning: This figure includes Axes that are not compatible with tight_layout, so its results might be incorrect.
  warnings.warn("This figure includes Axes that are not "

In [9]:
data = comp.preprocess_data(comp_class=comp_class, return_energy=True)


data quality cut event flow:
             IceTopQualityCuts:    1.0    1.0
         lap_InIce_containment:    1.0    1.0
              InIceQualityCuts:    0.9    0.9
                 num_hits_1_60:    1.0    0.9


/home/jbourbeau/cr-composition/composition/dataframe_functions.py:137: RuntimeWarning: invalid value encountered in log10
  df['log_dEdX'] = np.log10(df['eloss_1500_standard'])
Selecting the following features:
	$\cos(\theta_{\mathrm{Lap}})$
	$\log_{10}(S_{\mathrm{125}})$
	$\log_{10}$(dE/dX)
	
Number testing events = 7212805

In [14]:
is_finite_mask = np.isfinite(data.X)
not_finite_mask = np.logical_not(is_finite_mask)
finite_data_mask = np.logical_not(np.any(not_finite_mask, axis=1))
data = data[finite_data_mask]

Run classifier over training and testing sets to get an idea of the degree of overfitting


In [19]:
clf_name = pipeline.named_steps['classifier'].__class__.__name__
print('=' * 30)
print(clf_name)
pipeline.fit(sim_train.X, sim_train.y)
train_pred = pipeline.predict(sim_train.X)
train_acc = accuracy_score(sim_train.y, train_pred)
print('Training accuracy = {:.2%}'.format(train_acc))
test_pred = pipeline.predict(sim_test.X)
test_acc = accuracy_score(sim_test.y, test_pred)
print('Testing accuracy = {:.2%}'.format(test_acc))
print('=' * 30)


==============================
XGBClassifier
Training accuracy = 77.79%
Testing accuracy = 77.58%
==============================

In [23]:
len(energybins.energy_midpoints)*2


Out[23]:
34

In [9]:
def get_frac_correct(train, test, pipeline, comp_list):
    
    assert isinstance(train, comp.analysis.DataSet), 'train dataset must be a DataSet'
    assert isinstance(test, comp.analysis.DataSet), 'test dataset must be a DataSet'
    assert train.y is not None, 'train must have true y values'
    assert test.y is not None, 'test must have true y values'
    
    pipeline.fit(train.X, train.y)
    test_predictions = pipeline.predict(test.X)
    correctly_identified_mask = (test_predictions == test.y)

    # Construct MC composition masks
    MC_comp_mask = {}
    for composition in comp_list:
        MC_comp_mask[composition] = (test.le.inverse_transform(test.y) == composition)
    MC_comp_mask['total'] = np.array([True]*len(test))
    
    reco_frac, reco_frac_err = {}, {}
    for composition in comp_list+['total']:
        comp_mask = MC_comp_mask[composition]
        # Get number of MC comp in each reco energy bin
        num_MC_energy = np.histogram(test.log_energy[comp_mask],
                                     bins=energybins.log_energy_bins)[0]
        num_MC_energy_err = np.sqrt(num_MC_energy)

        # Get number of correctly identified comp in each reco energy bin
        num_reco_energy = np.histogram(test.log_energy[comp_mask & correctly_identified_mask],
                                       bins=energybins.log_energy_bins)[0]
        num_reco_energy_err = np.sqrt(num_reco_energy)

        # Calculate correctly identified fractions as a function of MC energy
        reco_frac[composition], reco_frac_err[composition] = comp.ratio_error(
            num_reco_energy, num_reco_energy_err,
            num_MC_energy, num_MC_energy_err)
    
    return reco_frac, reco_frac_err

Calculate classifier generalization error via 10-fold CV


In [10]:
# Split training data into CV training and testing folds
kf = KFold(n_splits=10)
frac_correct_folds = defaultdict(list)
fold_num = 0
print('Fold ', end='')
for train_index, test_index in kf.split(sim_train.X):
    fold_num += 1
    print('{}...'.format(fold_num), end='')
    
    reco_frac, reco_frac_err = get_frac_correct(sim_train[train_index],
                                                sim_train[test_index],
                                                pipeline, comp_list)
        
    for composition in comp_list:
        frac_correct_folds[composition].append(reco_frac[composition])
    frac_correct_folds['total'].append(reco_frac['total'])
frac_correct_gen_err = {key: np.std(frac_correct_folds[key], axis=0) for key in frac_correct_folds}
scores = np.array(frac_correct_folds['total'])
score = scores.mean(axis=1).mean()
score_std = scores.mean(axis=1).std()


Fold 1...2...3...4...5...6...7...8...9...10...

In [11]:
reco_frac, reco_frac_stat_err = get_frac_correct(sim_train, sim_test, pipeline, comp_list)

In [12]:
# Plot fraction of events correctlt classified vs energy
fig, ax = plt.subplots()
for composition in comp_list + ['total']:
    err = np.sqrt(frac_correct_gen_err[composition]**2 + reco_frac_stat_err[composition]**2)
    plotting.plot_steps(energybins.log_energy_midpoints, reco_frac[composition], err, ax,
                        color_dict[composition], composition)
plt.xlabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel('Fraction correctly identified')
ax.set_ylim([0.0, 1.0])
ax.set_xlim([energybins.log_energy_min, energybins.log_energy_max])
ax.grid()
leg = plt.legend(loc='upper center', frameon=False,
          bbox_to_anchor=(0.5,  # horizontal
                          1.1),# vertical 
          ncol=len(comp_list)+1, fancybox=False)
# set the linewidth of each legend object
for legobj in leg.legendHandles:
    legobj.set_linewidth(3.0)

cv_str = 'Accuracy: {:0.2f}\% (+/- {:0.1f}\%)'.format(score*100, score_std*100)
ax.text(7.4, 0.2, cv_str,
        ha="center", va="center", size=10,
        bbox=dict(boxstyle='round', fc="white", ec="gray", lw=0.8))
plt.savefig('/home/jbourbeau/public_html/figures/frac-correct.png')
plt.show()


/home/jbourbeau/.local/lib/python2.7/site-packages/matplotlib/figure.py:1742: UserWarning: This figure includes Axes that are not compatible with tight_layout, so its results might be incorrect.
  warnings.warn("This figure includes Axes that are not "

Spectrum

[ back to top ]


In [10]:
def get_num_comp_reco(train, test, pipeline, comp_list):
    
    assert isinstance(train, comp.analysis.DataSet), 'train dataset must be a DataSet'
    assert isinstance(test, comp.analysis.DataSet), 'test dataset must be a DataSet'
    assert train.y is not None, 'train must have true y values'
    
    pipeline.fit(train.X, train.y)
    test_predictions = pipeline.predict(test.X)

    # Get number of correctly identified comp in each reco energy bin
    num_reco_energy, num_reco_energy_err = {}, {}
    for composition in comp_list:
        print('composition = {}'.format(composition))
        comp_mask = train.le.inverse_transform(test_predictions) == composition
        print('sum(comp_mask) = {}'.format(np.sum(comp_mask)))
        print(test.log_energy[comp_mask])
        num_reco_energy[composition] = np.histogram(test.log_energy[comp_mask],
            bins=energybins.log_energy_bins)[0]
        num_reco_energy_err[composition] = np.sqrt(num_reco_energy[composition])

    num_reco_energy['total'] = np.histogram(test.log_energy, bins=energybins.log_energy_bins)[0]
    num_reco_energy_err['total'] = np.sqrt(num_reco_energy['total'])
    
    return num_reco_energy, num_reco_energy_err

In [15]:
# Get number of events per energy bin
num_reco_energy, num_reco_energy_err = get_num_comp_reco(sim_train, data, pipeline, comp_list)
import pprint
pprint.pprint(num_reco_energy)
# Solid angle
solid_angle = 2*np.pi*(1-np.cos(np.arccos(0.8)))


composition = light
sum(comp_mask) = 3789833
[ 6.084369    6.02108021  6.19372399 ...,  6.34287938  5.58828732
  6.30276075]
composition = heavy
sum(comp_mask) = 3422971
[ 6.09036488  6.04362939  6.22640939 ...,  6.23452475  6.24879068
  5.93383339]
{'heavy': array([333373, 247362, 171104, 111037,  71072,  46386,  31339,  20151,
        12939,   8385,   5905,   3940,   2625,   1823,   1105,    760,
          457,    351,    181,     96,     59,     34,     21,     17,
            6,      8,      4]),
 'light': array([359917, 246830, 166187, 111450,  71146,  43352,  24185,  13594,
         7618,   4381,   2489,   1435,    915,    486,    300,    168,
           81,     45,     26,     11,      5,      6,      3,      3,
            1,      2,      1]),
 'total': array([693290, 494192, 337291, 222487, 142218,  89738,  55524,  33745,
        20557,  12766,   8394,   5375,   3540,   2309,   1405,    928,
          538,    396,    207,    107,     64,     40,     24,     20,
            7,     10,      5])}

Number of events observed


In [16]:
counts_observed = []
for light_counts, heavy_counts in zip(num_reco_energy['light'], num_reco_energy['heavy']):
    counts_observed.extend([light_counts, heavy_counts])
print('counts_observed = {}'.format(counts_observed))


counts_observed = [359917, 333373, 246830, 247362, 166187, 171104, 111450, 111037, 71146, 71072, 43352, 46386, 24185, 31339, 13594, 20151, 7618, 12939, 4381, 8385, 2489, 5905, 1435, 3940, 915, 2625, 486, 1823, 300, 1105, 168, 760, 81, 457, 45, 351, 26, 181, 11, 96, 5, 59, 6, 34, 3, 21, 3, 17, 1, 6, 2, 8, 1, 4]

Block diagonal response matrix and error


In [17]:
pipeline.fit(sim_train.X, sim_train.y)
test_predictions = pipeline.predict(sim_test.X)
true_comp = sim_train.le.inverse_transform(sim_test.y)
pred_comp = sim_train.le.inverse_transform(test_predictions)

In [18]:
response_list = []
response_err_list = []
sim_bin_idxs = np.digitize(sim_test.log_energy, energybins.log_energy_bins) - 1
data_bin_idxs = np.digitize(data.log_energy, energybins.log_energy_bins) - 1
energy_bin_idx = np.unique(sim_bin_idxs)
# energy_bin_idx = energy_bin_idx[1:]
print(energy_bin_idx)
for bin_idx in energy_bin_idx:
    sim_bin_mask = sim_bin_idxs == bin_idx
    data_bin_mask = data_bin_idxs == bin_idx
    response_mat = confusion_matrix(true_comp[sim_bin_mask], pred_comp[sim_bin_mask],
            labels=comp_list)
    # Transpose response matrix to get MC comp on x-axis and reco comp on y-axis
    response_mat = response_mat.T
    # Get response matrix statistical error
    response_mat_err = np.sqrt(response_mat)/response_mat.sum(axis=0, keepdims=True)
    response_err_list.append(response_mat_err)
    # Normalize along MC comp axis to go from counts to probabilities
    response_mat = response_mat.astype(float)/response_mat.sum(axis=0, keepdims=True)
    response_list.append(response_mat)
block_response = block_diag(response_list).toarray()
block_response = np.flipud(block_response)
print('block_response = \n{}'.format(block_response))
block_response_err = block_diag(response_err_list).toarray()
block_response_err = np.flipud(block_response_err)
print('block_response_err = \n{}'.format(block_response_err))


[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27]
block_response = 
[[ 0.          0.          0.         ...,  0.          0.34194529
   0.72911392]
 [ 0.          0.          0.         ...,  0.          0.65805471
   0.27088608]
 [ 0.          0.          0.         ...,  0.67821782  0.          0.        ]
 ..., 
 [ 0.          0.          0.7336594  ...,  0.          0.          0.        ]
 [ 0.27706331  0.65677749  0.         ...,  0.          0.          0.        ]
 [ 0.72293669  0.34322251  0.         ...,  0.          0.          0.        ]]
block_response_err = 
[[ 0.          0.          0.         ...,  0.          0.02279635
   0.03037975]
 [ 0.          0.          0.         ...,  0.          0.03162409
   0.01851739]
 [ 0.          0.          0.         ...,  0.05794406  0.          0.        ]
 ..., 
 [ 0.          0.          0.01806145 ...,  0.          0.          0.        ]
 [ 0.00442467  0.00748273  0.         ...,  0.          0.          0.        ]
 [ 0.00714729  0.00540927  0.         ...,  0.          0.          0.        ]]

Priors array


In [22]:
from icecube.weighting.weighting import from_simprod
from icecube.weighting.fluxes import GaisserH3a, GaisserH4a, Hoerandel5

In [23]:
simlist = np.unique(df_sim['sim'])
for i, sim in enumerate(simlist):
    _, sim_files = comp.simfunctions.get_level3_sim_files(sim)
    num_files = len(sim_files)
    if i == 0:
        generator = num_files*from_simprod(int(sim))
    else:
        generator += num_files*from_simprod(int(sim))

In [ ]:
flux = GaisserH3a()

In [97]:
df_sim[['MC_comp', 'MC_type']][df_sim.MC_comp == 'O']


Out[97]:
MC_comp MC_type
89387 O 1000080160
89388 O 1000080160
89389 O 1000080160
89390 O 1000080160
89391 O 1000080160
89392 O 1000080160
89393 O 1000080160
89394 O 1000080160
89395 O 1000080160
89396 O 1000080160
89397 O 1000080160
89398 O 1000080160
89399 O 1000080160
89400 O 1000080160
89401 O 1000080160
89402 O 1000080160
89403 O 1000080160
89404 O 1000080160
89406 O 1000080160
89421 O 1000080160
89422 O 1000080160
89423 O 1000080160
89424 O 1000080160
89426 O 1000080160
89428 O 1000080160
89431 O 1000080160
89432 O 1000080160
89433 O 1000080160
89454 O 1000080160
89456 O 1000080160
... ... ...
298184 O 1000080160
298185 O 1000080160
298188 O 1000080160
298189 O 1000080160
298190 O 1000080160
298192 O 1000080160
298194 O 1000080160
298195 O 1000080160
298230 O 1000080160
298232 O 1000080160
298234 O 1000080160
298235 O 1000080160
298236 O 1000080160
298237 O 1000080160
298238 O 1000080160
298241 O 1000080160
298242 O 1000080160
298243 O 1000080160
298244 O 1000080160
298245 O 1000080160
298247 O 1000080160
298248 O 1000080160
298249 O 1000080160
298251 O 1000080160
298253 O 1000080160
298257 O 1000080160
298258 O 1000080160
298260 O 1000080160
298261 O 1000080160
298262 O 1000080160

47678 rows × 2 columns


In [25]:
priors = defaultdict(list)
for flux, name in zip([GaisserH3a(), GaisserH4a(), Hoerandel5()], ['h3a', 'h4a', 'Hoerandel5']):
    for energy_mid in energybins.energy_midpoints:
        energy = [energy_mid]*4
        ptype = [2212, 1000020040, 1000080160, 1000260560]
        weights = flux(energy, ptype)/generator(energy, ptype)
    #     print(weights)
        light_prior = weights[:2].sum()/weights.sum()
        heavy_prior = weights[2:].sum()/weights.sum()
        priors[name].extend([light_prior, heavy_prior])

print(priors['h3a'])
print(priors['h4a'])


[0.79485424672400162, 0.2051457532759984, 0.78177507599322127, 0.21822492400677868, 0.76509234457320252, 0.23490765542679748, 0.74386528325718204, 0.25613471674281796, 0.71702794507974121, 0.28297205492025879, 0.68352718791259426, 0.31647281208740574, 0.64263221839977736, 0.35736778160022259, 0.59446276221066374, 0.40553723778933615, 0.54066937306006457, 0.45933062693993543, 0.4848140417529801, 0.51518595824701996, 0.43185878210312245, 0.56814121789687755, 0.38634654457343537, 0.61365345542656469, 0.35013964932575592, 0.64986035067424419, 0.32158482777117464, 0.6784151722288253, 0.29706559884125688, 0.70293440115874317, 0.27363011814444504, 0.72636988185555496, 0.25060502884488495, 0.74939497115511511, 0.22950895978329439, 0.77049104021670556, 0.2130490539309626, 0.78695094606903737, 0.20390734976915756, 0.79609265023084252, 0.20363846444133216, 0.79636153555866773, 0.21178653880906614, 0.78821346119093394, 0.22598373811501932, 0.77401626188498074, 0.24349899562679947, 0.75650100437320067, 0.2630930180037736, 0.7369069819962264, 0.28543971687283121, 0.71456028312716879, 0.31209084105950902, 0.68790915894049087]
[0.80050169680368843, 0.19949830319631159, 0.78801561325433145, 0.21198438674566844, 0.77210739006885387, 0.22789260993114607, 0.7518989129007978, 0.2481010870992022, 0.72640531499397443, 0.27359468500602546, 0.69467065254982474, 0.30532934745017537, 0.65606328905719191, 0.3439367109428082, 0.61076868965315778, 0.38923131034684227, 0.56040788759090743, 0.43959211240909263, 0.50835876997041884, 0.4916412300295811, 0.45924854414153404, 0.54075145585846596, 0.41726961601168183, 0.58273038398831822, 0.38413056738328449, 0.61586943261671556, 0.35832875411478404, 0.64167124588521596, 0.33659354061310415, 0.6634064593868958, 0.31629831532871305, 0.68370168467128689, 0.29692987413370198, 0.70307012586629791, 0.27998152366194523, 0.72001847633805482, 0.26798247834890793, 0.73201752165109213, 0.26335767706240926, 0.73664232293759069, 0.26741899932910101, 0.73258100067089893, 0.27965790706749749, 0.72034209293250251, 0.29804730541319208, 0.70195269458680798, 0.32072080999476466, 0.67927919000523529, 0.3478559826526415, 0.6521440173473585, 0.38210284985187809, 0.61789715014812197, 0.42760988697725738, 0.57239011302274267]

In [28]:
len(counts_observed)


Out[28]:
54

In [26]:
with open('pyunfold_dict.json', 'w') as outfile:
    data = {'counts': counts_observed,
            'block_response':block_response.tolist(),
            'block_response_err':block_response_err.tolist()}
    for model in ['h3a', 'h4a', 'Hoerandel5']:
        data['priors_{}'.format(model)] = priors[model]
    json.dump(data, outfile)

In [19]:
# Live-time information
goodrunlist = pd.read_table('/data/ana/CosmicRay/IceTop_GRL/IC79_2010_GoodRunInfo_4IceTop.txt', skiprows=[0, 3])
goodrunlist.head()


Out[19]:
RunNum Good_i3 Good_it Good_it_L2 LiveTime(s) ActiveStrings ActiveDoms ActiveInIceDoms OutDir Comment(s)
0 115978 1 1 0 26856 - - - /data/exp/IceCube/2010/filtered/level2a/0531 NaN
1 115982 1 1 1 28548 - - - /data/exp/IceCube/2010/filtered/level2a/0531 NaN
2 115984 1 1 1 6984 - - - /data/exp/IceCube/2010/filtered/level2a/0601 NaN
3 115985 1 1 1 10476 - - - /data/exp/IceCube/2010/filtered/level2a/0601 NaN
4 115986 1 1 1 28872 - - - /data/exp/IceCube/2010/filtered/level2a/0601 NaN

In [20]:
livetimes = goodrunlist['LiveTime(s)']
livetime = np.sum(livetimes[goodrunlist['Good_it_L2'] == 1])
print('livetime (seconds) = {}'.format(livetime))
print('livetime (days) = {}'.format(livetime/(24*60*60)))


livetime (seconds) = 27114012
livetime (days) = 313.819583333

In [21]:
fig, ax = plt.subplots()
for composition in comp_list + ['total']:
    # Calculate dN/dE
    y = num_reco_energy[composition]
    y_err = num_reco_energy_err[composition]
    # Add time duration
    y = y / livetime
    y_err = y / livetime
#     ax.errorbar(log_energy_midpoints, y, yerr=y_err,
#                 color=color_dict[composition], label=composition,
#                 marker='.', linestyle='None')
    plotting.plot_steps(energybins.log_energy_midpoints, y, y_err,
                        ax, color_dict[composition], composition)
ax.set_yscale("log", nonposy='clip')
plt.xlabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel('Rate [s$^{-1}$]')
ax.set_xlim([6.2, 8.0])
# ax.set_ylim([10**2, 10**5])
ax.grid(linestyle=':')
leg = plt.legend(loc='upper center', frameon=False,
          bbox_to_anchor=(0.5,  # horizontal
                          1.1),# vertical 
          ncol=len(comp_list)+1, fancybox=False)
# set the linewidth of each legend object
for legobj in leg.legendHandles:
    legobj.set_linewidth(3.0)
    
plt.show()



In [22]:
eff_area, eff_area_error, energy_midpoints = comp.analysis.get_effective_area(df_sim,
                                                            energybins.energy_bins, energy='MC')
fix, ax = plt.subplots()
ax.errorbar(np.log10(energy_midpoints), eff_area, yerr=eff_area_error)
ax.set_ylabel('Effective area [$m^2$]')
ax.set_xlabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.grid()
plt.show()


Calculating effective area...
Simulation set 7006: 30000 files
Simulation set 7007: 30000 files
Simulation set 7241: 10000 files
Simulation set 7242: 10000 files
Simulation set 7262: 19999 files
Simulation set 7263: 19998 files
Simulation set 7579: 12000 files
Simulation set 7784: 12000 files
Simulation set 7791: 12000 files
Simulation set 7851: 12000 files

In [23]:
energybins.log_energy_bins


Out[23]:
array([ 6.4,  6.5,  6.6,  6.7,  6.8,  6.9,  7. ,  7.1,  7.2,  7.3,  7.4,
        7.5,  7.6,  7.7,  7.8,  7.9,  8. ])

In [24]:
# Plot fraction of events vs energy
# fig, ax = plt.subplots(figsize=(8, 6))
fig = plt.figure()
ax = plt.gca()
for composition in comp_list + ['total']:
    # Calculate dN/dE
    y = num_reco_energy[composition]/energybins.energy_bin_widths
    y_err = num_reco_energy_err[composition]/energybins.energy_bin_widths
    # Add effective area
    y, y_err = comp.analysis.ratio_error(y, y_err, eff_area, eff_area_error)
    # Add solid angle
    y = y / solid_angle
    y_err = y_err / solid_angle
    # Add time duration
    y = y / livetime
    y_err = y / livetime
    # Add energy scaling 
#     energy_err = get_energy_res(df_sim, energy_bins)
#     energy_err = np.array(energy_err)
#     print(10**energy_err)
    y = energybins.energy_midpoints**2.7 * y
    y_err = energybins.energy_midpoints**2.7 * y_err
#     print(y)
#     print(y_err)
#     ax.errorbar(log_energy_midpoints, y, yerr=y_err, label=composition, color=color_dict[composition],
#            marker='.', markersize=8)
    plotting.plot_steps(energybins.log_energy_midpoints, y, y_err, ax, color_dict[composition], composition)
ax.set_yscale("log", nonposy='clip')
# ax.set_xscale("log", nonposy='clip')
plt.xlabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel('$\mathrm{E}^{2.7} \\frac{\mathrm{dN}}{\mathrm{dE dA d\Omega dt}} \ [\mathrm{GeV}^{1.7} \mathrm{m}^{-2} \mathrm{sr}^{-1} \mathrm{s}^{-1}]$')
ax.set_xlim([6.3, 8])
ax.set_ylim([10**3, 10**5])
ax.grid(linestyle='dotted', which="both")
    
# Add 3-year scraped flux
df_proton = pd.read_csv('3yearscraped/proton', sep='\t', header=None, names=['energy', 'flux'])
df_helium = pd.read_csv('3yearscraped/helium', sep='\t', header=None, names=['energy', 'flux'])
df_light = pd.DataFrame.from_dict({'energy': df_proton.energy, 
                                  'flux': df_proton.flux + df_helium.flux})

df_oxygen = pd.read_csv('3yearscraped/oxygen', sep='\t', header=None, names=['energy', 'flux'])
df_iron = pd.read_csv('3yearscraped/iron', sep='\t', header=None, names=['energy', 'flux'])
df_heavy = pd.DataFrame.from_dict({'energy': df_oxygen.energy, 
                                  'flux': df_oxygen.flux + df_iron.flux})

if comp_class:
    ax.plot(np.log10(df_light.energy), df_light.flux, label='3 yr light',
            marker='.', ls=':')
    ax.plot(np.log10(df_heavy.energy), df_heavy.flux, label='3 yr heavy',
            marker='.', ls=':')
    ax.plot(np.log10(df_heavy.energy), df_heavy.flux+df_light.flux, label='3 yr total',
            marker='.', ls=':')
else:
    ax.plot(np.log10(df_proton.energy), df_proton.flux, label='3 yr proton',
            marker='.', ls=':')
    ax.plot(np.log10(df_helium.energy), df_helium.flux, label='3 yr helium',
            marker='.', ls=':', color=color_dict['He'])
    ax.plot(np.log10(df_oxygen.energy), df_oxygen.flux, label='3 yr oxygen',
            marker='.', ls=':', color=color_dict['O'])
    ax.plot(np.log10(df_iron.energy), df_iron.flux, label='3 yr iron',
        marker='.', ls=':', color=color_dict['Fe'])
    ax.plot(np.log10(df_iron.energy), df_proton.flux+df_helium.flux+df_oxygen.flux+df_iron.flux, label='3 yr total',
    marker='.', ls=':', color='C2')


leg = plt.legend(loc='upper center', frameon=False,
          bbox_to_anchor=(0.5,  # horizontal
                          1.15),# vertical 
          ncol=len(comp_list)+1, fancybox=False)
# set the linewidth of each legend object
for legobj in leg.legendHandles:
    legobj.set_linewidth(3.0)

plt.savefig('/home/jbourbeau/public_html/figures/spectrum.png')
plt.show()



In [24]:
if not comp_class:
    # Add 3-year scraped flux
    df_proton = pd.read_csv('3yearscraped/proton', sep='\t', header=None, names=['energy', 'flux'])
    df_helium = pd.read_csv('3yearscraped/helium', sep='\t', header=None, names=['energy', 'flux'])
    df_oxygen = pd.read_csv('3yearscraped/oxygen', sep='\t', header=None, names=['energy', 'flux'])
    df_iron = pd.read_csv('3yearscraped/iron', sep='\t', header=None, names=['energy', 'flux'])
    # Plot fraction of events vs energy
    fig, axarr = plt.subplots(2, 2, figsize=(8, 6))
    for composition, ax in zip(comp_list + ['total'], axarr.flatten()):
        # Calculate dN/dE
        y = num_reco_energy[composition]/energybins.energy_bin_widths
        y_err = num_reco_energy_err[composition]/energybins.energy_bin_widths
        # Add effective area
        y, y_err = comp.analysis.ratio_error(y, y_err, eff_area, eff_area_error)
        # Add solid angle
        y = y / solid_angle
        y_err = y_err / solid_angle
        # Add time duration
        y = y / livetime
        y_err = y / livetime
        y = energybins.energy_midpoints**2.7 * y
        y_err = energybins.energy_midpoints**2.7 * y_err
        plotting.plot_steps(energybins.log_energy_midpoints, y, y_err, ax, color_dict[composition], composition)
        # Load 3-year flux
        df_3yr = pd.read_csv('3yearscraped/{}'.format(composition), sep='\t',
                             header=None, names=['energy', 'flux'])
        ax.plot(np.log10(df_3yr.energy), df_3yr.flux, label='3 yr {}'.format(composition),
                        marker='.', ls=':', color=color_dict[composition])
        ax.set_yscale("log", nonposy='clip')
        # ax.set_xscale("log", nonposy='clip')
        ax.set_xlabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
        ax.set_ylabel('$\mathrm{E}^{2.7} \\frac{\mathrm{dN}}{\mathrm{dE dA d\Omega dt}} \ [\mathrm{GeV}^{1.7} \mathrm{m}^{-2} \mathrm{sr}^{-1} \mathrm{s}^{-1}]$')
        ax.set_xlim([6.3, 8])
        ax.set_ylim([10**3, 10**5])
        ax.grid(linestyle='dotted', which="both")
        ax.legend()

    plt.savefig('/home/jbourbeau/public_html/figures/spectrum.png')
    plt.show()

Unfolding

[ back to top ]


In [20]:
reco_frac['light']


Out[20]:
array([ 0.79794826,  0.76499774,  0.7639696 ,  0.79415557,  0.78565179,
        0.77673225,  0.77012048,  0.78905735,  0.75768668,  0.75121275,
        0.77295025,  0.78764205,  0.79578947,  0.81094527,  0.800269  ,
        0.75702247,  0.67834681])

In [21]:
reco_frac['heavy']


Out[21]:
array([ 0.6625555 ,  0.69889899,  0.7124831 ,  0.70190964,  0.68406072,
        0.71247655,  0.71406728,  0.66709677,  0.71340206,  0.71594203,
        0.71060172,  0.7338538 ,  0.73440785,  0.73953824,  0.74112426,
        0.79719298,  0.8142514 ])

In [22]:
num_reco_energy['light']


Out[22]:
array([420632, 276894, 181297, 122976,  76838,  47431,  27133,  15700,
         8685,   4821,   2978,   1670,    985,    529,    275,    145,
           72])

In [23]:
num_reco_energy['heavy']


Out[23]:
array([325377, 243322, 169803, 107060,  69443,  44516,  29576,  18673,
        12246,   8142,   5537,   3782,   2593,   1804,   1139,    795,
          469])

In [9]:
pipeline.fit(sim_train.X, sim_train.y)
test_predictions = pipeline.predict(sim_test.X)
true_comp = sim_train.le.inverse_transform(sim_test.y)
pred_comp = sim_train.le.inverse_transform(test_predictions)
print(true_comp)
print(pred_comp)


['light' 'heavy' 'heavy' ..., 'light' 'heavy' 'light']
['light' 'light' 'heavy' ..., 'light' 'heavy' 'light']

In [10]:
data_pred = pipeline.predict(data.X)
observed_comp = sim_train.le.inverse_transform(data_pred)
print(observed_comp)


['light' 'light' 'light' ..., 'light' 'light' 'light']

In [11]:
comp_list


Out[11]:
['light', 'heavy']

In [16]:
sim_bin_idxs = np.digitize(sim_test.log_energy, energybins.log_energy_bins) - 1
data_bin_idxs = np.digitize(data.log_energy, energybins.log_energy_bins) - 1
energy_bin_idx = np.unique(sim_bin_idxs)
energy_bin_idx = energy_bin_idx[1:]
print(energy_bin_idx)
num_reco_energy_unfolded = defaultdict(list)
for bin_idx in energy_bin_idx:
    sim_bin_mask = sim_bin_idxs == bin_idx
    data_bin_mask = data_bin_idxs == bin_idx
    unfolder = comp.analysis.Unfolder()
    unfolded_events = unfolder.unfold(true_MC_comp=true_comp[sim_bin_mask],
                                        reco_MC_comp=pred_comp[sim_bin_mask],
                                        observed_comp=observed_comp[data_bin_mask],
                                        priors=[0.5, 0.5],
                                        labels=comp_list)
    print(unfolded_events)
    for i, composition in enumerate(comp_list):
        num_reco_energy_unfolded[composition].append(unfolded_events[i])

for composition in comp_list:
    num_reco_energy_unfolded[composition] = np.array(num_reco_energy_unfolded[composition])
num_reco_energy_unfolded['total'] = np.sum([num_reco_energy_unfolded[composition] for composition in comp_list], axis=0)
print(num_reco_energy_unfolded)


[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16]
242546 of label light
251603 of label heavy
[252034.97073310346, 242114.02926689654]
166316 of label light
170955 of label heavy
[172379.73369032133, 164891.26630967867]
109767 of label light
112713 of label heavy
[111615.96906515925, 110864.03093484075]
70235 of label light
71981 of label heavy
[70956.41855899457, 71259.58144100543]
42535 of label light
47202 of label heavy
[43823.056062721218, 45913.943937278782]
23757 of label light
31767 of label heavy
[25913.41237747125, 29610.58762252875]
13505 of label light
20240 of label heavy
[14987.041606214123, 18757.958393785877]
7546 of label light
13011 of label heavy
[8997.6993613439936, 11559.300638656006]
4350 of label light
8416 of label heavy
[5322.8364595714684, 7443.1635404285316]
2501 of label light
5893 of label heavy
[3228.194000041738, 5165.805999958262]
1407 of label light
3968 of label heavy
[1984.0936241934473, 3390.9063758065522]
889 of label light
2651 of label heavy
[1256.9501674307915, 2283.0498325692083]
455 of label light
1854 of label heavy
[735.36772465599006, 1573.6322753440099]
286 of label light
1119 of label heavy
[457.06934600699105, 947.93065399300906]
162 of label light
766 of label heavy
[263.84585345300104, 664.15414654699896]
68 of label light
470 of label heavy
[181.27541474817053, 356.72458525182947]
defaultdict(<type 'list'>, {'heavy': array([ 242114.0292669 ,  164891.26630968,  110864.03093484,
         71259.58144101,   45913.94393728,   29610.58762253,
         18757.95839379,   11559.30063866,    7443.16354043,
          5165.80599996,    3390.90637581,    2283.04983257,
          1573.63227534,     947.93065399,     664.15414655,
           356.72458525]), 'light': array([  2.52034971e+05,   1.72379734e+05,   1.11615969e+05,
         7.09564186e+04,   4.38230561e+04,   2.59134124e+04,
         1.49870416e+04,   8.99769936e+03,   5.32283646e+03,
         3.22819400e+03,   1.98409362e+03,   1.25695017e+03,
         7.35367725e+02,   4.57069346e+02,   2.63845853e+02,
         1.81275415e+02]), 'total': array([ 494149.,  337271.,  222480.,  142216.,   89737.,   55524.,
         33745.,   20557.,   12766.,    8394.,    5375.,    3540.,
          2309.,    1405.,     928.,     538.])})

In [27]:
fig, ax = plt.subplots()
for composition in comp_list + ['total']:
    # Calculate dN/dE
    y = num_reco_energy_unfolded[composition]/energy_bin_widths
    y_err = np.sqrt(y)/energy_bin_widths
    # Add effective area
    y, y_err = comp.analysis.ratio_error(y, y_err, eff_area, eff_area_error)
    # Add solid angle
    y = y / solid_angle
    y_err = y_err / solid_angle
    # Add time duration
    y = y / livetime
    y_err = y / livetime
    # Add energy scaling 
#     energy_err = get_energy_res(df_sim, energy_bins)
#     energy_err = np.array(energy_err)
#     print(10**energy_err)
    y = energy_midpoints**2.7 * y
    y_err = energy_midpoints**2.7 * y_err
    print(y)
    print(y_err)
#     ax.errorbar(log_energy_midpoints, y, yerr=y_err, label=composition, color=color_dict[composition],
#            marker='.', markersize=8)
    plotting.plot_steps(log_energy_midpoints, y, y_err, ax, color_dict[composition], composition)
ax.set_yscale("log", nonposy='clip')
# ax.set_xscale("log", nonposy='clip')
plt.xlabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel('$\mathrm{E}^{2.7} \\frac{\mathrm{dN}}{\mathrm{dE dA d\Omega dt}} \ [\mathrm{GeV}^{1.7} \mathrm{m}^{-2} \mathrm{sr}^{-1} \mathrm{s}^{-1}]$')
ax.set_xlim([6.3, 8])
ax.set_ylim([10**3, 10**5])
ax.grid(linestyle='dotted', which="both")
leg = plt.legend(loc='upper center', frameon=False,
          bbox_to_anchor=(0.5,  # horizontal
                          1.1),# vertical 
          ncol=len(comp_list)+1, fancybox=False)
# set the linewidth of each legend object
for legobj in leg.legendHandles:
    legobj.set_linewidth(3.0)
    
# plt.savefig('/home/jbourbeau/public_html/figures/spectrum.png')
plt.show()


/home/jbourbeau/.local/lib/python2.7/site-packages/ipykernel/__main__.py:5: RuntimeWarning: invalid value encountered in sqrt
[ 32964.86466106  27281.60673729  26245.51936798  25800.46976282
  24367.94359577  21912.38305619  17567.46971826  13989.07475035
   9781.68750813   6500.43997616   5592.42691983   3776.25783731
   2593.30366087    948.219778     -677.31023813  -3104.77818648
  -7049.02624428]
[  1.21578705e-03   1.00618111e-03   9.67968863e-04   9.51554855e-04
   8.98721428e-04   8.08157165e-04   6.47911114e-04   5.15935257e-04
   3.60761348e-04   2.39744674e-04   2.06255973e-04   1.39273297e-04
   9.56444093e-05   3.49715777e-05  -2.49800818e-05  -1.14508254e-04
  -2.59977249e-04]
[ 18219.23378418  20563.84242876  22859.52829982  19412.19755384
  19585.14401337  19234.70677643  20980.14221497  20353.69336981
  20570.6878795   21639.99796678  22252.57050605  22657.63387946
  24102.30337491  25050.21456208  24030.72816826  22910.40547222
  23293.44720902]
[ 0.00067195  0.00075842  0.00084309  0.00071595  0.00072233  0.0007094
  0.00077377  0.00075067  0.00075867  0.00079811  0.0008207   0.00083564
  0.00088892  0.00092388  0.00088628  0.00084497  0.00085909]
[ 51184.09844524  47845.44916605  49105.0476678   45212.66731665
  43953.08760914  41147.08983262  38547.61193322  34342.76812016
  30352.37538763  28140.43794294  27844.99742587  26433.89171677
  26695.60703578  25998.43434008  23353.41793013  19805.62728574
  16244.42096474]
[ 0.00188774  0.0017646   0.00181106  0.0016675   0.00162105  0.00151756
  0.00142169  0.00126661  0.00111944  0.00103786  0.00102696  0.00097492
  0.00098457  0.00095886  0.0008613   0.00073046  0.00059912]

Iterative method

Get confusion matrix for each energy bin


In [99]:
bin_idxs = np.digitize(energy_test_sim, log_energy_bins) - 1
energy_bin_idx = np.unique(bin_idxs)
energy_bin_idx = energy_bin_idx[1:]
print(energy_bin_idx)
num_reco_energy_unfolded = defaultdict(list)
response_mat = []
for bin_idx in energy_bin_idx:
    energy_bin_mask = bin_idxs == bin_idx
    confmat = confusion_matrix(true_comp[energy_bin_mask], pred_comp[energy_bin_mask], labels=comp_list)
    confmat = np.divide(confmat.T, confmat.sum(axis=1, dtype=float)).T
    response_mat.append(confmat)


[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16]

In [100]:
response_mat


Out[100]:
[array([[ 0.78813559,  0.21186441],
        [ 0.31475086,  0.68524914]]), array([[ 0.7609382 ,  0.2390618 ],
        [ 0.28674007,  0.71325993]]), array([[ 0.7688869 ,  0.2311131 ],
        [ 0.28886886,  0.71113114]]), array([[ 0.7945853 ,  0.2054147 ],
        [ 0.28737774,  0.71262226]]), array([[ 0.77821522,  0.22178478],
        [ 0.29981025,  0.70018975]]), array([[ 0.78485885,  0.21514115],
        [ 0.27861163,  0.72138837]]), array([[ 0.7826506 ,  0.2173494 ],
        [ 0.27828746,  0.72171254]]), array([[ 0.7976269 ,  0.2023731 ],
        [ 0.31419355,  0.68580645]]), array([[ 0.77818448,  0.22181552],
        [ 0.27147766,  0.72852234]]), array([[ 0.77269577,  0.22730423],
        [ 0.26884058,  0.73115942]]), array([[ 0.79117029,  0.20882971],
        [ 0.2786533 ,  0.7213467 ]]), array([[ 0.80965909,  0.19034091],
        [ 0.24698368,  0.75301632]]), array([[ 0.82175439,  0.17824561],
        [ 0.25508059,  0.74491941]]), array([[ 0.8336887 ,  0.1663113 ],
        [ 0.25829726,  0.74170274]]), array([[ 0.83120377,  0.16879623],
        [ 0.25887574,  0.74112426]]), array([[ 0.8005618 ,  0.1994382 ],
        [ 0.22245614,  0.77754386]]), array([[ 0.76010782,  0.23989218],
        [ 0.2113691 ,  0.7886309 ]])]

In [134]:
r = np.dstack((np.copy(num_reco_energy['light']), np.copy(num_reco_energy['heavy'])))[0]
for unfold_iter in range(50):
    print('Unfolding iteration {}...'.format(unfold_iter))
    if unfold_iter == 0:
        u = r
    fs = []
    for bin_idx in energy_bin_idx:
#         print(u)
        f = np.dot(response_mat[bin_idx], u[bin_idx])
        f[f < 0] = 0
        fs.append(f)
#         print(f)
    u = u + (r - fs)
#     u[u < 0] = 0
#     print(u)
unfolded_counts_iter = {}
unfolded_counts_iter['light'] = u[:,0]
unfolded_counts_iter['heavy'] = u[:,1]
unfolded_counts_iter['total'] = u.sum(axis=1)
print(unfolded_counts_iter)


Unfolding iteration 0...
Unfolding iteration 1...
Unfolding iteration 2...
Unfolding iteration 3...
Unfolding iteration 4...
Unfolding iteration 5...
Unfolding iteration 6...
Unfolding iteration 7...
Unfolding iteration 8...
Unfolding iteration 9...
Unfolding iteration 10...
Unfolding iteration 11...
Unfolding iteration 12...
Unfolding iteration 13...
Unfolding iteration 14...
Unfolding iteration 15...
Unfolding iteration 16...
Unfolding iteration 17...
Unfolding iteration 18...
Unfolding iteration 19...
Unfolding iteration 20...
Unfolding iteration 21...
Unfolding iteration 22...
Unfolding iteration 23...
Unfolding iteration 24...
Unfolding iteration 25...
Unfolding iteration 26...
Unfolding iteration 27...
Unfolding iteration 28...
Unfolding iteration 29...
Unfolding iteration 30...
Unfolding iteration 31...
Unfolding iteration 32...
Unfolding iteration 33...
Unfolding iteration 34...
Unfolding iteration 35...
Unfolding iteration 36...
Unfolding iteration 37...
Unfolding iteration 38...
Unfolding iteration 39...
Unfolding iteration 40...
Unfolding iteration 41...
Unfolding iteration 42...
Unfolding iteration 43...
Unfolding iteration 44...
Unfolding iteration 45...
Unfolding iteration 46...
Unfolding iteration 47...
Unfolding iteration 48...
Unfolding iteration 49...
{'heavy': array([ 303701.43653016,  241211.27404287,  163948.17757578,
        102626.3802578 ,   70692.20884366,   46642.5710298 ,
         32419.55282374,   22355.39549195,   15088.13580149,
         10197.17390693,    7197.59084351,    4861.16140094,
          3351.02448979,    2395.3345935 ,    1520.2817676 ,
          1027.42518063,     613.06883126]), 'light': array([  4.29377227e+05,   2.77284802e+05,   1.85884853e+05,
         1.25532190e+05,   7.52343857e+04,   4.53842925e+04,
         2.47564234e+04,   1.30573195e+04,   6.28028114e+03,
         3.06218741e+03,   1.70118771e+03,   8.19755039e+02,
         4.49884526e+02,   1.44692843e+02,   2.81304825e+01,
        -6.23410099e+01,  -9.21848439e+01]), 'total': array([  7.33078663e+05,   5.18496077e+05,   3.49833031e+05,
         2.28158570e+05,   1.45926595e+05,   9.20268635e+04,
         5.71759763e+04,   3.54127150e+04,   2.13684169e+04,
         1.32593613e+04,   8.89877856e+03,   5.68091644e+03,
         3.80090902e+03,   2.54002744e+03,   1.54841225e+03,
         9.65084171e+02,   5.20883987e+02])}

In [135]:
fig, ax = plt.subplots()
for composition in comp_list + ['total']:
    # Calculate dN/dE
    y = unfolded_counts_iter[composition]/energy_bin_widths
    y_err = np.sqrt(y)/energy_bin_widths
    # Add effective area
    y, y_err = comp.analysis.ratio_error(y, y_err, eff_area, eff_area_error)
    # Add solid angle
    y = y / solid_angle
    y_err = y_err / solid_angle
    # Add time duration
    y = y / livetime
    y_err = y / livetime
    # Add energy scaling 
#     energy_err = get_energy_res(df_sim, energy_bins)
#     energy_err = np.array(energy_err)
#     print(10**energy_err)
    y = energy_midpoints**2.7 * y
    y_err = energy_midpoints**2.7 * y_err
    print(y)
    print(y_err)
#     ax.errorbar(log_energy_midpoints, y, yerr=y_err, label=composition, color=color_dict[composition],
#            marker='.', markersize=8)
    plotting.plot_steps(log_energy_midpoints, y, y_err, ax, color_dict[composition], composition)
ax.set_yscale("log", nonposy='clip')
# ax.set_xscale("log", nonposy='clip')
plt.xlabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel('$\mathrm{E}^{2.7} \\frac{\mathrm{dN}}{\mathrm{dE dA d\Omega dt}} \ [\mathrm{GeV}^{1.7} \mathrm{m}^{-2} \mathrm{sr}^{-1} \mathrm{s}^{-1}]$')
ax.set_xlim([6.3, 8])
ax.set_ylim([10**3, 10**5])
ax.grid(linestyle='dotted', which="both")
leg = plt.legend(loc='upper center', frameon=False,
          bbox_to_anchor=(0.5,  # horizontal
                          1.1),# vertical 
          ncol=len(comp_list)+1, fancybox=False)
# set the linewidth of each legend object
for legobj in leg.legendHandles:
    legobj.set_linewidth(3.0)
    
# plt.savefig('/home/jbourbeau/public_html/figures/spectrum.png')
plt.show()


/home/jbourbeau/.local/lib/python2.7/site-packages/ipykernel/__main__.py:5: RuntimeWarning: invalid value encountered in sqrt
[ 30608.90300256  25739.19353959  26090.26032924  24994.4368117
  22855.5941019   20394.79482403  16744.52586455  12750.96672149
   8963.77697026   6521.34060238   5355.5471064    3821.85911214
   3190.60289416   1505.68516182    435.52646429  -1382.82112343
  -3470.66606373]
[  1.12889612e-03   9.49294908e-04   9.62242708e-04   9.21827312e-04
   8.42944014e-04   7.52186538e-04   6.17559875e-04   4.70272224e-04
   3.30595744e-04   2.40515517e-04   1.97519537e-04   1.40955131e-04
   1.17673581e-04   5.55316256e-05   1.60627820e-05  -5.10002401e-05
  -1.28002675e-04]
[ 21649.88554024  22390.63811411  23011.29201211  20433.71168285
  21475.71774523  20960.24006155  21927.64402967  21830.88988231
  21535.1321382   21716.25553755  22658.89678563  22663.68989536
  23765.62834405  24926.04108309  23537.56085353  22789.89776587
  23081.42094915]
[ 0.00079848  0.0008258   0.00084869  0.00075362  0.00079205  0.00077304
  0.00080872  0.00080515  0.00079424  0.00080092  0.00083569  0.00083587
  0.00087651  0.0009193   0.0008681   0.00084052  0.00085127]
[ 52258.78854279  48129.8316537   49101.55234134  45428.14849454
  44331.31184713  41355.03488558  38672.16989423  34581.8566038
  30498.90910846  28237.59613994  28014.44389202  26485.54900751
  26956.23123821  26431.7262449   23973.08731782  21407.07664245
  19610.75488542]
[ 0.00192737  0.00177509  0.00181093  0.00167545  0.001635    0.00152523
  0.00142628  0.00127542  0.00112484  0.00104144  0.00103321  0.00097682
  0.00099418  0.00097484  0.00088416  0.00078952  0.00072327]

In [106]:
print(num_reco_energy)


{'heavy': array([343258, 251555, 170285, 109209,  72054,  46292,  30287,  19434,
        12697,   8279,   5666,   3863,   2611,   1814,   1134,    785,
          464]), 'light': array([402751, 268661, 180815, 120827,  74227,  45655,  26422,  14939,
         8234,   4684,   2849,   1589,    967,    519,    280,    155,
           77]), 'total': array([746009, 520216, 351100, 230036, 146281,  91947,  56709,  34373,
        20931,  12963,   8515,   5452,   3578,   2333,   1414,    940,
          541])}

In [107]:
comp_list = ['light', 'heavy']
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train_sim, y_train_sim)
test_predictions = pipeline.predict(X_test_sim)
# correctly_identified_mask = (test_predictions == y_test)
# confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)/len(y_pred)
true_comp = le.inverse_transform(y_test_sim)
pred_comp = le.inverse_transform(test_predictions)
confmat = confusion_matrix(true_comp, pred_comp, labels=comp_list)

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Greens):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='None', cmap=cmap,
               vmin=0, vmax=1.0)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, '{:0.3f}'.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True composition')
    plt.xlabel('Predicted composition')

fig, ax = plt.subplots()
plot_confusion_matrix(confmat, classes=['light', 'heavy'], normalize=True,
                      title='Confusion matrix, without normalization')

# # Plot normalized confusion matrix
# plt.figure()
# plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
#                       title='Normalized confusion matrix')

plt.show()


Normalized confusion matrix
[[ 0.77924806  0.22075194]
 [ 0.31332376  0.68667624]]

In [63]:
comp_list = ['light', 'heavy']
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train_sim, y_train_sim)
test_predictions = pipeline.predict(X_test_sim)
# correctly_identified_mask = (test_predictions == y_test)
# confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)/len(y_pred)
true_comp = le.inverse_transform(y_test_sim)
pred_comp = le.inverse_transform(test_predictions)
confmat = confusion_matrix(true_comp, pred_comp, labels=comp_list)

inverse = np.linalg.inv(confmat)
inverse


Out[63]:
array([[  4.65885e-05,  -1.58473e-05],
       [ -1.80751e-05,   5.14300e-05]])

In [64]:
confmat


Out[64]:
array([[24379,  7512],
       [ 8568, 22084]])

In [66]:
comp_list = ['light', 'heavy']
# Get number of events per energy bin
num_reco_energy, num_reco_energy_err = get_num_comp_reco(X_train_sim, y_train_sim, X_test_data, comp_list)
# Energy-related variables
energy_bin_width = 0.1
energy_bins = np.arange(6.2, 8.1, energy_bin_width)
energy_midpoints = (energy_bins[1:] + energy_bins[:-1]) / 2
energy_bin_widths = 10**energy_bins[1:] - 10**energy_bins[:-1]
def get_energy_res(df_sim, energy_bins):
    reco_log_energy = df_sim['lap_log_energy'].values 
    MC_log_energy = df_sim['MC_log_energy'].values
    energy_res = reco_log_energy - MC_log_energy
    bin_centers, bin_medians, energy_err = comp.analysis.data_functions.get_medians(reco_log_energy,
                                                                               energy_res,
                                                                               energy_bins)
    return np.abs(bin_medians)
# Solid angle
solid_angle = 2*np.pi*(1-np.cos(np.arccos(0.85)))
# solid_angle = 2*np.pi*(1-np.cos(40*(np.pi/180)))
print(solid_angle)
print(2*np.pi*(1-np.cos(40*(np.pi/180))))
# Live-time information
start_time = np.amin(df_data['start_time_mjd'].values)
end_time = np.amax(df_data['end_time_mjd'].values)
day_to_sec = 24 * 60 * 60.
dt = day_to_sec * (end_time - start_time)
print(dt)
# Plot fraction of events vs energy
fig, ax = plt.subplots()
for i, composition in enumerate(comp_list):
    num_reco_bin = np.array([[i, j] for i, j in zip(num_reco_energy['light'], num_reco_energy['heavy'])])
#     print(num_reco_bin)
    num_reco = np.array([np.dot(inverse, i) for i in num_reco_bin])
    print(num_reco)
    num_reco_2 = {'light': num_reco[:, 0], 'heavy': num_reco[:, 1]}
    # Calculate dN/dE
    y = num_reco_2[composition]/energy_bin_widths
    y_err = num_reco_energy_err[composition]/energy_bin_widths
    # Add effective area
    y, y_err = comp.analysis.ratio_error(y, y_err, eff_area, eff_area_error)
    # Add solid angle
    y = y / solid_angle
    y_err = y_err / solid_angle
    # Add time duration
    y = y / dt
    y_err = y / dt
    # Add energy scaling 
    energy_err = get_energy_res(df_sim, energy_bins)
    energy_err = np.array(energy_err)
#     print(10**energy_err)
    y = (10**energy_midpoints)**2.7 * y
    y_err = (10**energy_midpoints)**2.7 * y_err
    plotting.plot_steps(energy_midpoints, y, y_err, ax, color_dict[composition], composition)
ax.set_yscale("log", nonposy='clip')
plt.xlabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel('$\mathrm{E}^{2.7} \\frac{\mathrm{dN}}{\mathrm{dE dA d\Omega dt}} \ [\mathrm{GeV}^{1.7} \mathrm{m}^{-2} \mathrm{sr}^{-1} \mathrm{s}^{-1}]$')
ax.set_xlim([6.2, 8.0])
# ax.set_ylim([10**2, 10**5])
ax.grid()
leg = plt.legend(loc='upper center', 
          bbox_to_anchor=(0.5,  # horizontal
                          1.1),# vertical 
          ncol=len(comp_list)+1, fancybox=False)
# set the linewidth of each legend object
for legobj in leg.legendHandles:
    legobj.set_linewidth(3.0)
    
plt.show()


0.942477796077
1.46998611753
1409662.44128
[[  1.09286e+00   4.25348e-01]
 [  6.78230e-01   4.21116e-01]
 [  3.64535e-01   4.12274e-01]
 [  2.15254e-01   3.05955e-01]
 [  1.50773e-01   1.80818e-01]
 [  9.30369e-02   1.16594e-01]
 [  6.14838e-02   6.87016e-02]
 [  2.92043e-02   5.37845e-02]
 [  1.02666e-02   4.00306e-02]
 [  2.43373e-03   2.84436e-02]
 [  4.68133e-03   1.48927e-02]
 [  1.87090e-03   1.15002e-02]
 [ -6.73351e-04   9.90623e-03]
 [ -2.40343e-04   5.57233e-03]
 [ -2.92290e-04   4.14347e-03]
 [ -1.09387e-04   2.35180e-03]
 [ -2.28537e-04   1.67352e-03]
 [ -9.98517e-05   9.89654e-04]]
[[  1.09286e+00   4.25348e-01]
 [  6.78230e-01   4.21116e-01]
 [  3.64535e-01   4.12274e-01]
 [  2.15254e-01   3.05955e-01]
 [  1.50773e-01   1.80818e-01]
 [  9.30369e-02   1.16594e-01]
 [  6.14838e-02   6.87016e-02]
 [  2.92043e-02   5.37845e-02]
 [  1.02666e-02   4.00306e-02]
 [  2.43373e-03   2.84436e-02]
 [  4.68133e-03   1.48927e-02]
 [  1.87090e-03   1.15002e-02]
 [ -6.73351e-04   9.90623e-03]
 [ -2.40343e-04   5.57233e-03]
 [ -2.92290e-04   4.14347e-03]
 [ -1.09387e-04   2.35180e-03]
 [ -2.28537e-04   1.67352e-03]
 [ -9.98517e-05   9.89654e-04]]

In [44]:
pipeline.get_params()['classifier__max_depth']


Out[44]:
6

In [47]:
energy_bin_width = 0.1
energy_bins = np.arange(6.2, 8.1, energy_bin_width)
fig, axarr = plt.subplots(1, 2)
for composition, ax in zip(comp_list, axarr.flatten()):
    MC_comp_mask = (df_sim['MC_comp_class'] == composition)
    MC_log_energy = df_sim['MC_log_energy'][MC_comp_mask].values
    reco_log_energy = df_sim['lap_log_energy'][MC_comp_mask].values
    plotting.histogram_2D(MC_log_energy, reco_log_energy, energy_bins, log_counts=True, ax=ax)
    ax.plot([0,10], [0,10], marker='None', linestyle='-.')
    ax.set_xlim([6.2, 8])
    ax.set_ylim([6.2, 8])
    ax.set_xlabel('$\log_{10}(E_{\mathrm{MC}}/\mathrm{GeV})$')
    ax.set_ylabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
    ax.set_title('{} response matrix'.format(composition))
plt.tight_layout()
plt.show()



In [10]:
energy_bins = np.arange(6.2, 8.1, energy_bin_width)
10**energy_bins[1:] - 10**energy_bins[:-1]


Out[10]:
array([   410369.12250776,    516624.1165407 ,    650391.2286588 ,
          818794.04536659,   1030800.63073774,   1297701.1085292 ,
         1633708.90244087,   2056717.65275717,   2589254.11794165,
         3259677.80666943,   4103691.22507761,   5166241.16540694,
         6503912.28658791,   8187940.45366581,  10308006.30737735,
        12977011.08529189,  16337089.02440856,  20567176.52757148])

In [ ]:
probs = pipeline.named_steps['classifier'].predict_proba(X_test)
prob_1 = probs[:, 0][MC_iron_mask]
prob_2 = probs[:, 1][MC_iron_mask]
# print(min(prob_1-prob_2))
# print(max(prob_1-prob_2))
# plt.hist(prob_1-prob_2, bins=30, log=True)
plt.hist(prob_1, bins=np.linspace(0, 1, 50), log=True)
plt.hist(prob_2, bins=np.linspace(0, 1, 50), log=True)

In [ ]:
probs = pipeline.named_steps['classifier'].predict_proba(X_test)
dp1 = (probs[:, 0]-probs[:, 1])[MC_proton_mask]
print(min(dp1))
print(max(dp1))
dp2 = (probs[:, 0]-probs[:, 1])[MC_iron_mask]
print(min(dp2))
print(max(dp2))
fig, ax = plt.subplots()
# plt.hist(prob_1-prob_2, bins=30, log=True)
counts, edges, pathes = plt.hist(dp1, bins=np.linspace(-1, 1, 100), log=True, label='Proton', alpha=0.75)
counts, edges, pathes = plt.hist(dp2, bins=np.linspace(-1, 1, 100), log=True, label='Iron', alpha=0.75)
plt.legend(loc=2)
plt.show()
pipeline.named_steps['classifier'].classes_

In [ ]:
print(pipeline.named_steps['classifier'].classes_)
le.inverse_transform(pipeline.named_steps['classifier'].classes_)

In [ ]:
pipeline.named_steps['classifier'].decision_path(X_test)

In [48]:
comp_list = ['light', 'heavy']
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train_sim, y_train_sim)
# test_probs = defaultdict(list)
fig, ax = plt.subplots()
test_predictions = pipeline.predict(X_test_data)
test_probs = pipeline.predict_proba(X_test_data)
for class_ in pipeline.classes_:
    test_predictions == le.inverse_transform(class_)
    plt.hist(test_probs[:, class_], bins=np.linspace(0, 1, 50),
             histtype='step', label=composition,
             color=color_dict[composition], alpha=0.8, log=True)
plt.ylabel('Counts')
plt.xlabel('Testing set class probabilities')
plt.legend()
plt.grid()
plt.show()



In [5]:
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train, y_train)
test_predictions = pipeline.predict(X_test)

comp_list = ['P', 'He', 'O', 'Fe']
fig, ax = plt.subplots()
test_probs = pipeline.predict_proba(X_test)
fig, axarr = plt.subplots(2, 2, sharex=True, sharey=True)
for composition, ax in zip(comp_list, axarr.flatten()):
    comp_mask = (le.inverse_transform(y_test) == composition)
    probs = np.copy(test_probs[comp_mask])
    print('probs = {}'.format(probs.shape))
    weighted_mass = np.zeros(len(probs))
    for class_ in pipeline.classes_:
        c = le.inverse_transform(class_)
        weighted_mass += comp.simfunctions.comp2mass(c) * probs[:, class_]
    print('min = {}'.format(min(weighted_mass)))
    print('max = {}'.format(max(weighted_mass)))
    ax.hist(weighted_mass, bins=np.linspace(0, 5, 100),
             histtype='step', label=None, color='darkgray',
             alpha=1.0, log=False)
    for c in comp_list:
        ax.axvline(comp.simfunctions.comp2mass(c), color=color_dict[c],
                   marker='None', linestyle='-')
    ax.set_ylabel('Counts')
    ax.set_xlabel('Weighted atomic number')
    ax.set_title('MC {}'.format(composition))
    ax.grid()
plt.tight_layout()
plt.show()


probs = (15883, 4)
min = 1.38011981318
max = 3.56902754108
probs = (15920, 4)
min = 1.40518894839
max = 3.5562126663
probs = (15507, 4)
min = 1.45593414673
max = 3.57278805564
probs = (15066, 4)
min = 1.50426220527
max = 3.60566706999

In [15]:
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train, y_train)
test_predictions = pipeline.predict(X_test)

comp_list = ['P', 'He', 'O', 'Fe']
fig, ax = plt.subplots()
test_probs = pipeline.predict_proba(X_test)
fig, axarr = plt.subplots(2, 2, sharex=True, sharey=True)
for composition, ax in zip(comp_list, axarr.flatten()):
    comp_mask = (le.inverse_transform(y_test) == composition)
    probs = np.copy(test_probs[comp_mask])
    weighted_mass = np.zeros(len(probs))
    for class_ in pipeline.classes_:
        c = le.inverse_transform(class_)
        ax.hist(probs[:, class_], bins=np.linspace(0, 1, 50),
                 histtype='step', label=c, color=color_dict[c],
                 alpha=1.0, log=True)
    ax.legend(title='Reco comp', framealpha=0.5)
    ax.set_ylabel('Counts')
    ax.set_xlabel('Testing set class probabilities')
    ax.set_title('MC {}'.format(composition))
    ax.grid()
plt.tight_layout()
plt.show()



In [25]:
comp_list = ['light', 'heavy']
test_probs = defaultdict(list)
fig, ax = plt.subplots()
# test_probs = pipeline.predict_proba(X_test)
for event in pipeline.predict_proba(X_test_data):
    composition = le.inverse_transform(np.argmax(event))
    test_probs[composition].append(np.amax(event))
for composition in comp_list:
    plt.hist(test_probs[composition], bins=np.linspace(0, 1, 100),
             histtype='step', label=composition,
             color=color_dict[composition], alpha=0.8, log=False)
plt.ylabel('Counts')
plt.xlabel('Testing set class probabilities')
plt.legend(title='Reco comp')
plt.grid()
plt.show()



NotFittedErrorTraceback (most recent call last)
<ipython-input-25-56957a68d00e> in <module>()
      3 fig, ax = plt.subplots()
      4 # test_probs = pipeline.predict_proba(X_test)
----> 5 for event in pipeline.predict_proba(X_test_data):
      6     composition = le.inverse_transform(np.argmax(event))
      7     test_probs[composition].append(np.amax(event))

/home/jbourbeau/.local/lib/python2.7/site-packages/sklearn/utils/metaestimators.pyc in <lambda>(*args, **kwargs)
     52 
     53         # lambda, but not partial, allows help() to work with update_wrapper
---> 54         out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
     55         # update the docstring of the returned function
     56         update_wrapper(out, self.fn)

/home/jbourbeau/.local/lib/python2.7/site-packages/sklearn/pipeline.pyc in predict_proba(self, X)
    377         for name, transform in self.steps[:-1]:
    378             if transform is not None:
--> 379                 Xt = transform.transform(Xt)
    380         return self.steps[-1][-1].predict_proba(Xt)
    381 

/home/jbourbeau/.local/lib/python2.7/site-packages/sklearn/preprocessing/data.pyc in transform(self, X, y, copy)
    639             The data used to scale along the features axis.
    640         """
--> 641         check_is_fitted(self, 'scale_')
    642 
    643         copy = copy if copy is not None else self.copy

/home/jbourbeau/.local/lib/python2.7/site-packages/sklearn/utils/validation.pyc in check_is_fitted(estimator, attributes, msg, all_or_any)
    688     if not all_or_any([hasattr(estimator, attr) for attr in attributes]):
    689         # FIXME NotFittedError_ --> NotFittedError in 0.19
--> 690         raise _NotFittedError(msg % {'name': type(estimator).__name__})
    691 
    692 

NotFittedError: This StandardScaler instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [ ]:


In [ ]: