In [1]:
import sys
sys.path.append('/home/jbourbeau/cr-composition')
sys.path


Out[1]:
['',
 '/cvmfs/icecube.opensciencegrid.org/py2-v1/RHEL_6_x86_64/lib/python2.7/site-packages/setuptools-15.2-py2.7.egg',
 '/cvmfs/icecube.opensciencegrid.org/py2-v1/RHEL_6_x86_64/lib/python2.7/site-packages/setuptools-15.2-py2.7.egg',
 '/home/jbourbeau/.local/lib/python2.7/site-packages',
 '/cvmfs/icecube.opensciencegrid.org/py2-v1/RHEL_6_x86_64/i3ports/root-v5.34.18/lib',
 '/cvmfs/icecube.opensciencegrid.org/py2-v1/RHEL_6_x86_64/lib/python2.7/site-packages',
 '/cvmfs/icecube.opensciencegrid.org/py2-v1/RHEL_6_x86_64/i3ports/lib/python2.7/site-packages',
 '/data/user/jbourbeau/metaprojects/icerec/V05-00-00/build/lib',
 '/home/jbourbeau/cr-composition/analysis',
 '/home/jbourbeau',
 '/home/jbourbeau/useful',
 '/home/jbourbeau/anisotropy',
 '/home/jbourbeau/ShowerLLH_scripts',
 '/cvmfs/icecube.opensciencegrid.org/py2-v1/RHEL_6_x86_64/lib/python27.zip',
 '/cvmfs/icecube.opensciencegrid.org/py2-v1/RHEL_6_x86_64/lib/python2.7',
 '/cvmfs/icecube.opensciencegrid.org/py2-v1/RHEL_6_x86_64/lib/python2.7/plat-linux2',
 '/cvmfs/icecube.opensciencegrid.org/py2-v1/RHEL_6_x86_64/lib/python2.7/lib-tk',
 '/cvmfs/icecube.opensciencegrid.org/py2-v1/RHEL_6_x86_64/lib/python2.7/lib-old',
 '/cvmfs/icecube.opensciencegrid.org/py2-v1/RHEL_6_x86_64/lib/python2.7/lib-dynload',
 '/home/jbourbeau/.local/lib/python2.7/site-packages/IPython/extensions',
 '/home/jbourbeau/.ipython',
 '/home/jbourbeau/cr-composition']

In [2]:
import argparse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn.apionly as sns

from sklearn.metrics import accuracy_score
from sklearn.model_selection import validation_curve, GridSearchCV, cross_val_score, ParameterGrid

import composition as comp

%matplotlib inline


/home/jbourbeau/.local/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

In [3]:
sns.set_palette('muted')
sns.set_color_codes()

In [4]:
df, cut_dict = comp.load_sim(return_cut_dict=True)
selection_mask = np.array([True] * len(df))
standard_cut_keys = ['LLHLF_reco_exists', 'LLHLF_zenith', 'num_hits_1_30', 'IT_signal',
                     'StationDensity', 'max_qfrac_1_30', 'LLHLF_containment', 'energy_range']
# standard_cut_keys = ['LLHlap_reco_exists', 'LLHlap_zenith', 'num_hits_1_30', 'IT_signal',
#                      'StationDensity', 'max_qfrac_1_30', 'LLHlap_containment', 'energy_range']
# standard_cut_keys = ['reco_exists', 'reco_zenith', 'num_hits_1_60', 'IT_signal',
#                      'StationDensity', 'max_qfrac_1_60', 'reco_containment', 'energy_range']
for key in standard_cut_keys:
    selection_mask *= cut_dict[key]

df = df[selection_mask]

feature_list, feature_labels = comp.get_training_features()
print('training features = {}'.format(feature_list))
X_train, X_test, y_train, y_test, le = comp.get_train_test_sets(
    df, feature_list)

print('number training events = ' + str(y_train.shape[0]))


/home/jbourbeau/cr-composition/composition/load_sim.py:87: RuntimeWarning: divide by zero encountered in log10
  df['reco_log_energy'] = np.nan_to_num(np.log10(df['reco_energy']))
training features = ['reco_log_energy', 'InIce_log_charge_1_30', 'LLHLF_cos_zenith', 'lap_chi2', 'NChannels_1_30', 'log_s125']
number training events = 33009

Get error in charge vs. energy distribution


In [5]:
pipeline = comp.get_pipeline('tpot')
pipeline.steps


Out[5]:
[('featureunion', FeatureUnion(n_jobs=1,
         transformer_list=[('votingclassifier', VotingClassifier(estimators=[('est', GaussianNB(priors=None))], n_jobs=1,
           voting='hard', weights=None)), ('functiontransformer', FunctionTransformer(accept_sparse=False,
            func=<function <lambda> at 0x699d2a8>, inv_kw_args=None,
            inverse_func=None, kw_args=None, pass_y=False, validate=True))],
         transformer_weights=None)),
 ('pca',
  PCA(copy=True, iterated_power=10, n_components=None, random_state=None,
    svd_solver='randomized', tol=0.0, whiten=False)),
 ('randomforestclassifier',
  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=None, max_features='auto', max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
              verbose=0, warm_start=False))]

In [ ]:
pipeline = comp.get_pipeline('tpot')
param_range = np.arange(2, 10, 1)
param_grid = {'randomforestclassifier__max_depth': param_range}
gs = GridSearchCV(estimator=pipeline,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=5,
                  verbose=2,
                  n_jobs=1)
gs = gs.fit(X_train, y_train)
print('best GS CV score = {}'.format(gs.best_score_))
print('best GS CV depths = {}'.format(gs.best_params_))
print('Grid scores on development set:')
means = gs.cv_results_['mean_test_score']
stds = gs.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, gs.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
# pipeline.set_params(**gs.best_params_)
# pipeline.fit(X_train, y_train)
# scaler = pipeline.named_steps['scaler']
# clf = pipeline.named_steps['classifier']


Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] randomforestclassifier__max_depth=2 .............................
[CV] .................... randomforestclassifier__max_depth=2 -   0.7s
[CV] randomforestclassifier__max_depth=2 .............................
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.3s remaining:    0.0s
[CV] .................... randomforestclassifier__max_depth=2 -   0.8s
[CV] randomforestclassifier__max_depth=2 .............................
[CV] .................... randomforestclassifier__max_depth=2 -   0.6s
[CV] randomforestclassifier__max_depth=2 .............................
[CV] .................... randomforestclassifier__max_depth=2 -   0.6s
[CV] randomforestclassifier__max_depth=2 .............................
[CV] .................... randomforestclassifier__max_depth=2 -   0.7s
[CV] randomforestclassifier__max_depth=3 .............................
[CV] .................... randomforestclassifier__max_depth=3 -   0.5s
[CV] randomforestclassifier__max_depth=3 .............................
[CV] .................... randomforestclassifier__max_depth=3 -   0.7s
[CV] randomforestclassifier__max_depth=3 .............................
[CV] .................... randomforestclassifier__max_depth=3 -   0.7s
[CV] randomforestclassifier__max_depth=3 .............................
[CV] .................... randomforestclassifier__max_depth=3 -   0.8s
[CV] randomforestclassifier__max_depth=3 .............................
[CV] .................... randomforestclassifier__max_depth=3 -   0.5s
[CV] randomforestclassifier__max_depth=4 .............................
[CV] .................... randomforestclassifier__max_depth=4 -   0.5s
[CV] randomforestclassifier__max_depth=4 .............................
[CV] .................... randomforestclassifier__max_depth=4 -   0.7s
[CV] randomforestclassifier__max_depth=4 .............................

In [5]:
pipeline = comp.get_pipeline('tpot')
pipeline.fit(X_train, y_train)
pipeline.__class__.__name__


Out[5]:
'Pipeline'

In [6]:
clf_name = pipeline.__class__.__name__
print('=' * 30)
print(clf_name)
test_predictions = pipeline.predict(X_test)
test_acc = accuracy_score(y_test, test_predictions)
print('Test accuracy: {:.4%}'.format(test_acc))
train_predictions = pipeline.predict(X_train)
train_acc = accuracy_score(y_train, train_predictions)
print('Train accuracy: {:.4%}'.format(train_acc))
# scores = cross_val_score(
#     estimator=pipeline, X=X_test, y=y_test, cv=10, n_jobs=1)
# print('CV score: {:.2%} (+/- {:.2%})'.format(scores.mean(), scores.std()))
print('=' * 30)


==============================
Pipeline
Test accuracy: 82.8798%
Train accuracy: 100.0000%
==============================

In [7]:
correctly_identified_mask = (test_predictions == y_test)

energy_bin_width = 0.1
energy_bins = np.arange(6.2, 8.1, energy_bin_width)
# energy_bins = np.arange(6.2, 9.51, energy_bin_width)
energy_midpoints = (energy_bins[1:] + energy_bins[:-1]) / 2
log_energy = X_test[:, 0]

MC_proton_mask = (le.inverse_transform(y_test) == 'P')
MC_iron_mask = (le.inverse_transform(y_test) == 'Fe')
# Get number of MC proton and iron as a function of MC energy
num_MC_protons_energy = np.histogram(log_energy[MC_proton_mask],
                                     bins=energy_bins)[0]
num_MC_protons_energy_err = np.sqrt(num_MC_protons_energy)
num_MC_irons_energy = np.histogram(log_energy[MC_iron_mask],
                                   bins=energy_bins)[0]
num_MC_irons_energy_err = np.sqrt(num_MC_irons_energy)
num_MC_total_energy = np.histogram(log_energy, bins=energy_bins)[0]
num_MC_total_energy_err = np.sqrt(num_MC_total_energy)

# Get number of reco proton and iron as a function of MC energy
num_reco_proton_energy = np.histogram(
    log_energy[MC_proton_mask & correctly_identified_mask],
    bins=energy_bins)[0]
num_reco_proton_energy_err = np.sqrt(num_reco_proton_energy)
num_reco_iron_energy = np.histogram(
    log_energy[MC_iron_mask & correctly_identified_mask],
    bins=energy_bins)[0]
num_reco_iron_energy_err = np.sqrt(num_reco_iron_energy)
num_reco_total_energy = np.histogram(
    log_energy[correctly_identified_mask],
    bins=energy_bins)[0]
num_reco_total_energy_err = np.sqrt(num_reco_total_energy)

# Calculate reco proton and iron fractions as a function of MC energy
reco_proton_frac, reco_proton_frac_err = comp.ratio_error(
    num_reco_proton_energy, num_reco_proton_energy_err,
    num_MC_protons_energy, num_MC_protons_energy_err)

reco_iron_frac, reco_iron_frac_err = comp.ratio_error(
    num_reco_iron_energy, num_reco_iron_energy_err,
    num_MC_irons_energy, num_MC_irons_energy_err)

reco_total_frac, reco_total_frac_err = comp.ratio_error(
    num_reco_total_energy, num_reco_total_energy_err,
    num_MC_total_energy, num_MC_total_energy_err)

In [8]:
# Plot fraction of events vs energy
fig, ax = plt.subplots()
ax.errorbar(energy_midpoints, reco_proton_frac,
            yerr=reco_proton_frac_err,
            # xerr=energy_bin_width / 2,
            marker='.', markersize=10,
            label='Proton')
ax.errorbar(energy_midpoints, reco_iron_frac,
            yerr=reco_iron_frac_err,
            # xerr=energy_bin_width / 2,
            marker='.', markersize=10,
            label='Iron')
ax.errorbar(energy_midpoints, reco_total_frac,
            yerr=reco_total_frac_err,
            # xerr=energy_bin_width / 2,
            marker='.', markersize=10,
            label='Total')
plt.xlabel('$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$')
ax.set_ylabel('Fraction correctly identified')
ax.set_ylim([0.0, 1.0])
ax.set_xlim([6.2, 8.0])
# ax.set_xlim([6.2, 9.5])
plt.grid()
plt.legend(loc=3)
# place a text box in upper left in axes coords
textstr = 'Training features: \n'
for i, label in enumerate(feature_labels):
    if (i == len(feature_labels)-1):
        textstr += '{}) '.format(i+1) + label
    else:
        textstr += '{}) '.format(i+1) + label + '\n'
print('textstr = \n' + textstr)
props = dict(facecolor='white')
ax.text(0.7, 0.3, textstr, transform=ax.transAxes, fontsize=8,
        verticalalignment='top', bbox=props)
outfile = '/home/jbourbeau/public_html/figures/composition' + \
          '/fraction-reco-correct_vs_reco-energy_RF.png'
plt.savefig(outfile)
plt.show()
print(reco_total_frac)


textstr = 
Training features: 
1) $\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$
2) InIce charge (top 50\%)
3) $\cos(\theta_{\mathrm{LLH+COG}})$
4) $\chi^2_{\mathrm{Lap}}/\mathrm{n.d.f}$
5) NChannels (top 50\%)
6) $\log_{10}(S_{\mathrm{125}})$
[ 0.82190265  0.81196581  0.83333333  0.85010707  0.84742268  0.82890855
  0.81779661  0.81818182  0.83757962  0.83830455  0.81081081  0.83202512
  0.83361345  0.81049069  0.83523654  0.85077187  0.84333821  0.80663616]

In [11]:
num_features = len(feature_list)
importances = pipeline.named_steps['classifier'].feature_importances_
indices = np.argsort(importances)[::-1]

fig, ax = plt.subplots()
# feature_labels = np.array(['$\\log_{10}({\mathrm{E/GeV})$', 'InIce charge',
#                            '$\cos(\\theta)$', '$\mathrm{Laputop}\ \chi^2/\mathrm{n.d.f.}$', 'NChannels'])
for f in range(num_features):
    print('{}) {}'.format(f + 1, importances[indices[f]]))

plt.ylabel('Feature Importances')
plt.bar(range(num_features),
        importances[indices],
        align='center')

plt.xticks(range(num_features),
           feature_labels[indices], rotation=90)
plt.xlim([-1, len(feature_list)])
plt.ylim([0, .40])
plt.show()



AttributeErrorTraceback (most recent call last)
<ipython-input-11-1690194e4629> in <module>()
      1 num_features = len(feature_list)
----> 2 importances = pipeline.named_steps['classifier'].feature_importances_
      3 indices = np.argsort(importances)[::-1]
      4 
      5 fig, ax = plt.subplots()

AttributeError: 'KNeighborsClassifier' object has no attribute 'feature_importances_'

In [10]:
probs = pipeline.named_steps['classifier'].predict_proba(X_test)
prob_1 = probs[:, 0][MC_iron_mask]
prob_2 = probs[:, 1][MC_iron_mask]
# print(min(prob_1-prob_2))
# print(max(prob_1-prob_2))
# plt.hist(prob_1-prob_2, bins=30, log=True)
plt.hist(prob_1, bins=np.linspace(0, 1, 50), log=True)
plt.hist(prob_2, bins=np.linspace(0, 1, 50), log=True)


Out[10]:
(array([  2.00000000e+00,   3.10000000e+01,   5.24000000e+02,
          7.77000000e+02,   2.02700000e+03,   2.64900000e+03,
          7.03000000e+02,   1.18000000e+02,   3.50000000e+01,
          7.00000000e+00,   2.00000000e+00,   2.00000000e+00,
          1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          2.00000000e+00,   2.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00]),
 array([ 0.        ,  0.02040816,  0.04081633,  0.06122449,  0.08163265,
         0.10204082,  0.12244898,  0.14285714,  0.16326531,  0.18367347,
         0.20408163,  0.2244898 ,  0.24489796,  0.26530612,  0.28571429,
         0.30612245,  0.32653061,  0.34693878,  0.36734694,  0.3877551 ,
         0.40816327,  0.42857143,  0.44897959,  0.46938776,  0.48979592,
         0.51020408,  0.53061224,  0.55102041,  0.57142857,  0.59183673,
         0.6122449 ,  0.63265306,  0.65306122,  0.67346939,  0.69387755,
         0.71428571,  0.73469388,  0.75510204,  0.7755102 ,  0.79591837,
         0.81632653,  0.83673469,  0.85714286,  0.87755102,  0.89795918,
         0.91836735,  0.93877551,  0.95918367,  0.97959184,  1.        ]),
 <a list of 49 Patch objects>)

In [11]:
probs = pipeline.named_steps['classifier'].predict_proba(X_test)
dp1 = (probs[:, 0]-probs[:, 1])[MC_proton_mask]
print(min(dp1))
print(max(dp1))
dp2 = (probs[:, 0]-probs[:, 1])[MC_iron_mask]
print(min(dp2))
print(max(dp2))
fig, ax = plt.subplots()
# plt.hist(prob_1-prob_2, bins=30, log=True)
counts, edges, pathes = plt.hist(dp1, bins=np.linspace(-1, 1, 100), log=True, label='Proton', alpha=0.75)
counts, edges, pathes = plt.hist(dp2, bins=np.linspace(-1, 1, 100), log=True, label='Iron', alpha=0.75)
plt.legend(loc=2)
plt.show()
pipeline.named_steps['classifier'].classes_


-0.181446352644
0.976533857996
-0.101248002643
0.975933857996
Out[11]:
array([0, 1])

In [12]:
print(pipeline.named_steps['classifier'].classes_)
le.inverse_transform(pipeline.named_steps['classifier'].classes_)


[0 1]
Out[12]:
array(['Fe', 'P'], dtype=object)

In [13]:
pipeline.named_steps['classifier'].decision_path(X_test)


Out[13]:
(<14147x694890 sparse matrix of type '<type 'numpy.int64'>'
 	with 19243683 stored elements in Compressed Sparse Row format>,
 array([     0,   7047,  13470,  20773,  27974,  35149,  41448,  48387,
         55696,  62693,  69796,  76533,  83826,  90853,  98064, 105223,
        111478, 118187, 125196, 131995, 138500, 145857, 152970, 159613,
        166858, 174085, 181236, 187877, 195112, 201497, 208524, 215683,
        222988, 230345, 237718, 244321, 251140, 257961, 265336, 272169,
        278984, 285313, 292358, 299691, 306962, 314047, 321310, 328261,
        335644, 342869, 350128, 357317, 364462, 371425, 378714, 385583,
        392688, 399891, 406886, 413735, 420680, 427243, 434242, 441499,
        448432, 455393, 462526, 469699, 476204, 483049, 489802, 496651,
        503748, 510789, 516850, 523781, 531098, 538215, 545404, 552077,
        558406, 564769, 571830, 578987, 586058, 592511, 599776, 607011,
        614402, 620707, 627910, 634999, 642028, 646705, 653084, 659563,
        666242, 673555, 680490, 687781, 694890]))

In [ ]: