notebook.community

Edit and run



In [63]:

    
from __future__ import division
import os
from collections import defaultdict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

import comptools as comp

color_dict = comp.color_dict

%matplotlib inline



In [8]:

    
config = 'IC86.2012'
num_groups = 4



In [18]:

    
comp_list = comp.get_comp_list(num_groups=num_groups)
energybins = comp.get_energybins(config=config)
log_energy_min = energybins.log_energy_min
log_energy_max = energybins.log_energy_max

# Load simulation training/testing DataFrames
print('Loading simulation training/testing DataFrames...')
df_sim_train, df_sim_test = comp.load_sim(config=config,
                                          log_energy_min=log_energy_min,
                                          log_energy_max=log_energy_max,
                                          test_size=0.5,
                                          verbose=True)









    



Loading simulation training/testing DataFrames...
[########################################] | 100% Completed |  4.8s



In [59]:

    
label_dict = {'reco_log_energy': '$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$',
              'lap_log_energy': '$\log_{10}(E_{\mathrm{Lap}}/\mathrm{GeV})$',
              'log_s50': '$\log_{10}(S_{\mathrm{50}})$',
              'log_s80': '$\log_{10}(S_{\mathrm{80}})$',
              'log_s125': '$\log_{10}(S_{\mathrm{125}})$',
              'log_s180': '$\log_{10}(S_{\mathrm{180}})$',
              'log_s250': '$\log_{10}(S_{\mathrm{250}})$',
              'log_s500': '$\log_{10}(S_{\mathrm{500}})$',
              'lap_rlogl': '$r\log_{10}(l)$',
              'lap_beta': 'lap beta',
              'InIce_log_charge_1_60': 'InIce charge',
              'InIce_log_charge_1_45': 'InIce charge (top 75\%)',
              'InIce_charge_1_30': 'InIce charge (top 50\%)',
              'InIce_log_charge_1_30': '$\log_{10}(InIce charge (top 50))$',
              'InIce_log_charge_1_15': 'InIce charge (top 25\%)',
              'InIce_log_charge_1_6': 'InIce charge (top 10\%)',
              'reco_cos_zenith': '$\cos(\\theta_{\mathrm{reco}})$',
              'lap_cos_zenith': '$\cos(\\theta_{\mathrm{Lap}})$',
              'LLHlap_cos_zenith': '$\cos(\\theta_{\mathrm{Lap}})$',
              'LLHLF_cos_zenith': '$\cos(\\theta_{\mathrm{LLH+COG}})$',
              'lap_chi2': '$\chi^2_{\mathrm{Lap}}/\mathrm{n.d.f}$',
              'NChannels_1_60': 'NChannels',
              'NChannels_1_45': 'NChannels (top 75\%)',
              'NChannels_1_30': 'NChannels (top 50\%)',
              'NChannels_1_15': 'NChannels (top 25\%)',
              'NChannels_1_6': 'NChannels (top 10\%)',
              'log_NChannels_1_30': '$\log_{10}$(NChannels (top 50\%))',
              'StationDensity': 'StationDensity',
              'charge_nchannels_ratio': 'Charge/NChannels',
              'stationdensity_charge_ratio': 'StationDensity/Charge',
              'NHits_1_30': 'NHits',
              'log_NHits_1_30': '$\log_{10}$(NHits (top 50\%))',
              'charge_nhits_ratio': 'Charge/NHits',
              'nhits_nchannels_ratio': 'NHits/NChannels',
              'stationdensity_nchannels_ratio': 'StationDensity/NChannels',
              'stationdensity_nhits_ratio': 'StationDensity/NHits',
              'llhratio': 'llhratio',
              'n_he_stoch_standard': 'Num HE stochastics (standard)',
              'n_he_stoch_strong': 'Num HE stochastics (strong)',
              'eloss_1500_standard': 'dE/dX (standard)',
              'log_dEdX': '$\mathrm{\log_{10}(dE/dX)}$',
              'eloss_1500_strong': 'dE/dX (strong)',
              'num_millipede_particles': '$N_{\mathrm{mil}}$',
              'avg_inice_radius': '$\mathrm{\langle R_{\mu} \\rangle }$',
              'invqweighted_inice_radius_1_60': '$\mathrm{R_{\mu \ bundle}}$',
              'avg_inice_radius_1_60': '$\mathrm{R_{\mu \ bundle}}$',
              'avg_inice_radius_Laputop': '$R_{\mathrm{core, Lap}}$',
              'FractionContainment_Laputop_InIce': '$C_{\mathrm{IC}}$',
              'Laputop_IceTop_FractionContainment': '$C_{\mathrm{IT}}$',
              'max_inice_radius': '$R_{\mathrm{max}}$',
              'invcharge_inice_radius': '$R_{\mathrm{q,core}}$',
              'lap_zenith': 'zenith',
              'NStations': 'NStations',
              'IceTop_charge': 'IT charge',
              'IceTop_charge_175m': 'Signal greater 175m',
              'log_IceTop_charge_175m': '$\log_{10}(Q_{IT, 175})$',
              'IT_charge_ratio': 'IT charge ratio',
              'refit_beta': '$\mathrm{\\beta_{refit}}$',
              'log_d4r_peak_energy': '$\mathrm{\log_{10}(E_{D4R})}$',
              'log_d4r_peak_sigma': '$\mathrm{\log_{10}(\sigma E_{D4R})}$',
              'd4r_N': 'D4R N',
              'median_inice_radius': 'Median InIce'
              }



In [60]:

    
feature_list = ['lap_cos_zenith', 'log_s125', 'log_dEdX', 'avg_inice_radius']
# feature_list = ['lap_cos_zenith', 'log_s125', 'log_dEdX', 'avg_inice_radius', 'StationDensity']
feature_labels = [label_dict[feature] for feature in feature_list]



In [64]:

    
d = pd.DataFrame(df_sim_train, columns=feature_list)
# Compute the correlation matrix
corr = d.corr()
# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

fig, ax = plt.subplots()
sns.heatmap(corr, mask=mask, cmap='RdBu_r', center=0, vmin=-1, vmax=1,
            square=True, xticklabels=feature_labels, yticklabels=feature_labels,
            linewidths=.5, cbar_kws={'label': 'Covariance'}, annot=True, ax=ax)

corr_outfile = os.path.join(comp.paths.figures_dir, 'feature_distributions',
                            'correlation-matrix-{}.png'.format(config))
comp.check_output_dir(corr_outfile)
plt.savefig(corr_outfile)
plt.show()



In [65]:

    
with sns.axes_style(rc={'axes.grid': True}) as _:
# with mpl.rc_context(rc={'text.usetex': False}) as _, sns.axes_style(rc={'axes.grid': True}) as _:
    s125_mask = (df_sim_train['log_s125'] < 1.0) & (df_sim_train['log_s125'] > 0.9)
    g = sns.pairplot(df_sim_train[s125_mask], vars=feature_list,
                     hue='comp_group_{}'.format(num_groups),
                     hue_order=comp_list, 
                     palette=color_dict,
                     diag_kws={'bins': 20, 'alpha': 0.7},
                     plot_kws={'lw': 0, 'alpha': 0.2})
    
    # Make PairPlot lower diagonal
    for i, j in zip(*np.triu_indices_from(g.axes, 1)):
        g.axes[i, j].set_visible(False)
        
    # Set axes labels to use formatted feature_labels
    for idx in range(len(g.axes)):
        g.axes[idx, 0].set_ylabel(feature_labels[idx])
        g.axes[-1, idx].set_xlabel(feature_labels[idx])

    # Modify legend title
    g.fig.get_children()[-1].set_title('MC composition')
    
    # Save figure
    pairplot_outfile = os.path.join(comp.paths.figures_dir, 'feature_distributions',
                                    'pairplot-{}.png'.format(config))
    comp.check_output_dir(pairplot_outfile)
    plt.savefig(pairplot_outfile)



In [ ]:



In [ ]:



In [14]:

    
with mpl.rc_context(rc={'text.usetex': False}):
    for feature in feature_list:
        fig, ax  = plt.subplots()
        df_sim_train[feature].plot(kind='hist', bins=100, ax=ax)
        ax.set_xlabel(feature)
        plt.show()









    




AttributeErrorTraceback (most recent call last)
<ipython-input-14-2048b314cc85> in <module>()
      2     for feature in feature_list:
      3         fig, ax  = plt.subplots()
----> 4         df_sim_train[feature].plot(kind='hist', bins=100, hue='comp_group_2', ax=ax)
      5         ax.set_xlabel(feature)
      6         plt.show()

/home/jbourbeau/.virtualenvs/composition/lib/python2.7/site-packages/pandas/plotting/_core.pyc in __call__(self, kind, ax, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, label, secondary_y, **kwds)
   2501                            colormap=colormap, table=table, yerr=yerr,
   2502                            xerr=xerr, label=label, secondary_y=secondary_y,
-> 2503                            **kwds)
   2504     __call__.__doc__ = plot_series.__doc__
   2505 

/home/jbourbeau/.virtualenvs/composition/lib/python2.7/site-packages/pandas/plotting/_core.pyc in plot_series(data, kind, ax, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, label, secondary_y, **kwds)
   1925                  yerr=yerr, xerr=xerr,
   1926                  label=label, secondary_y=secondary_y,
-> 1927                  **kwds)
   1928 
   1929 

/home/jbourbeau/.virtualenvs/composition/lib/python2.7/site-packages/pandas/plotting/_core.pyc in _plot(data, x, y, subplots, ax, kind, **kwds)
   1727         plot_obj = klass(data, subplots=subplots, ax=ax, kind=kind, **kwds)
   1728 
-> 1729     plot_obj.generate()
   1730     plot_obj.draw()
   1731     return plot_obj.result

/home/jbourbeau/.virtualenvs/composition/lib/python2.7/site-packages/pandas/plotting/_core.pyc in generate(self)
    250         self._compute_plot_data()
    251         self._setup_subplots()
--> 252         self._make_plot()
    253         self._add_table()
    254         self._make_legend()

/home/jbourbeau/.virtualenvs/composition/lib/python2.7/site-packages/pandas/plotting/_core.pyc in _make_plot(self)
   1357             kwds = self._make_plot_keywords(kwds, y)
   1358             artists = self._plot(ax, y, column_num=i,
-> 1359                                  stacking_id=stacking_id, **kwds)
   1360             self._add_legend_handle(artists[0], label, index=i)
   1361 

/home/jbourbeau/.virtualenvs/composition/lib/python2.7/site-packages/pandas/plotting/_core.pyc in _plot(cls, ax, y, style, bins, bottom, column_num, stacking_id, **kwds)
   1335             cls._get_stacked_values(ax, stacking_id, base, kwds['label'])
   1336         # ignore style
-> 1337         n, bins, patches = ax.hist(y, bins=bins, bottom=bottom, **kwds)
   1338         cls._update_stacker(ax, stacking_id, n)
   1339         return patches

/home/jbourbeau/.virtualenvs/composition/lib/python2.7/site-packages/matplotlib/__init__.pyc in inner(ax, *args, **kwargs)
   1715                     warnings.warn(msg % (label_namer, func.__name__),
   1716                                   RuntimeWarning, stacklevel=2)
-> 1717             return func(ax, *args, **kwargs)
   1718         pre_doc = inner.__doc__
   1719         if pre_doc is None:

/home/jbourbeau/.virtualenvs/composition/lib/python2.7/site-packages/matplotlib/axes/_axes.pyc in hist(***failed resolving arguments***)
   6351             if patch:
   6352                 p = patch[0]
-> 6353                 p.update(kwargs)
   6354                 if lbl is not None:
   6355                     p.set_label(lbl)

/home/jbourbeau/.virtualenvs/composition/lib/python2.7/site-packages/matplotlib/artist.pyc in update(self, props)
    900         try:
    901             ret = [_update_property(self, k, v)
--> 902                    for k, v in props.items()]
    903         finally:
    904             self.eventson = store

/home/jbourbeau/.virtualenvs/composition/lib/python2.7/site-packages/matplotlib/artist.pyc in _update_property(self, k, v)
    893                 func = getattr(self, 'set_' + k, None)
    894                 if not callable(func):
--> 895                     raise AttributeError('Unknown property %s' % k)
    896                 return func(v)
    897 

AttributeError: Unknown property hue



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [3]:

    
df, cut_dict = comp.load_sim(type_='sim', config='IT73', return_cut_dict=True)
selection_mask = np.array([True] * len(df))
standard_cut_keys = ['lap_reco_success', 'lap_zenith', 'num_hits_1_30', 'IT_signal',
                     'max_qfrac_1_30', 'lap_containment', 'energy_range_lap']
for key in standard_cut_keys:
    selection_mask *= cut_dict[key]

df = df[selection_mask]

# feature_list, feature_labels = comp.get_training_features()
feature_list = np.array(['lap_log_energy', 'InIce_log_charge_1_30', 'lap_cos_zenith',
                         'log_NChannels_1_30', 'log_s125', 'StationDensity',
                         'charge_nchannels_ratio', 'charge_nhits_ratio',
                         'stationdensity_charge_ratio', 'nchannels_nhits_ratio', 
                         'stationdensity_nchannels_ratio', 'stationdensity_nhits_ratio',
                         'lap_likelihood', 'log_NHits_1_30', 'log_s50', 'log_s80', 
                         'log_s180', 'log_s250', 'log_s500'])
label_dict = {'reco_log_energy': '$\log_{10}(E_{\mathrm{reco}}/\mathrm{GeV})$',
              'lap_log_energy': '$\log_{10}(E_{\mathrm{Lap}}/\mathrm{GeV})$',
              'log_s50': '$\log_{10}(S_{\mathrm{50}})$',
              'log_s80': '$\log_{10}(S_{\mathrm{80}})$',
              'log_s125': '$\log_{10}(S_{\mathrm{125}})$',
              'log_s180': '$\log_{10}(S_{\mathrm{180}})$',
              'log_s250': '$\log_{10}(S_{\mathrm{250}})$',
              'log_s500': '$\log_{10}(S_{\mathrm{500}})$',
              'lap_likelihood': '$r\log_{10}(l)$',
              'InIce_charge_1_30': 'InIce charge (top 50\%)',
              'InIce_log_charge_1_30': '$\log_{10}$(InIce charge (top 50\%))',
              'lap_cos_zenith': '$\cos(\\theta_{\mathrm{Lap}})$',
              'LLHlap_cos_zenith': '$\cos(\\theta_{\mathrm{Lap}})$',
              'LLHLF_cos_zenith': '$\cos(\\theta_{\mathrm{LLH+COG}})$',
              'lap_chi2': '$\chi^2_{\mathrm{Lap}}/\mathrm{n.d.f}$',
              'NChannels_1_30': 'NChannels (top 50\%)',
              'log_NChannels_1_30' : '$\log_{10}$(NChannels (top 50\%))',
              'StationDensity': 'StationDensity',
              'charge_nchannels_ratio': 'Charge/NChannels',
              'stationdensity_charge_ratio': 'StationDensity/Charge', 
              'NHits_1_30': 'NHits',
              'log_NHits_1_30': '$\log_{10}$(NHits (top 50\%))',
              'charge_nhits_ratio': 'Charge/NHits',
              'nchannels_nhits_ratio': 'NChannels/NHits', 
              'stationdensity_nchannels_ratio': 'StationDensity/NChannels',
              'stationdensity_nhits_ratio': 'StationDensity/NHits'
              }
feature_labels = np.array([label_dict[feature] for feature in feature_list])

print('training features = {}'.format(feature_list))
X_train, X_test, y_train, y_test, le = comp.get_train_test_sets(
    df, feature_list, comp_class=True)

print('number training events = ' + str(y_train.shape[0]))
print('number testing events = ' + str(y_test.shape[0]))









    



/home/jbourbeau/cr-composition/composition/load_dataframe.py:88: RuntimeWarning: divide by zero encountered in log10
  df['log_NChannels_1_30'] = np.nan_to_num(np.log10(df['NChannels_1_30']))
/home/jbourbeau/cr-composition/composition/load_dataframe.py:89: RuntimeWarning: divide by zero encountered in log10
  df['log_NHits_1_30'] = np.nan_to_num(np.log10(df['NHits_1_30']))






    



training features = ['lap_log_energy' 'InIce_log_charge_1_30' 'lap_cos_zenith'
 'log_NChannels_1_30' 'log_s125' 'StationDensity' 'charge_nchannels_ratio'
 'charge_nhits_ratio' 'stationdensity_charge_ratio' 'nchannels_nhits_ratio'
 'stationdensity_nchannels_ratio' 'stationdensity_nhits_ratio'
 'lap_likelihood' 'log_NHits_1_30' 'log_s50' 'log_s80' 'log_s180'
 'log_s250' 'log_s500']
number training events = 145932
number testing events = 62543



In [4]:

    
d = pd.DataFrame(df, columns=feature_list)
# Compute the correlation matrix
corr = d.corr()
# corr = d.corr().values
# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

fig, ax = plt.subplots()
sns.heatmap(corr, mask=mask, cmap='RdBu_r', center=0,
            square=True, xticklabels=feature_labels, yticklabels=feature_labels,
            linewidths=.5, cbar_kws={'label': 'Covariance'}, annot=False, ax=ax)
plt.show()









    



/home/jbourbeau/.local/lib/python2.7/site-packages/seaborn/matrix.py:143: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if xticklabels == []:
/home/jbourbeau/.local/lib/python2.7/site-packages/seaborn/matrix.py:151: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if yticklabels == []:



In [5]:

    
d.var()









    Out[5]:





lap_log_energy                      0.260772
InIce_log_charge_1_30               0.278172
lap_cos_zenith                      0.001151
log_NChannels_1_30                  0.053439
log_s125                            0.302581
StationDensity                      0.016176
charge_nchannels_ratio            181.702621
charge_nhits_ratio                  0.523746
stationdensity_charge_ratio         0.000008
nchannels_nhits_ratio               0.010576
stationdensity_nchannels_ratio      0.000042
stationdensity_nhits_ratio          0.000007
lap_likelihood                      5.191823
log_NHits_1_30                      0.167208
log_s50                             0.317268
log_s80                             0.308305
log_s180                            0.299919
log_s250                            0.299073
log_s500                            0.302115
dtype: float64



In [6]:

    
feature_list, feature_labels = comp.get_training_features()
print('training features = {}'.format(feature_list))
num_features = len(feature_list)
X_train, X_test, y_train, y_test, le = comp.get_train_test_sets(df, feature_list)









    



training features = ['lap_log_energy', 'InIce_log_charge_1_30', 'lap_cos_zenith', 'NChannels_1_30', 'log_s125', 'lap_likelihood']



In [6]:

    
pipeline = comp.get_pipeline('RF')
pipeline.fit(X_train, y_train)
scaler = pipeline.named_steps['scaler']
clf = pipeline.named_steps['classifier']
clf_name = clf.__class__.__name__



In [7]:

    
test_predictions = pipeline.predict(X_test)
test_acc = accuracy_score(y_test, test_predictions)
print('Test accuracy: {:.4%}'.format(test_acc))
train_predictions = pipeline.predict(X_train)
train_acc = accuracy_score(y_train, train_predictions)
print('Train accuracy: {:.4%}'.format(train_acc))









    



Test accuracy: 73.7285%
Train accuracy: 73.9248%



In [8]:

    
importances = pipeline.named_steps['classifier'].feature_importances_
indices = np.argsort(importances)[::-1]

fig, ax = plt.subplots()
# feature_labels = np.array(['$\\log_{10}({\mathrm{E/GeV})$', 'InIce charge',
#                            '$\cos(\\theta)$', '$\mathrm{Laputop}\ \chi^2/\mathrm{n.d.f.}$', 'NChannels'])
for f in range(num_features):
    print('{}) {}'.format(f + 1, importances[indices[f]]))

plt.ylabel('Feature Importances')
plt.bar(range(num_features),
        importances[indices],
        align='center')

plt.xticks(range(num_features),
           feature_labels[indices], rotation=90)
plt.xlim([-1, len(feature_list)])
plt.show()









    



1) 0.381360127017
2) 0.265909889116
3) 0.14942532069
4) 0.115531686641
5) 0.0877729765361



In [15]:

    
corr = d.corr()



In [16]:

    
df.MC_log_energy.plot(kind='hist', xlim=[6.2, 8], bins=np.arange(6.2, 8, 0.05))
plt.xlabel('$\\log_{10}({\mathrm{E/GeV})$')









    Out[16]:





<matplotlib.text.Text at 0x6c31e50>



In [17]:

    
df.reco_log_energy.plot(kind='hist', xlim=[6.2, 8], bins=np.arange(6.2, 8, 0.05))
plt.xlabel('$\\log_{10}({\mathrm{E/GeV})$')









    Out[17]:





<matplotlib.text.Text at 0x93e9950>



In [18]:

    
fig, ax = plt.subplots()
df.NChannels[df.MC_comp == 'P'].plot(kind='hist', bins=np.linspace(0, 1200, 50), alpha=0.5, label='P')
df.NChannels[df.MC_comp == 'Fe'].plot(kind='hist', bins=np.linspace(0, 1200, 50), alpha=0.5, label='Fe')
plt.legend()









    Out[18]:





<matplotlib.legend.Legend at 0x8975150>



In [13]:

    
fig, ax = plt.subplots()
df.lap_likelihood[df.MC_comp == 'P'].plot(kind='hist', bins=np.linspace(-30, 0, 75), alpha=0.5, label='P', log=True)
df.lap_likelihood[df.MC_comp == 'He'].plot(kind='hist', bins=np.linspace(-30, 0, 75), alpha=0.5, label='He')
df.lap_likelihood[df.MC_comp == 'Fe'].plot(kind='hist', bins=np.linspace(-30, 0, 75), alpha=0.5, label='Fe')
plt.legend()









    Out[13]:





<matplotlib.legend.Legend at 0xba8c990>



In [10]:

    
fig, ax = plt.subplots()
df.lap_ndf[df.MC_comp == 'P'].plot(kind='hist', bins=np.linspace(0, 300, 100), alpha=0.5, label='P', log=True)
df.lap_ndf[df.MC_comp == 'He'].plot(kind='hist', bins=np.linspace(0, 300, 100), alpha=0.5, label='He')
df.lap_ndf[df.MC_comp == 'Fe'].plot(kind='hist', bins=np.linspace(0, 300, 100), alpha=0.5, label='Fe')
plt.legend()









    Out[10]:





<matplotlib.legend.Legend at 0xbf45d10>



In [ ]: