In [1]:
%load_ext watermark
%watermark -u -d -v -p numpy,matplotlib,scipy,pandas,sklearn,mlxtend


last updated: 2017-02-14 

CPython 2.7.10
IPython 5.2.2

numpy 1.12.0
matplotlib 2.0.0
scipy 0.18.1
pandas 0.19.2
sklearn 0.18
mlxtend 0.5.0

In [2]:
import sys
sys.path.append('/home/jbourbeau/cr-composition')
print('Added to PYTHONPATH')


Added to PYTHONPATH

In [12]:
%matplotlib inline
from __future__ import division, print_function
from collections import defaultdict
import itertools
import numpy as np
from scipy import interp
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.model_selection import cross_val_score, ShuffleSplit, KFold, StratifiedKFold
from sklearn.cluster import KMeans
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

import composition as comp
import composition.analysis.plotting as plotting

color_dict = {'light': 'C0', 'heavy': 'C1', 'total': 'C2',
              'P': 'C0', 'He': 'C1', 'O': 'C3', 'Fe': 'C4'}

Define analysis free parameters


Whether to train on the grouped 'light' and 'heavy' composition classes or on the individual compositions ('P', 'He', 'O', 'Fe')


In [4]:
comp_class = True
comp_list = ['light', 'heavy'] if comp_class else ['P', 'He', 'O', 'Fe']

Get composition classifier pipeline


In [5]:
pipeline_str = 'xgboost'
pipeline = comp.get_pipeline(pipeline_str)
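
comp.get_pipeline is defined in the cr-composition repo; the sketch below is a hypothetical guess at the kind of object it returns for 'xgboost' (the scaler choice and hyperparameters are assumptions), with the final step named 'classifier' to match the named_steps access used further below.

# Hypothetical sketch only -- the actual scaler and hyperparameters live
# in the cr-composition repo's comp.get_pipeline('xgboost').
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

def get_pipeline_sketch():
    return Pipeline([('scaler', StandardScaler()),
                     ('classifier', XGBClassifier())])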

Define energy binning for this analysis


In [6]:
energybins = comp.analysis.get_energybins()
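
get_energybins is likewise repo-specific; as a purely hypothetical illustration, log-uniform energy binning might look like the following (the actual range and bin width come from comp.analysis.get_energybins):

# Purely hypothetical illustration -- the actual bin edges come from
# comp.analysis.get_energybins(). Log-uniform bins in log10(E/GeV).
log_energy_bins = np.arange(6.0, 8.1, 0.1)  # assumed range and bin width
energy_bins = 10**log_energy_bins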

Data preprocessing


  1. Load simulation/data dataframe and apply specified quality cuts
  2. Extract desired features from dataframe
  3. Get separate testing and training datasets
  4. Feature transformation (a rough sketch of these steps is given below)

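These steps are handled inside comp.preprocess_sim and comp.preprocess_data; the sketch below is a hypothetical outline of the equivalent pandas/scikit-learn operations, with assumed DataFrame column names and cut logic rather than the repo's actual implementation.

# Hypothetical outline of the four preprocessing steps above; the real
# logic lives in comp.preprocess_sim. Column names and cuts are assumed.
from sklearn.model_selection import train_test_split

def preprocess_sketch(df, feature_columns, passes_quality_cuts):
    df = df[passes_quality_cuts(df)]    # 1. apply specified quality cuts
    X = df[feature_columns].values      # 2. extract desired features
    y = df['MC_comp'].values            # assumed true-composition column
    # 3. get separate training and testing datasets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=2)
    # 4. feature transformation is deferred to the pipeline's scaler step
    return X_train, X_test, y_train, y_test
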
In [7]:
sim_train, sim_test = comp.preprocess_sim(comp_class=comp_class, return_energy=True)


sim quality cut event flow:
             IceTopQualityCuts:    1.0    1.0
         lap_InIce_containment:  0.776  0.776
             reco_energy_range:  0.654  0.493
                 num_hits_1_30:  0.996  0.493
                max_qfrac_1_30:  0.998  0.493
              InIceQualityCuts:  0.784  0.486


/home/jbourbeau/cr-composition/composition/dataframe_functions.py:124: RuntimeWarning: divide by zero encountered in log10
  df['log_NChannels_'+i] = np.log10(df['NChannels_'+i])
/home/jbourbeau/cr-composition/composition/dataframe_functions.py:125: RuntimeWarning: divide by zero encountered in log10
  df['log_NHits_'+i] = np.log10(df['NHits_'+i])
Selecting the following features:
	$\cos(\theta_{\mathrm{Lap}})$
	$\log_{10}(S_{\mathrm{125}})$
	$\log_{10}(InIce charge (top 50))$
	Charge/NChannels
	NHits/NChannels
	dE/dX (standard)
	
Number training events = 134262
Number testing events = 57541

In [8]:
data = comp.preprocess_data(comp_class=comp_class, return_energy=True)


data quality cut event flow:
             IceTopQualityCuts:    1.0    1.0
         lap_InIce_containment:    1.0    1.0
             reco_energy_range:    1.0    1.0
                 num_hits_1_30:    1.0    1.0
                max_qfrac_1_30:    1.0    1.0
              InIceQualityCuts:  0.957  0.957


Selecting the following features:
	$\cos(\theta_{\mathrm{Lap}})$
	$\log_{10}(S_{\mathrm{125}})$
	$\log_{10}(InIce charge (top 50))$
	Charge/NChannels
	NHits/NChannels
	dE/dX (standard)
	
Number testing events = 2124113

Run the classifier on both the training and testing sets to gauge the degree of overfitting


In [9]:
clf_name = pipeline.named_steps['classifier'].__class__.__name__
print('=' * 30)
print(clf_name)
pipeline.fit(sim_train.X, sim_train.y)
train_pred = pipeline.predict(sim_train.X)
train_acc = accuracy_score(sim_train.y, train_pred)
print('Training accuracy = {:.2%}'.format(train_acc))
test_pred = pipeline.predict(sim_test.X)
test_acc = accuracy_score(sim_test.y, test_pred)
print('Testing accuracy = {:.2%}'.format(test_acc))
# scores = cross_val_score(
#     estimator=pipeline, X=sim_train.X, y=sim_train.y, cv=3, n_jobs=10)
# print('CV score: {:.2%} (+/- {:.2%})'.format(scores.mean(), scores.std()))
print('=' * 30)


==============================
XGBClassifier
Training accuracy = 78.01%
Testing accuracy = 77.66%
==============================
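
The commented-out cross_val_score call above would give a less split-dependent accuracy estimate than a single train/test comparison; a sketch using the StratifiedKFold imported earlier:

# Sketch: stratified 10-fold cross-validation on the training set, using
# the cross_val_score and StratifiedKFold imported above.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)
cv_scores = cross_val_score(pipeline, sim_train.X, sim_train.y, cv=skf)
print('CV accuracy = {:.2%} (+/- {:.2%})'.format(cv_scores.mean(), cv_scores.std()))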

In [13]:
# Split the training set into two independent halves for later use
splitter = ShuffleSplit(n_splits=1, test_size=.5, random_state=2)
for set1_index, set2_index in splitter.split(sim_train.X):
    sim_train1 = sim_train[set1_index]
    sim_train2 = sim_train[set2_index]

In [35]:
# Unsupervised k-means clustering with four clusters
kmeans = KMeans(n_clusters=4)

In [36]:
# Cluster label assigned to each training event
pred = kmeans.fit_predict(sim_train.X)

In [37]:
# Boolean masks selecting events of each true (MC) composition
MC_comp_mask = {}
for composition in comp_list:
    MC_comp_mask[composition] = sim_train.le.inverse_transform(sim_train.y) == composition
MC_comp_mask


Out[37]:
{'heavy': array([False, False,  True, ..., False, False,  True], dtype=bool),
 'light': array([ True,  True, False, ...,  True,  True, False], dtype=bool)}

In [38]:
light_0 = np.sum(pred[MC_comp_mask['light']] == 0)/np.sum(MC_comp_mask['light'])
light_1 = np.sum(pred[MC_comp_mask['light']] == 1)/np.sum(MC_comp_mask['light'])
print('fraction of light events in cluster 0 = {}'.format(light_0))
print('fraction of light events in cluster 1 = {}'.format(light_1))


fraction of light events in cluster 0 = 0.209510965169
fraction of light events in cluster 1 = 0.712721355694

In [39]:
heavy_0 = np.sum(pred[MC_comp_mask['heavy']] == 0)/np.sum(MC_comp_mask['heavy'])
heavy_1 = np.sum(pred[MC_comp_mask['heavy']] == 1)/np.sum(MC_comp_mask['heavy'])
print('fraction of heavy events in cluster 0 = {}'.format(heavy_0))
print('fraction of heavy events in cluster 1 = {}'.format(heavy_1))


fraction of heavy events in cluster 0 = 0.247993822487
fraction of heavy events in cluster 1 = 0.533506949702
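
Only clusters 0 and 1 are tabulated above, even though four clusters were fit; a short loop covering every cluster and composition, using the objects already defined:

# Fraction of each MC composition landing in each of the four k-means
# clusters, generalizing the two cells above.
for composition in comp_list:
    mask = MC_comp_mask[composition]
    for cluster in range(kmeans.n_clusters):
        frac = np.sum(pred[mask] == cluster) / np.sum(mask)
        print('fraction of {} events in cluster {} = {:.3f}'.format(
            composition, cluster, frac))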
