Author: James Bourbeau
In [1]:
%load_ext watermark
%watermark -u -d -v -p numpy,matplotlib,scipy,pandas,sklearn,mlxtend
In [2]:
from __future__ import division, print_function
from collections import defaultdict
import itertools
import numpy as np
from scipy import optimize, stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn.apionly as sns
import pyprind
import multiprocessing as mp
from sklearn.model_selection import ShuffleSplit
import composition as comp
import composition.analysis.plotting as plotting
# color_dict allows for a consistent color-coding for each composition
color_dict = comp.analysis.get_color_dict()
%matplotlib inline
Whether to train on the 'light' and 'heavy' composition classes or on the individual compositions
In [3]:
comp_class = True
comp_list = ['light', 'heavy'] if comp_class else ['P', 'He', 'O', 'Fe']
Get composition classifier pipeline
In [4]:
pipeline_str = 'GBDT'
pipeline = comp.get_pipeline(pipeline_str)
Define energy binning for this analysis
In [5]:
energybins = comp.analysis.get_energybins()
In [6]:
sim_train, sim_test = comp.preprocess_sim(comp_class=comp_class, return_energy=True)
In [7]:
splitter = ShuffleSplit(n_splits=1, test_size=.5, random_state=2)
for train_index, verification_index in splitter.split(sim_train.X):
    sim_verification = sim_train[verification_index]
    sim_train = sim_train[train_index]
print('Number of training events = {}'.format(len(sim_train)))
print('Number of verification events = {}'.format(len(sim_verification)))
In [17]:
data = comp.preprocess_data(comp_class=comp_class, return_energy=True)
Run the classifier over the training and testing sets to get an idea of the degree of overfitting
In [8]:
pipeline.fit(sim_train.X, sim_train.y)
Out[8]:
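As a quick way to gauge overfitting, the classification accuracy on the training and testing sets can be compared directly. This is a minimal sketch, assuming `sim_test` exposes `X` and `y` attributes like `sim_train` and using scikit-learn's `accuracy_score`:
from sklearn.metrics import accuracy_score

# A large gap between training and testing accuracy indicates overfitting
train_acc = accuracy_score(sim_train.y, pipeline.predict(sim_train.X))
test_acc = accuracy_score(sim_test.y, pipeline.predict(sim_test.X))
print('Training accuracy = {:.3f}'.format(train_acc))
print('Testing accuracy = {:.3f}'.format(test_acc))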
In [9]:
fracs = defaultdict(list)
frac_array = np.arange(0.0, 1.1, 0.1)
for light_frac in frac_array:
    print('On light_frac = {}'.format(light_frac))
    for i in range(1000):
        heavy_frac = 1 - light_frac
        # Build a verification sample with the desired light/heavy mixture
        light_dataset = comp.analysis.get_random_subsample(sim_verification, frac=light_frac, composition='light')
        heavy_dataset = comp.analysis.get_random_subsample(sim_verification, frac=heavy_frac, composition='heavy')
        combined_dataset = light_dataset + heavy_dataset
        # Record the fraction of events reconstructed as light
        pred = pipeline.predict(combined_dataset.X)
        num_pred_light = np.sum(combined_dataset.le.inverse_transform(pred) == 'light')
        frac_light = num_pred_light / len(combined_dataset)
        fracs[light_frac].append(frac_light)
In [18]:
with sns.color_palette('viridis', len(frac_array)):
    fig, ax = plt.subplots()
    for light_frac in frac_array:
        sns.distplot(fracs[light_frac], bins=np.linspace(0.0, 1.0, 100),
                     kde=False, label=str(light_frac), hist_kws={'alpha': 0.75})
    ax.set_xlabel('Reconstructed fraction of light events')
    ax.set_ylabel('Counts')
    ax.set_xlim([0.1, 0.9])
    ax.grid()
    leg = plt.legend(title='Injected fraction\nof light events',
                     loc='center left', frameon=False,
                     bbox_to_anchor=(1.0,  # horizontal
                                     0.5),  # vertical
                     ncol=1, fancybox=False)
    plt.savefig('/home/jbourbeau/public_html/figures/light-frac-reconstructed-hists.png')
    plt.show()
In [23]:
fig, ax = plt.subplots()
medians = []
errs = []
for light_frac in frac_array:
    medians.append(np.median(fracs[light_frac]))
    errs.append(np.std(fracs[light_frac]))
print(medians)
print(errs)
ax.errorbar(frac_array, medians, yerr=errs, marker='.', ls='None')
ax.set_xlabel('Injected fraction of light events')
ax.set_ylabel('Reconstructed fraction\nof light events')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_aspect(1.0)
ax.grid()
plt.savefig('/home/jbourbeau/public_html/figures/light-frac-reconstructed-medians.png')
plt.show()
In [22]:
n_samples = 10000
injected_frac = np.random.ranf(n_samples)
reco_frac = []
bar = pyprind.ProgBar(n_samples)
for light_frac in injected_frac:
    heavy_frac = 1 - light_frac
    light_dataset = comp.analysis.get_random_subsample(sim_verification, frac=light_frac, composition='light')
    heavy_dataset = comp.analysis.get_random_subsample(sim_verification, frac=heavy_frac, composition='heavy')
    combined_dataset = light_dataset + heavy_dataset
    pred = pipeline.predict(combined_dataset.X)
    num_pred_light = np.sum(combined_dataset.le.inverse_transform(pred) == 'light')
    frac_light = num_pred_light / len(combined_dataset)
    reco_frac.append(frac_light)
    bar.update()
In [9]:
def get_reco_frac(dataset, injected_light_fraction, pipeline):
    """Return the reconstructed light fraction for a sample with a given injected light fraction."""
    heavy_frac = 1 - injected_light_fraction
    light_dataset = comp.analysis.get_random_subsample(dataset, frac=injected_light_fraction, composition='light')
    heavy_dataset = comp.analysis.get_random_subsample(dataset, frac=heavy_frac, composition='heavy')
    combined_dataset = light_dataset + heavy_dataset
    pred = pipeline.predict(combined_dataset.X)
    num_pred_light = np.sum(combined_dataset.le.inverse_transform(pred) == 'light')
    frac_light = num_pred_light / len(combined_dataset)
    return frac_light
In [11]:
get_reco_frac(sim_train, 0.1, pipeline)
Out[11]:
In [ ]:
pool = mp.Pool(processes=1)
n_samples = 1
injected_frac = np.random.ranf(n_samples)
print('injected_frac = {}'.format(injected_frac))
results = [pool.apply(get_reco_frac, args=(sim_train, x, pipeline)) for x in injected_frac]
print(results)
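Note that `pool.apply` blocks until each call returns, so the list comprehension above still runs one sample at a time. A sketch of a genuinely parallel version using `apply_async` is below, assuming `get_reco_frac`, `sim_train`, and `pipeline` can be pickled for the worker processes:
pool = mp.Pool(processes=4)
# Submit all samples first, then collect the results as they complete
async_results = [pool.apply_async(get_reco_frac, args=(sim_train, x, pipeline))
                 for x in injected_frac]
reco_frac_parallel = [res.get() for res in async_results]
pool.close()
pool.join()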
In [38]:
# frac_bins is defined here so this cell runs independently of the plotting cell below
frac_bins = np.linspace(0.0, 1.0, 75)
reco_frac_median = stats.binned_statistic(injected_frac, reco_frac, bins=frac_bins, statistic='median')[0]
frac_midpoints = (frac_bins[1:] + frac_bins[:-1]) / 2
slope, intercept = np.polyfit(frac_midpoints, reco_frac_median, 1)

def linefit_response(x):
    return intercept + slope * x

def inverse_response(y):
    return (y - intercept) / slope
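Since `inverse_response` is the algebraic inverse of `linefit_response`, a quick sanity check (a sketch, assuming the fit above has been run) is that composing the two recovers the input:
x_check = np.linspace(0.0, 1.0, 5)
# Should print True: inverse_response undoes linefit_response up to floating-point precision
print(np.allclose(inverse_response(linefit_response(x_check)), x_check))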
In [44]:
pred = pipeline.predict(data.X)
light_mask = sim_train.le.inverse_transform(pred) == 'light'
frac_light = np.sum(light_mask)/pred.shape[0]
print('light fraction = {}'.format(frac_light))
In [47]:
fig, ax = plt.subplots()
frac_bins = np.linspace(0.0, 1.0, 75)
plotting.histogram_2D(injected_frac, reco_frac, bins=frac_bins, ax=ax)
ax.plot(frac_midpoints, linefit_response(frac_midpoints), marker='None',
ls='-', lw=2, color='C1')
ax.axhline(frac_light, marker='None', ls='-.')
ax.axvline(inverse_response(frac_light), marker='None', ls='-.')
print(inverse_response(frac_light))
ax.set_xlabel('Injected fraction of light events')
ax.set_ylabel('Reconstructed fraction of light events')
ax.grid()
plt.savefig('/home/jbourbeau/public_html/figures/light-frac-reconstructed-2d.png')
plt.show()
In [24]:
reco_frac_std = stats.binned_statistic(reco_frac, injected_frac, bins=frac_bins, statistic=np.std)[0]
print(reco_frac_std)
frac_midpoints = (frac_bins[1:] + frac_bins[:-1]) / 2
# Fit a constant to the spread over the central range of reconstructed fractions
def linefit(x, b):
    return b
x = frac_midpoints[(frac_midpoints > 0.3) & (frac_midpoints < 0.7)]
y = reco_frac_std[(frac_midpoints > 0.3) & (frac_midpoints < 0.7)]
popt, pcov = optimize.curve_fit(linefit, x, y)
intercept = popt[0]
print(intercept)
yfit = np.full_like(x, intercept)
In [25]:
fig, ax = plt.subplots()
ax.plot(frac_midpoints, reco_frac_std, marker='.', ls='None', ms=10)
ax.axhline(intercept, marker='None', lw=1, ls=':', color='k')
ax.annotate('{:.4f}'.format(intercept), xy=(0.3, intercept), xytext=(0.4, 0.018),
arrowprops=dict(arrowstyle='-|>', color='black', connectionstyle='arc3,rad=-0.3'), fontsize=8,
bbox=dict(boxstyle='round', fc="white", ec="gray", lw=0.8))
ax.grid()
ax.set_xlabel('Reconstructed fraction of light events')
ax.set_ylabel(r'1$\sigma$ spread in injected fraction')
plt.savefig('/home/jbourbeau/public_html/figures/light-frac-reconstructed-spread.png')
plt.show()