In [1]:
import os
from itertools import cycle
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib  # sklearn.externals.joblib was deprecated and removed in scikit-learn 0.23
from sklearn.metrics import zero_one_loss
from sklearn.model_selection import validation_curve
import comptools as comp
from submit_feature_scan import scan_features
color_dict = comp.color_dict
%matplotlib inline
In [2]:
config = 'IC86.2012'
num_groups = 2
# pipeline = 'BDT'
pipeline = 'xgboost'
energybins = comp.get_energybins()
comp_list = comp.get_comp_list(num_groups=num_groups)
nominal_features, _ = comp.get_training_features()
nominal_features
Out[2]:
In [3]:
pipeline_str = '{}_comp_{}_{}-groups'.format(pipeline, config, num_groups)
pipeline_str
Out[3]:
In [4]:
pattern = os.path.join(os.getcwd(),
'feature_scan_results',
'{}_{}-groups-*'.format(pipeline, num_groups))
files = glob.glob(pattern)
records = [joblib.load(results_file) for results_file in files]
results = pd.DataFrame.from_records(records)
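If no feature scan results have been generated yet, the glob pattern matches nothing and the DataFrame above comes out empty. A minimal guard (a sketch, not in the original workflow) makes that failure mode explicit:
if not files:
    raise IOError('No feature scan results found matching {}'.format(pattern))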
Add a column with the number of training features used in each scan
In [5]:
results['num_features'] = results.features.apply(len)
In [6]:
results.head()
Out[6]:
In [8]:
def format_label(label, labels):
    """Collapse all entries of `labels` containing `label` into a single '\\{label\\}' entry."""
    # If no entry contains the label, return the input unchanged
    has_label = any(label in l for l in labels)
    if not has_label:
        return labels
    # Keep the labels that don't contain `label`, then append the collapsed form
    kept = [l for l in labels if label not in l]
    formatted_label = '\\{' + label + '\\}'
    return kept + [formatted_label]
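A quick check of the collapsing behavior, using hypothetical label names:
# Two NChannels variants plus log_s125 (made-up labels for illustration)
format_label('NChannels', ['NChannels (top)', 'NChannels (bottom)', 'log_s125'])
# Expected: ['log_s125', '\\{NChannels\\}']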
In [9]:
has_random_features = results.features.apply(lambda x: 'random' in x).values
results_no_random = results.loc[~has_random_features].reset_index(drop=True)
In [11]:
cwd = os.getcwd()
linestyles = cycle(['-', '-.', '--', ':'])
plot_features = [('lap_cos_zenith', 'log_s125', 'IceTopLLHRatio'),
                 ('lap_cos_zenith', 'log_s125', 'log_dEdX'),
                 ('lap_cos_zenith', 'log_s125', 'log_dEdX', 'IceTopLLHRatio'),
                 ]
results_plot = (results_no_random.loc[results_no_random['features'].isin(plot_features), :]
                .reset_index(drop=True))
# One sequential palette per composition group, indexed by scan result
color_dict_model = {}
color_dict_model['light'] = sns.color_palette('Blues', len(results_plot) + 1).as_hex()[::-1]
color_dict_model['intermediate'] = sns.color_palette('Reds', len(results_plot) + 1).as_hex()[::-1]
color_dict_model['heavy'] = sns.color_palette('Oranges', len(results_plot) + 1).as_hex()[::-1]
color_dict_model['PPlus'] = sns.color_palette('Blues', len(results_plot) + 1).as_hex()[::-1]
color_dict_model['O16Nucleus'] = sns.color_palette('Reds', len(results_plot) + 1).as_hex()[::-1]
color_dict_model['He4Nucleus'] = sns.color_palette('Purples', len(results_plot) + 1).as_hex()[::-1]
color_dict_model['Fe56Nucleus'] = sns.color_palette('Oranges', len(results_plot) + 1).as_hex()[::-1]
fig, axarr = plt.subplots(ncols=len(comp_list), figsize=(12, 5), sharex=True, sharey=True)
for idx_row, row in results_plot.iterrows():
    if 'random' in row['features']:
        continue
    print(len(row['features']))
    print(row['features'])
    print(row['best_params'])
    ls = next(linestyles)
    for idx, (composition, ax) in enumerate(zip(comp_list, axarr.flat)):
        # Collapse families of related feature labels into single legend entries
        labels = row['feature_labels']
        formatted_labels = format_label('NChannels', labels)
        formatted_labels = format_label('NHits', formatted_labels)
        formatted_labels = format_label('IT Q', formatted_labels)
        # Add accuracy vs. energy for this composition to the plot
        comp.plot_steps(energybins.log_energy_bins,
                        row['acc_mean_{}'.format(composition)],
                        yerr=row['acc_std_{}'.format(composition)],
                        color=color_dict_model[composition][idx_row],
                        ls=ls,
                        label='\n '.join(formatted_labels),
                        ax=ax)
        ax.set_xlim(6.4, 7.9)
        ax.set_ylim(0, 1)
        ax.set_xlabel(r'$\mathrm{\log_{10}(E/GeV)}$')
        if idx == 0:
            ax.set_ylabel('Accuracy')
        ax.set_title(composition)
        ax.grid(lw=0.8)
        if idx == len(comp_list) - 1:
            leg = ax.legend(title='Training features',
                            loc='center left',
                            ncol=1,
                            bbox_to_anchor=(1, 0.5),
                            fontsize=10)
            # Thicken the legend lines so the linestyles are distinguishable
            for legobj in leg.legendHandles:
                legobj.set_linewidth(2.0)
outfile = os.path.join(comp.paths.figures_dir,
                       'model_evaluation',
                       'new_features',
                       '{}_num_groups-{}.png'.format(config, num_groups))
comp.check_output_dir(outfile)
plt.savefig(outfile)
plt.show()
In [54]:
def plot_feature_importances(feature_importances, feature_labels):
    """Bar plot of feature importances, sorted from most to least important."""
    num_features = len(feature_labels)
    importances = feature_importances
    indices = np.argsort(importances)[::-1]
    fig, ax = plt.subplots()
    # Print the ranked importances alongside their feature labels
    for f in range(num_features):
        print('{}) {}: {}'.format(f + 1, feature_labels[indices[f]], importances[indices[f]]))
    plt.ylabel('Feature Importances')
    plt.bar(range(num_features),
            importances[indices],
            align='center')
    plt.xticks(range(num_features),
               feature_labels[indices],
               rotation=90)
    plt.xlim([-1, num_features])
    ax.grid(axis='y')
    plt.show()
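For example, with made-up importance values (hypothetical, just to exercise the function):
demo_importances = np.array([0.55, 0.30, 0.10, 0.05])
demo_labels = np.array(['log_s125', 'log_dEdX', 'lap_cos_zenith', 'random'])
plot_feature_importances(demo_importances, demo_labels)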
In [55]:
# Plot the fitted classifier's feature importances for each scan result
for idx, row in results.iterrows():
    importances = row['pipeline'].named_steps['classifier'].feature_importances_
    feature_labels = np.array(row['feature_labels'])
    plot_feature_importances(importances, feature_labels)
In [27]:
# Same ranked-importance plot for a single scan result; `results` is a
# DataFrame, so select one row rather than indexing the whole frame
row = results.iloc[0]
feature_labels = np.array(row['feature_labels'])
importances = row['pipeline'].named_steps['classifier'].feature_importances_
plot_feature_importances(importances, feature_labels)
In [40]:
df_sim_train, df_sim_test = comp.load_sim(config=config,
                                          log_energy_min=energybins.log_energy_min,
                                          log_energy_max=energybins.log_energy_max,
                                          test_size=0.5,
                                          verbose=True)
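As a quick sanity check on the split, inspect the sizes and the class balance of the training targets (the same comp_target column the later cells rely on):
print('train/test sizes: {}/{}'.format(len(df_sim_train), len(df_sim_test)))
print(df_sim_train['comp_target_{}'.format(num_groups)].value_counts())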
In [75]:
# `result` was never defined above; assume a single row of the scan results
# (hypothetical choice: the first row)
result = results.iloc[0]
X = df_sim_train.loc[:, list(result['features'])]
In [77]:
# Append a column of uniform-random noise to serve as a baseline feature
np.random.seed(2)
X['random'] = np.random.random(size=len(df_sim_train))
X = X.values
In [78]:
result
Out[78]:
In [80]:
y = df_sim_train.loc[:, 'comp_target_{}'.format(num_groups)].values
y
Out[80]:
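The injected 'random' column gives a useful baseline: a reasonable model should assign it near-zero importance relative to the physics features. A minimal sketch, assuming the stored pipeline can be refit on X and y:
from sklearn.base import clone

pipe = clone(result['pipeline'])
pipe.fit(X, y)
importances = pipe.named_steps['classifier'].feature_importances_
# The last column of X is the injected random feature
for name, imp in zip(list(result['features']) + ['random'], importances):
    print('{}: {:.3f}'.format(name, imp))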
In [83]:
train_scores, test_scores = validation_curve(result['pipeline'], X, y,
                                             param_name='classifier__max_depth',
                                             param_range=np.arange(1, 11),
                                             cv=3,
                                             scoring='accuracy',
                                             n_jobs=1,
                                             verbose=1)
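validation_curve returns arrays of shape (n_params, n_cv_folds), so fold means and spreads summarize the bias/variance trade-off across max_depth. A sketch:
param_range = np.arange(1, 11)
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

fig, ax = plt.subplots()
ax.errorbar(param_range, train_mean, yerr=train_std, marker='.', label='Training')
ax.errorbar(param_range, test_mean, yerr=test_std, marker='.', label='Validation')
ax.set_xlabel('classifier__max_depth')
ax.set_ylabel('Accuracy')
ax.grid()
ax.legend()
plt.show()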
In [88]:
params = np.arange(1, 11)
df_cv = comp.cross_validate_comp(
    df_sim_train, df_sim_test, pipeline_str,
    param_name='max_depth', param_values=params,
    feature_list=list(result['features']),
    target='comp_target_{}'.format(num_groups),
    scoring=zero_one_loss, num_groups=num_groups,
    n_splits=10, verbose=True,
    n_jobs=1)
# n_jobs=min(len(params), 15))
In [11]:
# Grab the trained classifier itself, not its feature_importances_ array
clf = result['pipeline'].named_steps['classifier']
clf.feature_importances_
Out[11]:
In [12]:
clf.classes_
Out[12]:
In [14]:
clf.__dict__
Out[14]: