In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
import shap
from shap.plots import colors  # provides the red_blue colormap used by the custom summary plot
# load JS visualization code to notebook
shap.initjs()
In [2]:
# per-lumisection features, quality labels, and run/lumi metadata for the JetHT dataset
data_features = pd.read_hdf('/Users/OlgaKo/Desktop/from_lab14/data_features_JetHT.hdf5', "data").reset_index(drop=True)
labels = 1 - pd.read_hdf('/Users/OlgaKo/Desktop/from_lab14/labels_JetHT.hdf5', 'labels').reset_index(drop=True)  # flip so that 1 = good lumisection
sublabels = pd.read_hdf('/Users/OlgaKo/Desktop/from_lab14/data_not_f_JetHT.hdf5', "data").reset_index(drop=True)
In [3]:
data_features.shape, sublabels.shape
Out[3]:
In [61]:
dict_causes = {
273150:" [[61, 64], [66, 75]], Ecal excluded",
273296:" [[1, 94]], Pixel excluded",
273426:" [[1, 59], [64, 64], [66, 69]], inefficiency in EE",
273445:" [[5, 6]], tracker off seen by many systems",
274142:" [[99, 100]], pixel and tracker off",
274157:" [[103, 534]], EB minus has region with low efficiency",
274282:" [[86, 88]], tracker is off --> particle flow objects are affected",
275326:" [[1, 169]], sistrip excl",
275757:" [[104, 122]], low DCS: HBHE. nothing on PF plot",
275758:" [[1, 4]], part of hcal not available --> JetMET reconstruction suffers",
275764:" [[1, 31]], pixel off",
275766:" [[1, 23], [25, 60]], lower tracker efficiency",
275768:" [[1, 79]], lower tracker efficiency",
275769:" [[1, 22]], pixel not in Data Acquisition",
275781:" [[1, 29]], EB+16, +17, +18 are excluded due to a problem with VME crate S206h",
275783:" [[1, 1634]], strip EXCL",
275838:" [[1, 51]], strip EXCL",
275922:" [[4, 6]], low stat",
276064:" [[1, 22]], hot trigger tower seen in EE-07 (high occupancy and Et TP) which causes ~100% deadtime",
276071:" [[1, 22]], strip EXCL",
276095:" [[1, 5]], low stat",
276237:" [[1, 5]], ECAL, HCAL, PIXEL excluded",
276453:" [[1, 8], [10, 125]], EB-17 (FED 626): was excluded (because of cooling failure)",
276455:" [[1, 401]], strip EXCL",
276456:" [[1, 182]], strip EXCL",
276457:" [[1, 19]], sistrip not in DAQ",
277217:" [[12, 14]], Short collision run with strip in error in DCS and pixel in error in DAQ. For physically meaningful lumisections 33-47, the total rate is zero. L1T flags marked as bad due to this",
277933:" [[1, 42]], ECAL EE+09 FED 648 removed from all LS in the run because it was causing 100% dead time. EE+09 was not masked, so all LS in this run are bad",
278309:" [[1, 10]], EE-04 FED 607 TT disabled in LS [1-10] according to express dataset (LS# 10)",
278821:" [[1, 33], [36, 37]], FED652 in error, EE+04 is off",
279028:" [[1, 17]], strip EXCL",
279995:" [[1, 8]], hcal water cooling issues",
280002:" [[1, 111]], ecal excluded",
280006:" [[1, 68]], EB-11 token ring missing",
280007:" [[1, 36]], Low voltage channel broken in EB",
280239:" [[1, 9]], strip EXCL",
280241:" [[1, 11]], strip EXCL",
281663:" [[61, 172]], Timing shifted by 5ns due to TCDS* problem ",
281674:" [[1, 45]], Timing shifted by 5ns due to TCDS* problem",
281680:" [[1, 31]], Timing shifted by 5ns due to TCDS* problem",
281974:" [[79, 85]], pixel High voltage OFF",
282408:" [[80, 191]], strip EXCL",
282707:" [[79, 81]], pixel High voltage OFF",
282796:" [[80, 82]], pixel High Voltage off",
282921:" [[1, 152]], strip EXCL"
}
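Before relying on these annotations downstream, a quick sanity check (a minimal sketch, assuming `sublabels` from cell 2 is loaded) that the annotated runs are actually present in the data:

runs_in_data = set(sublabels['runId'].unique())
missing = sorted(set(dict_causes) - runs_in_data)
print(len(dict_causes) - len(missing), 'annotated runs found; missing:', missing)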
In [5]:
# sequential 80/20 split: first 80% of lumisections for training, last 20% for testing
indx_train = np.arange(data_features.shape[0] - int(data_features.shape[0] / 5), dtype='int32')
indx_test = np.arange(data_features.shape[0] - int(data_features.shape[0] / 5), data_features.shape[0], dtype='int32')
In [6]:
# lumisections belonging to runs with a documented failure cause
indx_known = sublabels[sublabels["runId"].isin(dict_causes.keys())].index.tolist()
ids_known = sublabels[['runId', 'lumiId']].iloc[indx_known]
In [7]:
# drop the lumis with known causes from the training set (set lookup for speed)
indx_known_set = set(indx_known)
indx_train_minus = [s for s in indx_train if s not in indx_known_set]
In [8]:
num_good = np.sum(labels)
num_bad = len(labels) - np.sum(labels)
# balance the classes: each sample is weighted by 0.5 / (size of its own class)
weights = 0.5 / np.where(labels == 1.0, num_good, num_bad)
weights *= len(labels)
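A one-off check (a sketch, assuming the cells above have run) that the reweighting behaves as intended: after multiplying by `len(labels)`, each class should carry about half of the total weight, so both printed values should be close to 0.5.

labels_arr = np.array(labels)
print(weights[labels_arr == 1].sum() / len(labels))  # good lumis, expect ~0.5
print(weights[labels_arr == 0].sum() / len(labels))  # bad lumis, expect ~0.5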
In [9]:
y_train = np.array(labels.iloc[indx_train_minus], 'float32')
y_known = np.array(labels.iloc[indx_known], 'float32')
y_test = np.array(labels.iloc[indx_test], 'float32')
X_train = np.array(data_features.iloc[indx_train_minus], 'float32')
X_known = np.array(data_features.iloc[indx_known], 'float32')
X_test = np.array(data_features.iloc[indx_test], 'float32')
weights_train = weights[indx_train_minus]
weights_test = weights[indx_test]
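Note that the known-cause lumis are only removed from the training set, so some may still sit in the test region. A minimal consistency check (a sketch, assuming the cells above ran in order) that at least the training set is clean:

assert not set(indx_train_minus) & set(indx_known)  # no known-cause lumis in training
print(X_train.shape, X_known.shape, X_test.shape)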
In [10]:
feature_names = data_features.columns
In [11]:
Muon_features = [s for s in feature_names if (s[:3] == 'qMu')]# and (s[3:7] != 'Cosm')]
Pho_features = [s for s in feature_names if s[:4] == 'qPho']
Cal_features = [s for s in feature_names if s[:4] == 'qCal']
PF_features = [s for s in feature_names if s[:3] == 'qPF']
channels_features = dict()
channels_features['muons'] = Muon_features
channels_features['photons'] = Pho_features
channels_features['PF'] = PF_features
channels_features['calo'] = Cal_features
In [12]:
num_fch = dict()
for g, fs in channels_features.items():
    num_fch[g] = len(fs)
In [13]:
num_fch
Out[13]:
In [14]:
vmins = []
vmaxs = []
for f in data_features.columns:
    values = data_features[f].values
    # trim the color range to the 5th-95th percentiles, but prevent it from collapsing
    vmin = np.nanpercentile(values, 5)
    vmax = np.nanpercentile(values, 95)
    if vmin == vmax:
        vmin = np.nanpercentile(values, 1)
        vmax = np.nanpercentile(values, 99)
        if vmin == vmax:
            vmin = np.min(values)
            vmax = np.max(values)
    vmins.append(vmin)
    vmaxs.append(vmax)
In [51]:
vmins = np.array(vmins)
vmaxs = np.array(vmaxs)
In [15]:
# reference sample: 100 randomly chosen good lumisections from the test set
X_ref = X_test[np.random.choice(np.where(y_test == 1)[0], size=100, replace=False)]
In [16]:
import xgboost as xgb
xgb_clf = xgb.XGBClassifier(n_estimators=80, max_depth=10, learning_rate=0.1, silent=False)
xgb_clf.fit(X_train, y_train)
Out[16]:
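The weights computed earlier are never used below; `xgb_clf.fit` also accepts `sample_weight=weights_train`, and one hedged way to use the test weights would be a weighted AUC on the held-out 20% (a sketch, assuming scikit-learn is installed):

from sklearn.metrics import roc_auc_score
proba_test = xgb_clf.predict_proba(X_test)[:, 1]  # predicted probability of the 'good' class
print('weighted test AUC:', roc_auc_score(y_test, proba_test, sample_weight=weights_test))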
In [18]:
model = xgb_clf
In [22]:
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_known)
shap_ref = explainer.shap_values(X_ref)
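As a baseline for the custom plot defined in the next cell, the library's built-in beeswarm can be drawn with the public `shap.summary_plot` API; the column slice is a hedge for older shap releases that append a bias column to the returned matrix (the same reason the loop further down slices with `[:, :-1]`):

shap.summary_plot(shap_values[:, :X_known.shape[1]], X_known, feature_names=list(feature_names))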
In [69]:
def summary_plot_custom(shap_values, features=None, feature_names=None, max_display=20,
                        vmins=None, vmaxs=None, return_order=True,
                        axis_color="#333333", title=None, alpha=1, show=True, sort=True,
                        color_bar=True, auto_size_plot=True):
    num_features = shap_values.shape[1]
    if sort:
        # order features by the sum of their effect magnitudes
        feature_order = np.argsort(np.sum(np.abs(shap_values), axis=0))
        feature_order = feature_order[-min(max_display, len(feature_order)):]
    else:
        feature_order = np.arange(min(max_display, num_features))
    row_height = 0.4
    if auto_size_plot:
        pl.gcf().set_size_inches(8, len(feature_order) * row_height + 1.5)
    pl.axvline(x=0, color="#999999", zorder=-1)
    for pos, i in enumerate(feature_order):
        pl.axhline(y=pos, color="#cccccc", lw=0.5, dashes=(1, 5), zorder=-1)
        shaps = shap_values[:, i]
        values = features[:, i]
        N = len(shaps)
        # bin the SHAP values, then stack points within a bin to get the beeswarm shape
        nbins = 100
        quant = np.round(nbins * (shaps - np.min(shaps)) / (np.max(shaps) - np.min(shaps) + 1e-8))
        inds = np.argsort(quant + np.random.randn(N) * 1e-6)
        layer = 0
        last_bin = -1
        ys = np.zeros(N)
        for ind in inds:
            if quant[ind] != last_bin:
                layer = 0
            ys[ind] = np.ceil(layer / 2) * ((layer % 2) * 2 - 1)
            layer += 1
            last_bin = quant[ind]
        ys *= 0.9 * (row_height / np.max(ys + 1))
        # color by feature value, clipped to the precomputed [vmin, vmax] range
        vmin = vmins[i]
        vmax = vmaxs[i]
        nan_mask = np.isnan(values)
        pl.scatter(shaps[nan_mask], pos + ys[nan_mask], color="#777777", vmin=vmin,
                   vmax=vmax, s=16, alpha=alpha, linewidth=0,
                   zorder=3, rasterized=len(shaps) > 500)
        pl.scatter(shaps[np.invert(nan_mask)], pos + ys[np.invert(nan_mask)],
                   cmap=colors.red_blue, vmin=vmin, vmax=vmax, s=16,
                   c=values[np.invert(nan_mask)], alpha=alpha, linewidth=0,
                   zorder=3, rasterized=len(shaps) > 500)
    import matplotlib.cm as cm
    m = cm.ScalarMappable(cmap=colors.red_blue)
    m.set_array([0, 1])
    cb = pl.colorbar(m, ax=pl.gca(), ticks=[0, 1], aspect=1000)
    cb.set_ticklabels(['feature value low', 'feature value high'])
    cb.set_label('feature value', size=12, labelpad=0)
    cb.ax.tick_params(labelsize=11, length=0)
    cb.set_alpha(1)
    cb.outline.set_visible(False)
    bbox = cb.ax.get_window_extent().transformed(pl.gcf().dpi_scale_trans.inverted())
    cb.ax.set_aspect((bbox.height - 0.9) * 20)
    pl.gca().xaxis.set_ticks_position('bottom')
    pl.gca().yaxis.set_ticks_position('none')
    pl.gca().spines['right'].set_visible(False)
    pl.gca().spines['top'].set_visible(False)
    pl.gca().spines['left'].set_visible(False)
    pl.gca().tick_params(color=axis_color, labelcolor=axis_color)
    pl.yticks(range(len(feature_order)), [feature_names[i] for i in feature_order], fontsize=13)
    pl.gca().tick_params('x', labelsize=11)
    pl.ylim(-1, len(feature_order))
    pl.xlabel('SHAP value', fontsize=13)
    if show:
        pl.show()
    if return_order:
        return feature_order
In [70]:
for run in dict_causes.keys():
    if run in sublabels['runId'].values:
        print('')
        print(run, dict_causes[run])
        print('')
        inds = np.where(ids_known['runId'].values == run)[0]
        print("sum of the features' influences over all lumis, 20 most important")
        print('')
        sum_shap = np.sum(shap_values[inds, :-1], axis=0)
        # ascending sort: the most negative SHAP values push the prediction toward 'bad'
        print(feature_names[np.argsort(sum_shap, axis=0)][:20])
        print('')
        print("the most common features with big SHAP values")
        print('')
        # per lumi, take the 50 most negative SHAP values, then keep the features
        # that show up in the top 50 of every lumi in this run
        top_sh_values = np.argsort(shap_values[inds, :-1], axis=1)[:, :50].T.reshape(-1)
        unique, first_index, counts = np.unique(
            top_sh_values,
            return_counts=True, return_index=True)
        print(feature_names[top_sh_values[np.sort(first_index[counts == len(inds)])]])
        print('')
        print('summary plot')
        order = summary_plot_custom(shap_values[inds, :], features=X_known[inds, :], feature_names=feature_names,
                                    vmins=vmins, vmaxs=vmaxs, return_order=True, sort=True)
        print('reference plot for good lumis')
        summary_plot_custom(shap_ref[:, order], features=X_ref[:, order], feature_names=feature_names[order],
                            vmins=vmins[order], vmaxs=vmaxs[order], return_order=False, sort=False)
        # average SHAP value per detector channel, lumi by lumi
        rows = []
        for i in shap_values[inds, :-1]:
            pred_for_channel = {}
            for channel in channels_features:
                current_sum = 0
                for feature in channels_features[channel]:
                    current_sum += i[list(feature_names).index(feature)]
                pred_for_channel[channel] = current_sum / num_fch[channel]
            rows.append(pred_for_channel)
        channels_shap = pd.DataFrame(rows)
        print('SHAP per channel per lumi')
        print(channels_shap)
        print(" ")
        print('SHAP per channel per run')
        print(channels_shap.sum())
        print(" ")
        print("_" * 100)
    else:
        print(str(run) + ' is not found')