Prepearing data


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
import shap

import colors
# load JS visualization code to notebook
shap.initjs()



In [2]:
data_features = pd.read_hdf('/Users/OlgaKo/Desktop/from_lab14/data_features_JetHT.hdf5', "data").reset_index(drop=True)
labels = 1-pd.read_hdf('/Users/OlgaKo/Desktop/from_lab14/labels_JetHT.hdf5', 'labels').reset_index(drop=True)
sublabels = pd.read_hdf('/Users/OlgaKo/Desktop/from_lab14/data_not_f_JetHT.hdf5', "data").reset_index(drop=True)

In [3]:
data_features.shape, sublabels.shape


Out[3]:
((162990, 2470), (162990, 20))

some runs with given cause of anomaly


In [61]:
dict_causes = {
 273150:" [[61, 64], [66, 75]],     Ecal excluded",
 273296:" [[1, 94]],    Pixe excluded",
 273426:" [[1, 59], [64, 64], [66, 69]],inefficeincy in EE",
 273445:" [[5, 6]],  tracker off  seen by many systems",
 274142:" [[99, 100]],  pixel and tracker off",
 274157:" [[103, 534]],  EB minus has region with low efficiency",
 274282:" [[86, 88]],  tracker is off --> particle flow object are affected",
 275326:" [[1, 169]],  sistrip excl",
 275757:" [[104, 122]],  low DCS: HBHE. nothing on PF plot",
 275758:" [[1, 4]],   part of hcal not available --> JetMET reconstruction suffers",
 275764:" [[1, 31]],   pixel off",
 275766:" [[1, 23], [25, 60]],  lower tracker efficiency",
 275768:" [[1, 79]],  lower tracker efficiency",
 275769:" [[1, 22]],  pixel not in Data Acquisition",
 275781:" [[1, 29]],  EB+16, +17, +18 are excluded due to a problem with VME crateS206h",
 275783:" [[1, 1634]],  strip EXCL",
 275838:" [[1, 51]],  strip EXCL",
 275922:" [[4, 6]],  low stat",
 276064:" [[1, 22]],  hot trigger tower seen in EE-07 (high occupancy and Et TP) which cause  ~100% deadtime",
 276071:" [[1, 22]],   strip EXCL",
 276095:" [[1, 5]],   low stat",
 276237:" [[1, 5]],   ECAL, HCAL, PIXEL excluded",
 276453:" [[1, 8], [10, 125]],  EB-17 (FED 626): was excluded (because of cooling failure)",
 276455:" [[1, 401]],  strip EXCL",
 276456:" [[1, 182]],  strip EXCL",
 276457:" [[1, 19]],  sistrip not in DAQ",
 277217:" [[12, 14]],  Short collision run with strip in error in DCS and pixel in error in DAQ. For physically meaningful lumisections 33-47, the total rate is zero. L1T flags marked as bad due to this",
 277933:" [[1, 42]],  ECAL EE+09 FED 648 removed from all LS in the run because it was causing 100% dead time. EE+09 was not masked, so all LS in this run are bad",
 278309:" [[1, 10]],  EE-04 FED 607 TT disabled in LS [1-10] according to express dataset (LS# 10)",
 278821:" [[1, 33], [36, 37]],  FED652 in error, EE+04 is off",
 279028:" [[1, 17]],  strip EXCL",
 279995:" [[1, 8]],  hcal water colling issues",
 280002:" [[1, 111]],  ecal excluded",
 280006:" [[1, 68]],  EB-11 token ring missing",
 280007:" [[1, 36]],  Low voltage channel broken in EB",
 280239:" [[1, 9]],  strip EXCL",
 280241:" [[1, 11]],  strip EXCL",
 281663:" [[61, 172]],  Timing shifted by 5ns due to TCDS* problem ",
 281674:" [[1, 45]],  Timing shifted by 5ns due to TCDS* problem",
 281680:" [[1, 31]],  Timing shifted by 5ns due to TCDS* problem",
 281974:" [[79, 85]],  pixel High voltage OFF",
 282408:" [[80, 191]],  strip EXCL",
 282707:" [[79, 81]],  pixel High voltage OFF",
 282796:" [[80, 82]],  pixel High Voltage off",
 282921:" [[1, 152]],       strip EXCL"

    
}

In [5]:
indx_train = np.arange(data_features.shape[0]-int(data_features.shape[0]/5), dtype='int32')
indx_test = np.arange(data_features.shape[0]-int(data_features.shape[0]/5),data_features.shape[0], dtype='int32')

In [6]:
indx_known = sublabels[sublabels["runId"].isin(dict_causes.keys())].index.tolist()
ids_known = sublabels[['runId', 'lumiId']].iloc[indx_known]

In [7]:
indx_train_minus = [s for s in indx_train if (s not in indx_known)]

In [8]:
num_good = np.sum(labels)
num_bad = len(labels)-np.sum(labels)

weights = 0.5 / np.where(labels == 0.0, num_good, num_bad)
weights *= len(labels)

In [9]:
y_train = np.array(labels.iloc[indx_train_minus], 'float32')
y_known = np.array(labels.iloc[indx_known], 'float32')
y_test = np.array(labels.iloc[indx_test], 'float32')

X_train = np.array(data_features.iloc[indx_train_minus], 'float32')
X_known = np.array(data_features.iloc[indx_known], 'float32')
X_test = np.array(data_features.iloc[indx_test], 'float32')

weights_train = weights[indx_train_minus]
weights_test = weights[indx_test]

In [10]:
feature_names = data_features.columns

In [11]:
Muon_features = [s for s in feature_names if (s[:3] == 'qMu')]# and (s[3:7] != 'Cosm')]
Pho_features = [s for s in feature_names if s[:4] == 'qPho']
Cal_features = [s for s in feature_names if s[:4] == 'qCal']
PF_features = [s for s in feature_names if s[:3] == 'qPF']

channels_features = dict()

channels_features['muons'] = Muon_features
channels_features['photons'] = Pho_features
channels_features['PF'] = PF_features
channels_features['calo'] = Cal_features

In [12]:
num_fch = dict()
for g, fs in channels_features.items():
    num_fch[g]=len(fs)

In [13]:
num_fch


Out[13]:
{'PF': 878, 'calo': 280, 'muons': 439, 'photons': 224}

In [14]:
vmins = []
vmaxs = []

for f in data_features.columns:
    values = data_features[f].values
    # trim the color range, but prevent the color range from collapsing
    vmin = np.nanpercentile(values, 5)
    vmax = np.nanpercentile(values, 95)
    if vmin == vmax:
        vmin = np.nanpercentile(values, 1)
        vmax = np.nanpercentile(values, 99)
        if vmin == vmax:
            vmin = np.min(values)
            vmax = np.max(values)
    vmins.append(vmin)
    vmaxs.append(vmax)

In [51]:
vmins = np.array(vmins)
vmaxs = np.array(vmaxs)

In [15]:
X_ref=X_test[np.random.choice(np.where(y_test==1)[0], size=100, replace=False)]

train any classifier


In [16]:
import xgboost as xgb
xgb_clf  = xgb.XGBClassifier(n_estimators=80, max_depth=10, learning_rate=0.1, silent=False)
xgb_clf.fit(X_train, y_train)


Out[16]:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=10, min_child_weight=1, missing=None, n_estimators=80,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=False, subsample=1)

In [18]:
model = xgb_clf

SHAP to explain predictions


In [22]:
shap_values = shap.TreeExplainer(model).shap_values(X_known)
shap_ref = shap.TreeExplainer(model).shap_values(X_ref)

In [69]:
def summary_plot_custom(shap_values, features=None, feature_names=None, max_display=20,
                 vmins=None, vmaxs=None, return_oder=True,
                 axis_color="#333333", title=None, alpha=1, show=True, sort=True,
                 color_bar=True, auto_size_plot=True):

    num_features = shap_values.shape[1]


    if sort:
        # order features by the sum of their effect magnitudes
        feature_order = np.argsort(np.sum(np.abs(shap_values), axis=0))
        feature_order = feature_order[-min(max_display, len(feature_order)):]
    else:
        feature_order = np.arange(min(max_display, num_features))

        
    row_height = 0.4
    if auto_size_plot:
        pl.gcf().set_size_inches(8, len(feature_order) * row_height + 1.5)
    pl.axvline(x=0, color="#999999", zorder=-1)


    for pos, i in enumerate(feature_order):
        pl.axhline(y=pos, color="#cccccc", lw=0.5, dashes=(1, 5), zorder=-1)
        shaps = shap_values[:, i]
        values = features[:, i]


        N = len(shaps)
        nbins = 100
        quant = np.round(nbins * (shaps - np.min(shaps)) / (np.max(shaps) - np.min(shaps) + 1e-8))
        inds = np.argsort(quant + np.random.randn(N) * 1e-6)
        layer = 0
        last_bin = -1
        ys = np.zeros(N)
        for ind in inds:
            if quant[ind] != last_bin:
                layer = 0
            ys[ind] = np.ceil(layer / 2) * ((layer % 2) * 2 - 1)
            layer += 1
            last_bin = quant[ind]
        ys *= 0.9 * (row_height / np.max(ys + 1))

        vmin=vmins[i]
        vmax=vmaxs[i]

        nan_mask = np.isnan(values)
        
        pl.scatter(shaps[nan_mask], pos + ys[nan_mask], color="#777777", vmin=vmin,
                   vmax=vmax, s=16, alpha=alpha, linewidth=0,
                   zorder=3, rasterized=len(shaps) > 500)
        
        
        pl.scatter(shaps[np.invert(nan_mask)], pos + ys[np.invert(nan_mask)],
                   cmap=colors.red_blue, vmin=vmin, vmax=vmax, s=16,
                   c=values[np.invert(nan_mask)], alpha=alpha, linewidth=0,
                   zorder=3, rasterized=len(shaps) > 500)



    import matplotlib.cm as cm
    m = cm.ScalarMappable(cmap=pl.get_cmap(red_blue))
    m.set_array([0, 1])
    cb = pl.colorbar(m, ticks=[0, 1], aspect=1000)
    cb.set_ticklabels(['feature value low', 'feature value high'])
    cb.set_label('feature value', size=12, labelpad=0)
    cb.ax.tick_params(labelsize=11, length=0)
    cb.set_alpha(1)
    cb.outline.set_visible(False)
    bbox = cb.ax.get_window_extent().transformed(pl.gcf().dpi_scale_trans.inverted())
    cb.ax.set_aspect((bbox.height - 0.9) * 20)

    pl.gca().xaxis.set_ticks_position('bottom')
    pl.gca().yaxis.set_ticks_position('none')
    pl.gca().spines['right'].set_visible(False)
    pl.gca().spines['top'].set_visible(False)
    pl.gca().spines['left'].set_visible(False)
    pl.gca().tick_params(color=axis_color, labelcolor=axis_color)
    pl.yticks(range(len(feature_order)), [feature_names[i] for i in feature_order], fontsize=13)
    pl.gca().tick_params('x', labelsize=11)
    pl.ylim(-1, len(feature_order))
    pl.xlabel('shap value', fontsize=13)
    if show:
        pl.show()
    
    if return_oder:
        return feature_order

In [70]:
for run in dict_causes.keys():
    if run in sublabels['runId'].values:
        print('')
        print (run, dict_causes[run])
        print('')
            
        inds = np.where(np.array(ids_known)==run)[0]

        print ("sum of features' influences from all lumis, 20 the most important")
        print('')
        sum_shap = np.sum(shap_values[inds,:-1], axis=0)
        print (feature_names[np.argsort(sum_shap, axis=0)][:20])
        print('')
        
        print("the most common features with big shap values")
        print('')
        top_sh_values = np.argsort(shap_values[inds,:-1], axis=1)[:,:50].T.reshape(-1)
        unique, first_index, counts = np.unique(
            top_sh_values,
        return_counts=True, return_index=True)
        print(feature_names[top_sh_values[np.sort(first_index[counts == len(inds)])]])
        print('')
        
        print ('summary plot')
        order=summary_plot_custom(shap_values[inds,:], features=X_known[inds, :], feature_names=feature_names,
                               vmins=vmins, vmaxs=vmaxs, return_oder=True, sort=True)
        
        print ('reference plot for good lumis')
        summary_plot_custom(shap_ref[:,order], features=X_ref[:,order], feature_names=feature_names[order],
                               vmins=vmins[order], vmaxs=vmaxs[order], return_oder=False, sort=False)
        
        channels_shap = pd.DataFrame()
        for i in shap_values[inds,:-1]:
            pred_for_channnel = {}
            for channel in channels_features:
                current_sum = 0
                for feature in channels_features[channel]:
                    current_sum += i[list(feature_names).index(feature)]
                pred_for_channnel[channel] = current_sum/num_fch[channel]
            channels_shap = channels_shap.append(pred_for_channnel, ignore_index=True)

        print ('shap for channel for lumi')
        print (channels_shap)
        print(" ")
        
        print ('shap for channel for run')
        print(channels_shap.sum())
        print(" ")
        print("_"*100)
        
    else:
        print (str(run) +' is not found')


276237 is not found
273426 is not found

(278821, ' [[1, 33], [36, 37]],  FED652 in error, EE+04 is off')

sum of features' influences from all lumis, 20 the most important

Index([u'qCCPhi5x5_0', u'qCCPhi5x5_5', u'qPFMetPhi_1', u'qCCEn5x5_5',
       u'qCCEta5x5_0', u'qCalJet1Pt_4', u'qEEenergy_3', u'qEEix_4',
       u'qESenergy_4', u'qEBchi2_1', u'qEEiy_0', u'qCalMETBEFOPhi_3',
       u'qCalMETPhi_3', u'qCalMETBEFOPhi_1', u'qPreShEn_5', u'qHBHEtime_5',
       u'qCalJet2Pt_5', u'qEBtime_3', u'qPreShEta_5', u'qPFJet8CHS1Eta_1'],
      dtype='object')

the most common features with big shap values

Index([u'qCCPhi5x5_0', u'qCCPhi5x5_5', u'qCCEn5x5_5', u'qCCEta5x5_0',
       u'qEEix_4', u'qESenergy_4', u'qEBchi2_1', u'qEEiy_0', u'qHBHEtime_5',
       u'qPreShEn_5', u'qCalJet2Pt_5', u'qPFJetEta_1', u'qPreShPhi_0',
       u'qCalJetEn_0', u'qPFJet8CHSEta_3', u'qPFJet8CHSPt_3'],
      dtype='object')

summary plot
reference plot for good lumis
shap for channel for lumi
          PF      calo     muons   photons
0  -0.000598 -0.001788  0.000524  0.000061
1  -0.000078 -0.001764 -0.000044 -0.000226
2  -0.000517 -0.002080  0.000039  0.000312
3  -0.000260 -0.002363  0.000691  0.000301
4  -0.000585 -0.001931  0.000054  0.000038
5  -0.000113 -0.006109  0.000233  0.000277
6  -0.000584  0.002204 -0.000035 -0.000130
7   0.000008 -0.002015 -0.000227  0.000229
8  -0.000351 -0.000415  0.000072  0.000366
9  -0.000200 -0.002013  0.000267  0.000294
10 -0.000684 -0.001810  0.000517  0.000195
11 -0.000290 -0.002200  0.000354  0.000184
12 -0.000073 -0.002373  0.000368  0.000039
13  0.000456 -0.002198  0.000727  0.000094
14 -0.000196 -0.000619  0.000036  0.000231
15  0.000031 -0.002056  0.000459  0.000125
16 -0.000449 -0.001790  0.000040  0.000088
17  0.000418 -0.001797  0.000665  0.000099
18  0.000406 -0.001966  0.000693  0.000129
19 -0.000077 -0.002025  0.000089  0.000020
20 -0.000580 -0.001687  0.000720 -0.000134
21 -0.000162 -0.001762  0.000953  0.000067
22 -0.000003 -0.002410  0.000365  0.000149
23 -0.000134 -0.002155  0.000144  0.000197
24 -0.000023 -0.001708  0.000624  0.000086
25 -0.000041 -0.002187  0.000783  0.000246
26 -0.000183 -0.002408  0.000126 -0.000171
27  0.000598 -0.002100  0.000305  0.000192
28 -0.000024 -0.002444  0.000878  0.000225
29  0.000589 -0.002332 -0.000074  0.000162
30  0.000340 -0.001825  0.000170  0.000152
31 -0.000525 -0.001655  0.000843  0.000137
32 -0.000620 -0.002100  0.000309  0.000242
 
shap for channel for run
PF        -0.004506
calo      -0.063880
muons      0.011669
photons    0.004276
dtype: float64
 
____________________________________________________________________________________________________
282408 is not found
282921 is not found

(275757, ' [[104, 122]],  low DCS: HBHE. nothing on PF plot')

sum of features' influences from all lumis, 20 the most important

Index([u'qPFJet4CHSEta_3', u'qEEchi2_3', u'qEEchi2_1', u'qPFJetEIEta_3',
       u'qEEchi2_0', u'qEBchi2_0', u'qHFiphi_1', u'qESenergy_3', u'qEEtime_3',
       u'qCCEta5x5_1', u'qPFMetPt_5', u'qPFMetPhi_4', u'qESenergy_4',
       u'qPreShEn_0', u'qESenergy_0', u'qCCEta5x5_5', u'qEBtime_3',
       u'qHBHEtime_3', u'qCalJet1Pt_0', u'qPFJetEta_3'],
      dtype='object')

the most common features with big shap values

Index([u'qPFJet4CHSEta_3', u'qEEchi2_3', u'qEEchi2_1', u'qPFJetEIEta_3',
       u'qESenergy_3', u'qEEchi2_0', u'qEBchi2_0', u'qHFiphi_1', u'qEEtime_3',
       u'qPFMetPt_5', u'qCCEta5x5_1', u'qPreShEn_0', u'qHBHEtime_3',
       u'qESenergy_4', u'qCCEta5x5_5', u'qESenergy_0', u'qEBtime_3',
       u'qSCEta5x5_4', u'qPreShYEta_5'],
      dtype='object')

summary plot
reference plot for good lumis
shap for channel for lumi
          PF      calo     muons   photons
0  -0.005485 -0.000995  0.000073 -0.000838
1  -0.004827  0.000368 -0.000181 -0.000726
2  -0.005052  0.000410 -0.000195 -0.000683
3  -0.004729 -0.000168  0.000102 -0.000124
4  -0.004679  0.000179 -0.000052 -0.000536
5  -0.005289  0.000420  0.000023 -0.000961
6  -0.004851  0.000445  0.000257 -0.000890
7  -0.004297  0.000198  0.000015 -0.000843
8  -0.004684  0.000245  0.000141 -0.000248
9  -0.004614  0.000361  0.000159 -0.000199
10 -0.005009 -0.000051  0.000283  0.000019
11 -0.004981  0.000356  0.000661 -0.000304
12 -0.005329  0.001099 -0.000074 -0.000666
13 -0.005405  0.000923  0.000374 -0.000823
14 -0.004767  0.000752  0.000353 -0.000311
15 -0.005052  0.001016  0.000379 -0.000259
16 -0.004907  0.000305  0.000403 -0.000431
 
shap for channel for run
PF        -0.083958
calo       0.005863
muons      0.002721
photons   -0.008822
dtype: float64
 
____________________________________________________________________________________________________

(275758, ' [[1, 4]],   part of hcal not available --> JetMET reconstruction suffers')

sum of features' influences from all lumis, 20 the most important

Index([u'qPFJet4CHSEta_3', u'qEEchi2_3', u'qEEchi2_1', u'qEEchi2_0',
       u'qEBchi2_0', u'qPFJetEIEta_3', u'qHFiphi_1', u'qESenergy_3',
       u'qEEtime_3', u'qCCEta5x5_1', u'qPFMetPhi_1', u'qESenergy_4',
       u'qESenergy_5', u'qESenergy_0', u'qPFJetEta_3', u'qCCEta5x5_5',
       u'qPFMetPhi_5', u'qEBtime_3', u'qMuCosmEta_1', u'qCalJet1Pt_0'],
      dtype='object')

the most common features with big shap values

Index([u'qPFJet4CHSEta_3', u'qEEchi2_3', u'qEEchi2_1', u'qEEchi2_0',
       u'qPFJetEIEta_3', u'qEBchi2_0', u'qESenergy_3', u'qEEtime_3',
       u'qESenergy_0', u'qCCEta5x5_1', u'qESenergy_4', u'qESenergy_5',
       u'qSCEta5x5_4', u'qPFMetPhi_5', u'qHBHEtime_3', u'qEEenergy_4',
       u'qEBtime_3', u'qCCEta5x5_5', u'qgedPhosigmaeta__4'],
      dtype='object')

summary plot
reference plot for good lumis