In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from collections import defaultdict
from IPython import display
import time
import os
import re
LABELS_PATH = '//data/cms2010/labels_v2.pickled'
with open(LABELS_PATH, 'rb') as f:
u = pickle._Unpickler(f)
u.encoding = 'latin1'
sub_labels = u.load()
labels = sub_labels['json_0.txt']
data = pd.read_csv('//data/cms2010//merged_23')
In [2]:
plt.rcParams['axes.facecolor'] = 'white'
plt.rc('grid', linestyle="-", color='gray')
plt.rc('axes', edgecolor='gray')
In [3]:
### technical columns
not_features = [
'_luminosityBlock',
'_run'
]
### columns that correspond to actual features
features = sorted(set(data.columns) - set(not_features))
for f in features:
xs = data[f].values
if np.std(xs) > 0.0:
data[f] = (xs - np.mean(xs)) / np.std(xs)
lumi = np.maximum(
np.maximum(data['_instantLumi_minibias'].get_values(), data['_instantLumi_muons'].get_values()),
data['_instantLumi_photons'].get_values()
)
nonempty = np.where(lumi > 0.0)[0]
data = data.iloc[nonempty]
lumi = lumi[nonempty]
labels = labels[nonempty]
for k in sub_labels:
sub_labels[k] = sub_labels[k][nonempty]
In [4]:
lumi_bad = np.sum(lumi[labels == 0.0])
lumi_good = np.sum(lumi[labels == 1.0])
### By normalizing weights we implicitly define equal probabilities for each class
weights = lumi / np.where(labels == 1.0, lumi_good, lumi_bad)
weights *= lumi.shape[0]
w_bad = np.sum(weights[labels == 0.0])
w_good = np.sum(weights[labels == 1.0])
In [5]:
### utility functions
def insert(keys, d, f):
key = keys[0]
if len(keys) == 1:
d[key] = f
else:
if key not in d:
d[key] = dict()
return insert(keys[1:], d[key], f)
def levels(features, n_levels = 5):
dicts = [features]
levels = list()
for level in range(n_levels):
levels.append(
set([ k for d in dicts for k in d ])
)
dicts = [ d[k] for d in dicts for k in d ]
return levels
def get_feature_groups(feature_list, re_exp):
"""
Retuns:
1. hierachical dictionary feature groups -> feature full name
2. feature levels
3. unprocessed features
"""
features = dict()
rest = list()
n_levels = set()
for f in feature_list:
matches = re.findall(re_exp, f)
if len(matches) == 1:
insert(matches[0], features, f)
n_levels.add(len(matches[0]))
elif len(matches) == 0:
rest.append(f)
else:
raise Exception('Very suspicious feature: %s -> %s' % (f, matches))
assert len(n_levels) == 1
return features, levels(features, n_levels=list(n_levels)[0]), rest
def insert_fake_path(d, level, path = 'general'):
if level == 0:
return { path : d }
else:
r = dict()
for k in d:
r[k] = insert_fake_path(d[k], level - 1, path)
return r
def flatten(a_dict):
for k in a_dict:
if hasattr(a_dict[k], 'keys'):
for path, value in flatten(a_dict[k]):
yield (k, ) + path, value
else:
yield (k, ), a_dict[k]
def merge(dicts):
result = dict()
for d in dicts:
for path, value in flatten(d):
insert(path, result, value)
return result
def flatten_dict(d):
r = dict()
for paths, v in flatten(d):
k = '_'.join(paths)
r[k] = v
return r
def squezze(d, depth = 5, last=2):
dc = d.copy()
if depth - 1 == last:
for k in d:
dc[k] = flatten_dict(d[k])
return d
else:
for k in d:
dc[k] = squezze(d[k], depth-1, last)
return dc
def group(d, level=2):
gd = defaultdict(lambda: list())
for path, k in flatten(d):
gk = path[:level]
gd[gk].append(k)
return gd
particle_f_re = re.compile(r'([a-zA-Z]+)[_]([a-zA-Z]+)[_]([a-zA-Z]+)[_]+(q[12345])[_](\w+)')
particle_features, particle_levels, rest = get_feature_groups(features, particle_f_re)
for level in particle_levels:
print (' '.join(list(level)))
particle_type_f_re = re.compile(r'([a-zA-Z]+)[_]([a-zA-Z]+)[_]([a-zA-Z]+)[_]+([a-zA-Z0-9]+)')
particle_type_features, particle_type_levels, rest = get_feature_groups(rest, particle_type_f_re)
for level in particle_type_levels:
print (' '.join(list(level)))
particle_type_features = insert_fake_path(particle_type_features, level = 2, path='allParticles')
for level in levels(particle_type_features, n_levels=5):
print (' '.join(list(level)))
event_f_re = re.compile(r'([a-zA-Z]+)[_]([a-zA-Z]+)[_]+(\w+)')
event_features, event_levels, rest = get_feature_groups(rest, event_f_re)
for level in event_levels:
print (' '.join(list(level)))
f = insert_fake_path(event_features, level = 1, path='allChannels')
f = insert_fake_path(f, level = 2, path='allParticles')
event_features = f
for level in levels(event_features, n_levels=5):
print (' '.join(list(level)))
stream_f_re = re.compile(r'([a-zA-Z]+)[_]([a-zA-Z]+)')
stream_features, stream_levels, rest = get_feature_groups(rest, stream_f_re)
for level in stream_levels:
print (' '.join(list(level)))
feature_hierarchy = merge([
particle_features, particle_type_features, event_features
])
grouped = group(feature_hierarchy, level=2)
channels_features = dict()
for k in [('muons', 'muons'), ('photons', 'photons'), ('minibias', 'PF'), ('minibias', 'calo')]:
channels_features[k[1]] = grouped[k]
features_together = []
for channel in channels_features:
features_together.extend(channels_features[channel])
In [6]:
indx_train, indx_test = train_test_split(np.arange(data.shape[0], dtype='int32'), stratify=labels, test_size=0.1, random_state = 1)
new_data = np.array(data[features_together])
X_train = new_data[indx_train]
X_test = new_data[indx_test]
y_train = labels[indx_train]
y_test = labels[indx_test]
In [7]:
sub_systems_short = ['json_0.txt',
'json_11.txt',
'json_12.txt',
'json_13.txt',
'json_15.txt',
'json_16.txt',
'json_17.txt',
'json_18.txt',
'json_19.txt',
'json_21.txt',
'json_22.txt',
'json_23.txt',
'json_24.txt']
subsytem_descriptions_short = dict(
[
('json_0.txt', 'inclusive global label')
] + [
('json_%d.txt' % i, desc)
for i, desc in zip(range(15, 23), [
'Rpc muon chambers',
'Csc muon chambers',
'Dt muon chambers',
'HCAL ',
'ECAL ',
'Preshower',
'Strip tracker',
'Pix tracker'
])
] + [
('json_%d.txt' % i, desc)
for i, desc in zip(list(range(11, 15)) + list(range(23, 25)), [
'Muons ',
'Jets ',
'Electrons/Photons',
'Tracking',
'High level trigger',
'L1 trigger'
])
]
)
In [8]:
rf_test = np.load('//data/cms2010//rf_test.npy')
ch_probas_test = pd.read_csv('//data/cms2010//lime_channels_prob_test')
In [9]:
plt.figure(figsize=(8, 8))
for k in ch_probas_test:
ch_proba = ch_probas_test[k]
plt.plot([0, 1], [0, 1], '--', color='black')
fpr, tpr, _ = roc_curve(y_test, ch_proba)
auc_score = auc(fpr, tpr, reorder=True)
plt.plot(fpr, tpr, label='Deterministic output, AUC for %s : %.3lf' % (k, auc_score))
print ('AUC for %s : %.3lf' % (k, auc_score))
fpr, tpr, _ = roc_curve(y_test, rf_test, sample_weight=weights[indx_test])
auc_score = auc(fpr, tpr, reorder=True)
plt.plot(fpr, tpr, label='RF probas : %.3lf' % auc_score)
plt.legend(loc='lower right')
plt.title('ROC curve for the network', fontsize=24)
plt.xlabel('FPR', fontsize=20)
plt.ylabel('TPR', fontsize=20)
plt.show()
In [10]:
for k in ch_probas_test:
proba = ch_probas_test[k]
colors = ['#ed0021ff','#01017f']
plt.figure(figsize=(7, 7))
plt.hist([
proba[labels[indx_test] == 0.0],
proba[labels[indx_test] == 1.0]
],bins=20, weights=[
weights[indx_test][labels[indx_test] == 0.0] / np.sum(weights[indx_test][labels[indx_test] == 0.0]),
weights[indx_test][labels[indx_test] == 1.0] / np.sum(weights[indx_test][labels[indx_test] == 1.0])
], histtype='step', label=['Anomalous lumisections', 'Good lumisections'], lw=3,
alpha=1, fill= False, color=colors)
plt.legend(loc='upper center', fontsize=20)
plt.title('%s channel' % k, fontsize=30, color = 'black' )
plt.ylabel('luminosity fraction', fontsize=20, color = 'black')
plt.xlabel(r'subnetwork output', fontsize=20, color = 'black')
plt.xticks(fontsize=20, color = 'black')
plt.yticks(fontsize=20, color = 'black')
plt.grid(False)
plt.axis('on')
plt.show()
proba = ch_probas_test.min(axis=1)
plt.figure(figsize=(7, 7))
plt.hist([
proba[labels[indx_test] == 0.0],
proba[labels[indx_test] == 1.0]
],bins=20, weights=[
weights[indx_test][labels[indx_test] == 0.0] / np.sum(weights[indx_test][labels[indx_test] == 0.0]),
weights[indx_test][labels[indx_test] == 1.0] / np.sum(weights[indx_test][labels[indx_test] == 1.0])
], histtype='step', label=['Anomalous lumisections', 'Good lumisections'], lw=3,
alpha=1, fill= False, color=colors)
plt.legend(loc='upper center', fontsize=20)
plt.title('minproba_test' , fontsize=30, color = 'black' )
plt.ylabel('luminosity fraction', fontsize=20, color = 'black')
plt.xlabel(r'min channel proba', fontsize=20, color = 'black')
plt.xticks(fontsize=20, color = 'black')
plt.yticks(fontsize=20, color = 'black')
plt.grid(False)
plt.axis('on')
plt.show()
proba = rf_test
plt.figure(figsize=(7, 7))
plt.hist([
proba[labels[indx_test] == 0.0],
proba[labels[indx_test] == 1.0]
],bins=20, weights=[
weights[indx_test][labels[indx_test] == 0.0] / np.sum(weights[indx_test][labels[indx_test] == 0.0]),
weights[indx_test][labels[indx_test] == 1.0] / np.sum(weights[indx_test][labels[indx_test] == 1.0])
], histtype='step', label=['Anomalous lumisections', 'Good lumisections'], lw=3,
alpha=1, fill= False, color=colors)
plt.legend(loc='upper center', fontsize=20)
plt.title('RF' , fontsize=30, color = 'black' )
plt.ylabel('luminosity fraction', fontsize=20, color = 'black')
#plt.xlabel(r'min subnetwork output', fontsize=20, color = 'black')
plt.xticks(fontsize=20, color = 'black')
plt.yticks(fontsize=20, color = 'black')
plt.grid(False)
plt.axis('on')
plt.show()
In [11]:
for k in ch_probas_test:
proba = ch_probas_test[k]
proba = np.where(proba<0, 0, 1)
colors = ['#ed0021ff','#01017f']
plt.figure(figsize=(7, 7))
plt.hist([
proba[labels[indx_test] == 0.0],
proba[labels[indx_test] == 1.0]
],bins=20, weights=[
weights[indx_test][labels[indx_test] == 0.0] / np.sum(weights[indx_test][labels[indx_test] == 0.0]),
weights[indx_test][labels[indx_test] == 1.0] / np.sum(weights[indx_test][labels[indx_test] == 1.0])
], histtype='step', label=['Anomalous lumisections', 'Good lumisections'], lw=3,
alpha=1, fill= False, color=colors)
plt.legend(loc='upper center', fontsize=20)
plt.title('%s channel' % k, fontsize=30, color = 'black' )
plt.ylabel('luminosity fraction', fontsize=20, color = 'black')
plt.xlabel(r'subnetwork output', fontsize=20, color = 'black')
plt.xticks(fontsize=20, color = 'black')
plt.yticks(fontsize=20, color = 'black')
plt.grid(False)
plt.axis('on')
plt.show()
In [12]:
metric = roc_auc_score
met_name = 'roc_auc_score'
channels = ['muons', 'photons', 'PF', 'calo']
sub_systems = sorted(sub_labels.keys())
aucs = np.ones(shape=(len(channels), len(sub_systems_short))) / 2.0
for i, channel in enumerate(channels):
for j, sub_system in enumerate(sub_systems_short):
aucs[i, j] = metric(sub_labels[sub_system][indx_test], ch_probas_test[channel], sample_weight=weights[indx_test])
fig = plt.figure(figsize=(36, 14))
im = plt.imshow(aucs, interpolation='None', aspect=1, cmap = 'jet')
plt.colorbar(im, shrink=0.75).ax.tick_params(labelsize=40)
plt.xticks(np.arange(len(sub_systems_short)), [subsytem_descriptions_short[k]+" ("+str(len(np.where(sub_labels[k][indx_test]==0)[0]))+") " for k in sub_systems_short], rotation=90, fontsize=40, color = 'black')
plt.yticks(np.arange(4), [ "%s" % g for g in channels ], fontsize=40, color = 'black')
plt.title('ROC AUC score'+' of subnetwork scores against ground truth labels by subsystem', fontsize=50)
plt.grid(False)
plt.tight_layout()
plt.show()
In [13]:
probas_test = np.load('//data/cms2010/probas_test_2010.npy').item()
In [14]:
plt.figure(figsize=(8, 8))
sum_pred = np.zeros((len(indx_test)))
log_and = np.ones((len(indx_test)))
for k in channels_features:
common_proba = probas_test[k]#(indx_test)
sum_pred += common_proba
log_and*= common_proba
plt.plot([0, 1], [0, 1], '--', color='black')
fpr, tpr, _ = roc_curve(labels[indx_test], common_proba, sample_weight=weights[indx_test])
auc_score = auc(fpr, tpr, reorder=True)
plt.plot(fpr, tpr, label='Deterministic output, AUC for %s : %.3lf' % (k, auc_score))
print ('Deterministic output, AUC for %s : %.3lf' % (k, auc_score))
f_and = np.exp(sum_pred - 4.)
fpr, tpr, _ = roc_curve(labels[indx_test], f_and)
auc_score = auc(fpr, tpr, reorder=True)
plt.plot(fpr, tpr, label='Deterministic output, AUC fuzzy_and : %.3lf' % auc_score)
plt.legend(loc='lower right')
plt.title('ROC curve for the network', fontsize=24)
plt.xlabel('FPR', fontsize=20)
plt.ylabel('TPR', fontsize=20)
plt.show()
In [15]:
minproba_test = np.min(np.vstack((probas_test['PF'], probas_test['calo'],
probas_test['muons'], probas_test["photons"])), axis=0)
In [16]:
indx = indx_test
probas = probas_test
minproba = minproba_test
sum_pred = np.zeros((len(indx)))
for k in probas_test:
proba = probas[k]
sum_pred += proba
colors = ['#ed0021ff','#01017f']
plt.figure(figsize=(7, 7))
plt.hist([
proba[labels[indx] == 0.0],
proba[labels[indx] == 1.0]
],bins=20, range=(0, 1), weights=[
weights[indx][labels[indx] == 0.0] / np.sum(weights[indx][labels[indx] == 0.0]),
weights[indx][labels[indx] == 1.0] / np.sum(weights[indx][labels[indx] == 1.0])
], histtype='step', label=['Anomalous lumisections', 'Good lumisections'], lw=3,
alpha=1, fill= False, color=colors)
plt.legend(loc='upper center', fontsize=20)
plt.title('%s channel' % k, fontsize=30, color = 'black' )
plt.ylabel('luminosity fraction', fontsize=20, color = 'black')
plt.xlabel(r'subnetwork output', fontsize=20, color = 'black')
plt.xticks(fontsize=20, color = 'black')
plt.yticks(fontsize=20, color = 'black')
plt.grid(False)
plt.axis('on')
plt.figure(figsize=(7, 7))
f_and = np.exp(sum_pred - 4.)
plt.hist([
f_and[labels[indx] == 0.0],
f_and[labels[indx] == 1.0]
],bins=20, range=(0, 1), weights=[
weights[indx][labels[indx] == 0.0] / np.sum(weights[indx][labels[indx] == 0.0]),
weights[indx][labels[indx] == 1.0] / np.sum(weights[indx][labels[indx] == 1.0])
], histtype='step', label=['Anomalous lumisections', 'Good lumisections'], lw=3,
alpha=1, fill= False, color=colors)
plt.legend(loc='upper center', fontsize=20)
plt.title("Fuzzy and", fontsize=30, color = 'black' )
plt.ylabel('luminosity fraction', fontsize=20, color = 'black')
plt.xlabel(r'network output', fontsize=20, color = 'black')
plt.xticks(fontsize=20, color = 'black')
plt.yticks(fontsize=20, color = 'black')
plt.grid(False)
plt.axis('on')
plt.show()
proba = minproba
plt.figure(figsize=(7, 7))
plt.hist([
proba[labels[indx] == 0.0],
proba[labels[indx] == 1.0]
],bins=20, range=(0, 1), weights=[
weights[indx][labels[indx] == 0.0] / np.sum(weights[indx][labels[indx] == 0.0]),
weights[indx][labels[indx] == 1.0] / np.sum(weights[indx][labels[indx] == 1.0])
], histtype='step', label=['Anomalous lumisections', 'Good lumisections'], lw=3,
alpha=1, fill= False, color=colors)
plt.legend(loc='upper center', fontsize=20)
plt.title('minproba_test' , fontsize=30, color = 'black' )
plt.ylabel('luminosity fraction', fontsize=20, color = 'black')
plt.xlabel(r'min subnetwork output', fontsize=20, color = 'black')
plt.xticks(fontsize=20, color = 'black')
plt.yticks(fontsize=20, color = 'black')
plt.grid(False)
plt.axis('on')
plt.show()
In [17]:
metric = roc_auc_score
met_name = 'roc_auc_score'
probas = probas_test
indx = indx_test
channels = ['muons', 'photons', 'PF', 'calo']
sub_systems = sorted(sub_labels.keys())
aucs = np.ones(shape=(len(channels), len(sub_systems_short))) / 2.0
for i, channel in enumerate(channels):
for j, sub_system in enumerate(sub_systems_short):
try:
aucs[i, j] = metric(sub_labels[sub_system][indx_test], probas[channel], sample_weight=weights[indx_test])
except Exception as e:
print (e)
fig = plt.figure(figsize=(36, 14))
im = plt.imshow(aucs, interpolation='None', aspect=1, cmap = 'jet')
plt.colorbar(im, shrink=0.75).ax.tick_params(labelsize=40)
plt.xticks(np.arange(len(sub_systems_short)), [subsytem_descriptions_short[k]+" ("+str(len(np.where(sub_labels[k][indx_test]==0)[0]))+") " for k in sub_systems_short], rotation=90, fontsize=40, color = 'black')
plt.yticks(np.arange(4), [ "%s" % g for g in channels ], fontsize=40, color = 'black')
plt.yticks(np.arange(4), [ "%s" % g for g in channels ], fontsize=40, color = 'black')
plt.title('ROC AUC score'+' of subnetwork scores against ground truth labels by subsystem', fontsize=50)
plt.grid(False)
plt.tight_layout()
plt.show()
In [18]:
random_ch = np.load('//data/cms2010/random_ch.npy')
In [19]:
for k in range(4):
proba = random_ch[:,k]
colors = ['#ed0021ff','#01017f']
plt.figure(figsize=(7, 7))
plt.hist([
proba[labels[indx] == 0.0],
proba[labels[indx] == 1.0]
],bins=20, range=(0, 1), weights=[
weights[indx][labels[indx] == 0.0] / np.sum(weights[indx][labels[indx] == 0.0]),
weights[indx][labels[indx] == 1.0] / np.sum(weights[indx][labels[indx] == 1.0])
], histtype='step', label=['Anomalous lumisections', 'Good lumisections'], lw=3,
alpha=1, fill= False, color=colors)
plt.legend(loc='upper center', fontsize=20)
plt.title('%s channel' % k, fontsize=30, color = 'black' )
plt.ylabel('luminosity fraction', fontsize=20, color = 'black')
plt.xlabel(r'subnetwork output', fontsize=20, color = 'black')
plt.xticks(fontsize=20, color = 'black')
plt.yticks(fontsize=20, color = 'black')
plt.grid(False)
plt.axis('on')
In [20]:
metric = roc_auc_score
met_name = 'roc_auc_score'
probas = probas_test
indx = indx_test
channels = ['muons', 'photons', 'PF', 'calo']
sub_systems = sorted(sub_labels.keys())
aucs = np.ones(shape=(len(channels), len(sub_systems_short))) / 2.0
for i, channel in enumerate(channels):
for j, sub_system in enumerate(sub_systems_short):
try:
aucs[i, j] = metric(sub_labels[sub_system][indx_test], random_ch[:,i], sample_weight=weights[indx_test])
except Exception as e:
print (e)
fig = plt.figure(figsize=(36, 14))
im = plt.imshow(aucs, interpolation='None', aspect=1, cmap = 'jet')
plt.colorbar(im, shrink=0.75).ax.tick_params(labelsize=40)
plt.xticks(np.arange(len(sub_systems_short)), [subsytem_descriptions_short[k]+" ("+str(len(np.where(sub_labels[k][indx_test]==0)[0]))+") " for k in sub_systems_short], rotation=90, fontsize=40, color = 'black')
plt.yticks(np.arange(4), [ "%s" % g for g in channels ], fontsize=40, color = 'black')
plt.yticks(np.arange(4), [ "%s" % g for g in channels ], fontsize=40, color = 'black')
plt.title('ROC AUC score'+' of subnetwork scores against ground truth labels by subsystem', fontsize=50)
plt.grid(False)
plt.tight_layout()
plt.show()
In [ ]: