In [1]:
import pandas
import numpy
import os
In [2]:
CHUNKSIZE = 5000
def concat_files(folder, names, outfile):
    # concatenate several tab-separated files into one, streaming in chunks
    # so that arbitrarily large inputs fit in memory
    head = True
    mode = 'w'
    for name in names:
        chunk_iterator = pandas.read_csv(os.path.join(folder, name), sep='\t',
                                         iterator=True, chunksize=CHUNKSIZE)
        for chunk in chunk_iterator:
            chunk.to_csv(outfile, header=head, index=False, sep='\t', mode=mode)
            # write the header only for the very first chunk, then append
            head = False
            mode = 'a'
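For reference, a hypothetical call (the folder and file names below are placeholders; the files are tab-separated despite the .csv extension) might look like:

concat_files('tuples', ['signal_part1.csv', 'signal_part2.csv'], 'signal_all.csv')

Because the inputs are streamed in chunks of CHUNKSIZE rows, the concatenation works even when the files do not fit in RAM.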
In [3]:
from rep.utils import train_test_split_group
def prepare_data(signal_data, bck_data, group_column, random_state=13):
    # split signal and background 50/50 by groups, so that all SVRs
    # belonging to one event end up in the same subsample
    ds_train_signal, ds_test_signal = train_test_split_group(
        signal_data[group_column], signal_data,
        train_size=0.5, test_size=0.5, random_state=random_state)
    ds_train_bck, ds_test_bck = train_test_split_group(
        bck_data[group_column], bck_data,
        train_size=0.5, test_size=0.5, random_state=random_state)
    return ds_train_signal, ds_train_bck, ds_test_signal, ds_test_bck
In [ ]:
def statistic_length(data):
    # number of distinct events and total number of SVRs
    return {'Events': len(numpy.unique(data['unique'])), 'SVR': len(data)}
In [4]:
# def statistic_length(data):
#     events_ids = data['mode'] * 100000000 + data['event_number']
#     return {'Events': len(numpy.unique(events_ids)), 'SVR': len(data)}
In [ ]:
from collections import defaultdict, OrderedDict

def result_statistic(models, modes, data, thresholds, rates, total_events):
    # returns dict((name, rate): dict(mode: eff)) plus a flat table of the same numbers
    modes_eff = defaultdict(OrderedDict)
    statistic = defaultdict(list)
    for name, cl in models.items():
        for mode in modes:
            sig_mode = data[data['mode'] == mode]
            if len(sig_mode) == 0:
                continue
            statistic['mode'].append(mode)
            statistic['classifier'].append(name)
            latex_name = '$' + Samples[str(mode)]['root'].replace("#", "\\") + '$'
            statistic['name'].append(latex_name)
            sig_prediction = cl.predict_proba(sig_mode)[:, 1]
            sig_event_prediction = voting_for_event_svr(sig_mode['event_number'], sig_prediction)
            for rate in rates:
                # important: strictly greater, not >=
                thr = thresholds[name][rate]
                exist_sig = numpy.sum(sig_event_prediction > thr)
                eff = exist_sig * 1. / total_events[mode]
                statistic[rate].append(eff * 100.)
                modes_eff[(name, rate)][latex_name] = eff
    return modes_eff, statistic
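Since `statistic` is a dict of equal-length lists, the result can be inspected as a table directly; a minimal sketch (the arguments are placeholders named after the function parameters):

modes_eff, statistic = result_statistic(models, modes, test_signal, thresholds, rates, total_events)
pandas.DataFrame(statistic)  # one row per (classifier, mode) pair, one column per rate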
In [5]:
empty_events = dict()
with open('generate_hlt1.log', 'r') as f:
    for line in f:
        if 'Mode' in line:
            mode = int(line.strip().split(':')[-1])
        elif 'No sv' in line:
            # number of generated events in this mode without any reconstructed SV
            empty_events[mode] = int(line.strip().split(':')[-1])
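The parser above assumes each mode appears in the log as a 'Mode' line followed by a matching 'No sv' line, e.g. (illustrative values only):

Mode: 11102003
No sv: 137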
In [6]:
def print_eff_dict(eff):
    for key, val in eff.items():
        print key
        for k, v in val.items():
            print k, ':', v
In [8]:
from pprint import pprint
from collections import defaultdict, OrderedDict
def calculate_thresholds(test_bck, y_pred_bck, total_bck_events, rates, id_column='event_number'):
    # number of background events that may pass at each rate (rate / 1e6 of the total)
    required_size = dict([(rate, int(rate * total_bck_events / 1e6)) for rate in rates])
    bck_event_prediction = voting_for_event_svr(test_bck[id_column], y_pred_bck[:, 1])
    thresholds = dict()
    result = dict()
    for rate, req_size in required_size.items():
        threshold = numpy.percentile(bck_event_prediction,
                                     100. * (1 - req_size * 1. / len(bck_event_prediction)))
        thresholds[rate] = threshold
        exist_bck = numpy.sum(bck_event_prediction > threshold)
        result[rate] = (threshold, exist_bck, 1. * exist_bck / total_bck_events)
    return thresholds, result

def voting_for_event_svr(ids, prediction):
    # event-level response: the maximal SVR prediction within each event
    df = pandas.DataFrame({'prediction': numpy.array(prediction), 'id': numpy.array(ids)})
    return numpy.array(df.groupby('id')['prediction'].max())
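A quick illustration of the per-event voting (ids and predictions are made up): the maximum over each event's SVRs is kept, with events ordered by id.

ids = numpy.array([1, 1, 2, 2, 2])
preds = numpy.array([0.1, 0.9, 0.3, 0.2, 0.4])
voting_for_event_svr(ids, preds)  # -> array([ 0.9,  0.4])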
In [9]:
from sklearn.metrics import roc_curve, roc_auc_score
def plot_roc_events(estimator, sig_body, bck_body, label='', normed_channels=True,
                    not_passed_events_sig=0, not_passed_events_bck=0):
    test_all = pandas.concat([sig_body, bck_body])
    pred_all = estimator.predict_proba(test_all)[:, 1]
    sig_ = voting_for_event_svr(sig_body['unique'].values, pred_all[:len(sig_body)])
    sig_mode = voting_for_event_svr(sig_body['unique'].values, numpy.array(sig_body['mode']))
    # events without any reconstructed SV are appended with prediction 0: they never pass
    sig_ = numpy.concatenate([sig_, numpy.zeros(not_passed_events_sig)])
    sample_weight = numpy.ones(len(sig_))
    if normed_channels:
        # reweight so that every signal channel contributes equally; the mask
        # covers only the first len(sig_mode) entries (events that have SVs)
        for val in numpy.unique(sig_mode):
            channel = numpy.zeros(len(sig_), dtype=bool)
            channel[:len(sig_mode)] = (sig_mode == val)
            sample_weight[channel] /= 1. * channel.sum()
    bck_ = voting_for_event_svr(bck_body['unique'].values, pred_all[len(sig_body):])
    bck_ = numpy.concatenate([bck_, numpy.zeros(not_passed_events_bck)])
    labels = [1] * len(sig_) + [0] * len(bck_)
    predictions = numpy.concatenate([sig_, bck_])
    weights = numpy.concatenate([sample_weight, numpy.ones(len(bck_))])
    fpr, tpr, _ = roc_curve(labels, predictions, sample_weight=weights)
    print label, 'AUC:', roc_auc_score(labels, predictions, sample_weight=weights)
    return fpr, tpr
In [10]:
def compute_n_events_pass_threshold(event_number, predictions, threshold):
    # per event, count passing SVRs via bincount, then count events
    # that have at least one passing SVR
    return (numpy.bincount(event_number, weights=predictions > threshold) > 0).sum()
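A small worked example of the bincount trick (values are made up; event numbers must be non-negative integers):

event_number = numpy.array([0, 0, 1, 2, 2])
predictions = numpy.array([0.2, 0.8, 0.1, 0.6, 0.7])
compute_n_events_pass_threshold(event_number, predictions, 0.5)  # -> 2 (events 0 and 2 each have a passing SVR)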
In [11]:
def generate_topo_metric(test_bck, test_sig_mode, total_bck_events,
                         total_sig_mode_events, rate, id_column='event_number'):
    def topo_metric(y_true, y_pred, sample_weight=None):
        # pick the threshold giving the requested background rate,
        # then return the signal efficiency at that threshold
        threshold, _ = calculate_thresholds(test_bck, y_pred[y_true == 0],
                                            total_bck_events, [rate], id_column=id_column)
        threshold = threshold[rate]
        exist_sig = compute_n_events_pass_threshold(numpy.array(test_sig_mode[id_column]),
                                                    y_pred[y_true == 1, 1], threshold)
        return exist_sig * 1. / total_sig_mode_events
    return topo_metric
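The inner function follows the common (y_true, y_pred, sample_weight) metric signature, where y_pred is a full predict_proba output covering both classes; a hypothetical instantiation (all names and the rate value are placeholders):

metric = generate_topo_metric(test_bck, test_sig, total_bck_events, total_sig_events, rate=2500)
metric(y_test, classifier.predict_proba(test_all))  # fraction of signal events passing at that rate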
In [12]:
def final_eff_for_mode(test, prediction, total_events, threshold, id_column='event_number'):
    event_prediction = voting_for_event_svr(numpy.array(test[id_column]), prediction[:, 1])
    # strictly greater, consistent with the threshold convention used above
    exist = numpy.sum(event_prediction > threshold)
    return exist * 1. / total_events
In [13]:
%run scripts/HltSamples.py
In [ ]:
def get_best_svr(data, estimator, count=1):
    # keep every background SVR, but only the `count` highest-scored SVRs
    # from each signal event
    probs = estimator.predict_proba(data)[:, 1]
    train_prob = data.copy()
    train_prob['prediction'] = probs
    good_events = train_prob[train_prob['signal'] == 0].copy()
    add_events = []
    for num, group in train_prob[train_prob['signal'] == 1].groupby('unique'):
        # sort this event's SVRs by prediction, descending, and keep the top `count`
        index = numpy.argsort(group['prediction'].values)[::-1]
        add_events.append(group.iloc[index[:count], :])
    good_events = pandas.concat([good_events] + add_events)
    print len(good_events)
    return good_events
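One plausible use is refining the training sample and refitting (the estimator, training frame, and `variables` list below are placeholders):

reduced_train = get_best_svr(train, estimator, count=2)
estimator.fit(reduced_train[variables], reduced_train['signal'])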