Concat files


In [1]:
import pandas
import numpy
import os

In [2]:
CHUNKSIZE = 5000
def concat_files(folder, names, outfile):
    """Concatenate several tab-separated files into `outfile`, reading them in chunks."""
    head = True
    mode = 'w'
    for name in names:
        chunk_iterator = pandas.read_csv(os.path.join(folder, name), sep='\t', iterator=True, chunksize=CHUNKSIZE)
        for chunk in chunk_iterator:
            # write the header only once, then append every following chunk
            chunk.to_csv(outfile, header=head, index=False, sep='\t', mode=mode)
            head = False
            mode = 'a'
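
A minimal usage sketch; the folder and file names below are hypothetical:


In [ ]:
concat_files('datasets', ['signal_part1.csv', 'signal_part2.csv'], 'signal_all.csv')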

Train test split


In [3]:
from rep.utils import train_test_split_group
def prepare_data(signal_data, bck_data, group_column, random_state=13):
    # split by group (event id), so all SVRs of one event end up in the same half
    ds_train_signal, ds_test_signal = train_test_split_group(signal_data[group_column], signal_data,
                                                             train_size=0.5, test_size=0.5,
                                                             random_state=random_state)

    ds_train_bck, ds_test_bck = train_test_split_group(bck_data[group_column], bck_data,
                                                       train_size=0.5, test_size=0.5,
                                                       random_state=random_state)

    return ds_train_signal, ds_train_bck, ds_test_signal, ds_test_bck
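
A toy check of the grouped split; the data here are made up, the real samples are read elsewhere:


In [ ]:
toy_signal = pandas.DataFrame({'unique': [1, 1, 2, 2], 'x': [0.1, 0.2, 0.3, 0.4]})
toy_bck = pandas.DataFrame({'unique': [5, 5, 6, 6], 'x': [0.5, 0.6, 0.7, 0.8]})
train_sig, train_bck, test_sig, test_bck = prepare_data(toy_signal, toy_bck, group_column='unique')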

Statistic


In [ ]:
def statistic_length(data):
    return {'Events': len(numpy.unique(data['unique'])), 'SVR': len(data)}

In [4]:
# def statistic_length(data):
#     events_ids = data['mode'] * 100000000 + data['event_number']
#     return {'Events': len(numpy.unique(events_ids)), 'SVR': len(data)}

In [ ]:
from collections import defaultdict, OrderedDict
def result_statistic(models, modes, data, thresholds, rates, total_events):
    # dict((name, rate): dict(mode: eff))
    modes_eff = defaultdict(OrderedDict)
    statistic = defaultdict(list)
    for name, cl in models.items():
        for mode in modes:
            sig_mode = data[data['mode'] == mode]
            if len(sig_mode) <= 0:
                continue
            statistic['mode'].append(mode)
            statistic['classifier'].append(name)
            # Samples (decay-mode descriptions) is defined by %run scripts/HltSamples.py below
            latex_name = '$' + Samples[str(mode)]['root'].replace("#", "\\") + '$'
            statistic['name'].append(latex_name)
            sig_prediction = cl.predict_proba(sig_mode)[:, 1]
            sig_event_prediction = voting_for_event_svr(sig_mode['event_number'], sig_prediction)
            for rate in rates:
                # important: greater, not >=
                thr = thresholds[name][rate]
                exist_sig = numpy.sum(sig_event_prediction > thr)
                eff = exist_sig * 1. / total_events[mode]
                statistic[rate].append(eff * 100.)
                modes_eff[(name, rate)][latex_name] = eff
    return modes_eff, statistic

Empty events


In [5]:
empty_events = dict()
# count events without SVRs per decay mode; assumes the log reports each mode
# as a 'Mode: <id>' line followed by a 'No sv: <count>' line
with open('generate_hlt1.log', 'r') as log_file:
    for line in log_file:
        if 'Mode' in line:
            mode = int(line.strip().split(':')[-1])
        elif 'No sv' in line:
            empty_events[mode] = int(line.strip().split(':')[-1])

In [6]:
def print_eff_dict(eff):
    for key, val in eff.items():
        print key
        for k, v in val.items():
            print k, ':', v

Metric definition


In [7]:
import numpy

In [8]:
from pprint import pprint
from collections import defaultdict, OrderedDict

def calculate_thresholds(test_bck, y_pred_bck, total_bck_events, rates, id_column='event_number'):
    # allowed number of background events for each output rate
    # (rates are taken relative to 1e6 input events)
    required_size = dict([(rate, int(rate * total_bck_events / 1e6)) for rate in rates])
    # per-event score: maximum SVR prediction within each background event
    bck_event_prediction = voting_for_event_svr(test_bck[id_column], y_pred_bck[:, 1])
    thresholds = dict()
    result = dict()
    for rate, req_size in required_size.items():
        # threshold such that only `req_size` background events score above it
        threshold = numpy.percentile(bck_event_prediction, 100. * (1 - req_size * 1. / len(bck_event_prediction)))
        thresholds[rate] = threshold
        exist_bck = numpy.sum(bck_event_prediction > threshold)
        result[rate] = (threshold, exist_bck, 1. * exist_bck / total_bck_events)
    return thresholds, result

def voting_for_event_svr(ids, prediction):
    # per-event prediction: take the maximum SVR score among all SVRs of the same event
    df = pandas.DataFrame({'prediction': numpy.array(prediction), 'id': numpy.array(ids)})
    return numpy.array(df.groupby('id')['prediction'].max())
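
A toy check of the per-event voting (numbers are illustrative):


In [ ]:
# event 1 has two SVRs, event 2 has one: the per-event scores are the maxima, i.e. [0.9, 0.4]
voting_for_event_svr([1, 1, 2], [0.2, 0.9, 0.4])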

In [9]:
from sklearn.metrics import roc_curve, roc_auc_score
def plot_roc_events(estimator, sig_body, bck_body, label='', normed_channels=True, not_passed_events_sig=0, 
                    not_passed_events_bck=0):
    test_all = pandas.concat([sig_body, bck_body])
    pred_all = estimator.predict_proba(test_all)[:, 1]
    # per-event score: max-voting over the SVRs of each signal event
    sig_ = voting_for_event_svr(sig_body['unique'].values, pred_all[:len(sig_body)])
    sig_mode = voting_for_event_svr(sig_body['unique'].values, numpy.array(sig_body['mode']))
    # signal events without any SVR are appended with score 0 (they can never pass)
    sig_ = numpy.concatenate([sig_, numpy.zeros(not_passed_events_sig)])

    sample_weight = numpy.ones(len(sig_))
    if normed_channels:
        # give each signal channel (mode) the same total weight
        for val in numpy.unique(sig_mode):
            sample_weight[sig_mode == val] /= 1. * len(sample_weight[sig_mode == val])

    bck_ = voting_for_event_svr(bck_body['unique'].values, pred_all[len(sig_body):])
    bck_ = numpy.concatenate([bck_, numpy.zeros(not_passed_events_bck)])
    fpr, tpr, _ = roc_curve([1] * len(sig_) + [0] * len(bck_), numpy.concatenate([sig_, bck_]),
                           sample_weight=numpy.concatenate([sample_weight, numpy.ones(len(bck_))]))
    print label, 'AUC:', roc_auc_score([1] * len(sig_) + [0] * len(bck_), numpy.concatenate([sig_, bck_]),
                                       sample_weight=numpy.concatenate([sample_weight, numpy.ones(len(bck_))]))
    return fpr, tpr

In [10]:
def compute_n_events_pass_threshold(event_number, predictions, threshold):
    # an event passes if at least one of its SVRs has a prediction above the threshold
    return (numpy.bincount(event_number, weights=predictions > threshold) > 0).sum()
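
A quick illustration with made-up numbers:


In [ ]:
# events 0 and 2 each have an SVR above 0.5, event 1 does not -> 2 events pass
compute_n_events_pass_threshold(numpy.array([0, 0, 1, 2]), numpy.array([0.1, 0.7, 0.3, 0.9]), 0.5)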

In [11]:
def generate_topo_metric(test_bck, test_sig_mode, total_bck_events,
                         total_sig_mode_events, rate, id_column='event_number'):
    def topo_metric(y_true, y_pred, sample_weight=None):
        # threshold tuned on the background part of the prediction to give the requested rate
        threshold, _ = calculate_thresholds(test_bck, y_pred[y_true == 0],
                                            total_bck_events, [rate], id_column=id_column)
        threshold = threshold[rate]
        # signal efficiency: fraction of signal events with at least one SVR above the threshold
        exist_sig = compute_n_events_pass_threshold(numpy.array(test_sig_mode[id_column]), y_pred[y_true == 1, 1], threshold)
        return exist_sig * 1. / total_sig_mode_events
    return topo_metric
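
A hedged usage sketch of the generated metric; the variable names below are illustrative and assume prepared test samples and a two-column predict_proba output:


In [ ]:
# metric_at_rate = generate_topo_metric(test_bck, test_sig_mode, total_bck_events, total_sig_mode_events, rate)
# metric_at_rate(y_true, classifier.predict_proba(test_all))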

In [12]:
def final_eff_for_mode(test, prediction, total_events, threshold, id_column='event_number'):
    # fraction of events of this mode whose per-event (max) prediction reaches the threshold
    event_prediction = voting_for_event_svr(numpy.array(test[id_column]), prediction[:, 1])
    exist = numpy.sum(event_prediction >= threshold)
    return exist * 1. / total_events
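
A toy check of the per-mode efficiency (numbers are made up):


In [ ]:
toy_test = pandas.DataFrame({'event_number': [0, 0, 1]})
toy_pred = numpy.array([[0.9, 0.1], [0.4, 0.6], [0.8, 0.2]])
# event 0 reaches 0.6 >= 0.5, event 1 only 0.2, so 1 of 4 total events passes -> 0.25
final_eff_for_mode(toy_test, toy_pred, total_events=4, threshold=0.5)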

In [13]:
%run scripts/HltSamples.py

Rank SVR


In [ ]:
def get_best_svr(data, estimator, count=1):
    # keep all background SVRs and, for each signal event, only the `count` highest-scoring SVRs
    probs = estimator.predict_proba(data)[:, 1]
    train_prob = data.copy()
    train_prob['prediction'] = probs

    good_events = train_prob[train_prob['signal'] == 0].copy()
    add_events = []
    for num, group in train_prob[train_prob['signal'] == 1].groupby('unique'):
        # sort the SVRs of this signal event by prediction, descending
        index = numpy.argsort(group['prediction'].values)[::-1]
        add_events.append(group.iloc[index[:count], :])
    good_events = pandas.concat([good_events] + add_events)
    print len(good_events)
    return good_events
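
A hedged usage sketch; `train_data` (with 'signal' and 'unique' columns) and `baseline_classifier` are assumed to exist:


In [ ]:
# keep all background SVRs plus the two best-ranked SVRs of every signal event
# train_ranked = get_best_svr(train_data, baseline_classifier, count=2)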