In [1]:
# Load numpy/matplotlib into the interactive namespace (pylab star-import:
# provides numpy, plt, scatter, legend, xlabel, ... used below without prefix).
%pylab inline
In [2]:
# Run the companion notebook; presumably this defines the helpers used later
# (statistic_length, concat_files, prepare_data, calculate_thresholds,
# result_statistic, plot_roc_events) and the empty_events dict -- verify there.
%run additional.ipynb
In [3]:
# Widen column display so long feature strings / tables are not truncated.
pandas.set_option('display.max_colwidth', 140)
In [4]:
# Mode 30000000 is the background (min-bias) sample; every other mode key
# present in empty_events is treated as a signal mode.
bck_train_mode_name = 30000000
sig_train_modes_names = list(set(empty_events.keys()) - {bck_train_mode_name})
# One prepared CSV per signal mode, plus a single background CSV.
sig_train_files = ['mod_{}.csv'.format(name) for name in sig_train_modes_names]
bck_train_files = 'mod_30000000.csv'
folder = "datasets/prepared_hlt_track/"
In [5]:
# concat all signal data
if not os.path.exists(folder + 'signal_track.csv'):
concat_files(folder, sig_train_files, os.path.join(folder , 'signal_track.csv'))
In [6]:
# The prepared files are tab-separated despite the .csv extension.
signal_data = pandas.read_csv(os.path.join(folder, 'signal_track.csv'), sep='\t')
bck_data = pandas.read_csv(os.path.join(folder, bck_train_files), sep='\t')
In [7]:
# Inspect the available columns (rendered as the cell output).
signal_data.columns
Out[7]:
In [8]:
# Track/event counts before any selection (statistic_length from additional.ipynb).
print 'Signal', statistic_length(signal_data)
print 'Bck', statistic_length(bck_data)
In [9]:
total_bck_events = statistic_length(bck_data)['Events'] + empty_events[bck_train_mode_name]
total_signal_events_by_mode = dict()
for mode in sig_train_modes_names:
total_signal_events_by_mode[mode] = statistic_length(signal_data[signal_data['mode'] == mode])['Events'] + empty_events[mode]
In [10]:
print 'Bck:', total_bck_events
# Bare tuple on the last line -> rendered as this cell's Out[] value.
'Signal:', total_signal_events_by_mode
Out[10]:
In [11]:
# Base features used to train all track classifiers below.
variables_base = ["pt", "ipchi2"]
In [12]:
# hlt1 track selection
# NOTE(review): signal_data / bck_data are overwritten in place here, so the
# counting cells above must not be re-run afterwards without reloading the CSVs.
signal_data = signal_data[signal_data['pass_trk'] == 1]
bck_data = bck_data[bck_data['pass_trk'] == 1]
In [13]:
# Counts after the pass_trk preselection.
print 'Signal', statistic_length(signal_data)
print 'Bck', statistic_length(bck_data)
In [14]:
total_signal_events_by_mode_presel = dict()
for mode in sig_train_modes_names:
total_signal_events_by_mode_presel[mode] = statistic_length(signal_data[signal_data['mode'] == mode])['Events']
total_bck_events_presel = statistic_length(bck_data)['Events']
In [15]:
print 'Bck:', total_bck_events_presel
'Signal:', total_signal_events_by_mode_presel
Out[15]:
In [16]:
signal_data.head()
Out[16]:
In [17]:
# Split into train/test halves; 'unique' is the splitting strategy implemented
# in additional.ipynb (presumably by unique event id -- verify there).
ds_train_signal, ds_train_bck, ds_test_signal, ds_test_bck = prepare_data(signal_data, bck_data, 'unique')
In [18]:
# Train-sample sizes.
print 'Signal', statistic_length(ds_train_signal)
print 'Bck', statistic_length(ds_train_bck)
In [19]:
# Combined training frame: background rows first, then signal rows.
train = pandas.concat([ds_train_bck, ds_train_signal])
In [20]:
# Test-sample sizes.
print 'Signal', statistic_length(ds_test_signal)
print 'Bck', statistic_length(ds_test_bck)
In [21]:
test = pandas.concat([ds_test_bck, ds_test_signal])
In [22]:
total_test_bck_events = (total_bck_events - total_bck_events_presel) // 2 + statistic_length(ds_test_bck)['Events']
total_test_signal_events = dict()
for mode in sig_train_modes_names:
total_not_passed_signal = total_signal_events_by_mode[mode] - total_signal_events_by_mode_presel[mode]
total_test_signal_events[mode] = total_not_passed_signal // 2 + \
statistic_length(ds_test_signal[ds_test_signal['mode'] == mode])['Events']
In [23]:
print 'Bck total test events:', total_test_bck_events
# Bare tuple -> rendered as this cell's Out[] value.
'Signal total test events:', total_test_signal_events
Out[23]:
In [24]:
from hep_ml import nnet
from sklearn.metrics import roc_auc_score
In [25]:
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats.distributions import norm
class SupervisedTransform(BaseEstimator, TransformerMixin):
    def __init__(self, scale=0., like_normal=False):
        """
        This transformation applies nonlinear rescale to each axis,
        only order of events is taken into account.
        If sig and bck events are neighbours in some feature, they will be 'splitted' by 'insertion'
        Scale is how big insertion is

        :param scale: size of the gap inserted between neighbouring events of
            different classes (0 disables the separation).
        :param like_normal: if True, additionally map the rescaled values
            through the normal quantile function so each feature becomes
            approximately normally distributed.
        """
        self.like_normal = like_normal
        self.scale = scale

    def fit(self, X, y):
        """Memorize, per feature, a monotone mapping from sorted feature values
        to equally spaced (optionally class-separated) target values.

        :param X: array-like of shape (n_samples, n_features)
        :param y: array-like of class labels, same length as X
        :return: self
        """
        X = numpy.array(X)
        # Fix: make label indexing positional and fancy-index-safe even when y
        # is a plain list or a pandas Series (list[ndarray] raises TypeError,
        # Series[ndarray] indexes by label).
        y = numpy.asarray(y)
        self.initial_values = []
        self.transformed_values = []
        for axis in range(X.shape[1]):
            # Tiny random jitter breaks exact ties so numpy.interp later gets
            # strictly increasing x-coordinates.
            initial_values = X[:, axis] * (1 + 1e-6 * numpy.random.normal(size=len(X)))
            initial_values += 1e-8 * numpy.random.normal(size=len(X))
            indices = numpy.argsort(initial_values)
            initial_values = initial_values[indices]
            self.initial_values.append(initial_values)
            transformed = numpy.arange(len(X), dtype='float')
            # increase the distance between neighs of different classes
            additions = numpy.abs(numpy.diff(y[indices]))
            additions = numpy.cumsum(additions)
            transformed[1:] += additions * self.scale
            # Rescale the targets to the interval [-1, 1].
            transformed /= transformed[-1] / 2.
            transformed -= 1
            if self.like_normal:
                # converting to normal-like distributions: squeeze into
                # [0.05, 0.95] and apply the normal quantile function (ppf).
                transformed -= transformed[0]
                transformed /= transformed[-1] / 0.9
                transformed += 0.05
                transformed = norm().ppf(transformed)
            self.transformed_values.append(transformed)
        return self

    def transform(self, X):
        """Apply the fitted per-axis monotone mapping by linear interpolation.

        :param X: array-like of shape (n_samples, n_features)
        :return: numpy array of transformed values, shape (n_samples, n_features)
        """
        X = numpy.array(X)
        result = []
        for axis, (init_vals, trans_vals) in enumerate(zip(self.initial_values, self.transformed_values)):
            result.append(numpy.interp(X[:, axis], init_vals, trans_vals))
        return numpy.vstack(result).T
In [26]:
models = {}
In [28]:
import cPickle
if os.path.exists('models/hlt1.pkl'):
with open('models/hlt1.pkl', 'r') as file:
models = cPickle.load(file)
In [27]:
from rep_ef.estimators import MatrixNetClassifier
In [29]:
# Train MatrixNet remotely; sync=False means fit() returns immediately and
# the job runs asynchronously on the 'skygrid' service.
# NOTE(review): 'AUTH_HEADERS' looks like a credentials placeholder -- make
# sure no real tokens get committed here.
models['MN'] = MatrixNetClassifier(connection='skygrid', connection_auth='AUTH_HEADERS',
train_features=variables_base,
iterations=5000, sync=False)
models['MN'].fit(train, train['signal'])
Out[29]:
In [30]:
# Feature importances sorted by effect.
# NOTE(review): DataFrame.sort() is long deprecated; newer pandas needs sort_values().
models['MN'].get_feature_importances().sort('effect')
Out[30]:
In [31]:
# Test-set AUC for MatrixNet.
print roc_auc_score(test['signal'], models['MN'].predict_proba(test)[:, 1])
In [32]:
# Small MLP trained on features rescaled by the SupervisedTransform above.
from hep_ml import nnet
from rep.estimators import SklearnClassifier
base_nn = nnet.MLPClassifier(layers=[7, 7], scaler=SupervisedTransform(scale=0., ), epochs=300)
models['special NN'] = SklearnClassifier(base_nn, features=variables_base)
In [33]:
%time models['special NN'].fit(train, train['signal'])
Out[33]:
In [34]:
# Test vs train AUC -- compare the two to spot overfitting.
print roc_auc_score(test['signal'], models['special NN'].predict_proba(test)[:, 1])
print roc_auc_score(train['signal'], models['special NN'].predict_proba(train)[:, 1])
In [35]:
# Theanets neural network with the same supervised scaler.
from rep.estimators import TheanetsClassifier
models['NN'] = TheanetsClassifier(layers=[8, 8], scaler=SupervisedTransform(scale=0., ), features=variables_base)
In [36]:
%time models['NN'].fit(train, train['signal'])
Out[36]:
In [37]:
# Test vs train AUC -- compare the two to spot overfitting.
print roc_auc_score(test['signal'], models['NN'].predict_proba(test)[:, 1])
print roc_auc_score(train['signal'], models['NN'].predict_proba(train)[:, 1])
In [38]:
# Logistic regression on engineered log / log-log features; the
# 'name: expression' strings are parsed as formula features by REP.
from rep.estimators import SklearnClassifier
from sklearn.linear_model import LogisticRegression
models['logistic regression'] = SklearnClassifier(LogisticRegression(C=10),
features=['pt', 'ipchi2',
'llpt: log(log(pt))',
'llip: log(log(ipchi2))',
'lpt: log(pt)', 'lip: log(ipchi2)'])
models['logistic regression'].fit(train, train['signal'])
Out[38]:
In [39]:
# Gradient boosting baseline on the two raw features.
from sklearn.ensemble import GradientBoostingClassifier
models['GB'] = SklearnClassifier(GradientBoostingClassifier(n_estimators=500, learning_rate=0.05,
max_depth=4, subsample=0.3), features=variables_base)
models['GB'].fit(train, train['signal'])
Out[39]:
In [40]:
# For each model, find the probability threshold that matches the target HLT1
# output rates (in Hz), using the absolute background event count computed above.
thresholds = dict()
RATE = [100000., 50000.]
events_pass = dict()
# NOTE(review): events_pass is keyed by rate only and overwritten on every
# model iteration, so after the loop it holds the counts of the *last* model
# -- confirm this is intended before using it in the ROC plot below.
for name, cl in models.items():
prob = cl.predict_proba(ds_test_bck)
thr, result = calculate_thresholds(ds_test_bck, prob, total_test_bck_events, rates=RATE)
thresholds[name] = thr
print name, result
for rate, val in result.items():
events_pass[rate] = val[1]
In [41]:
import cPickle
with open('models/hlt1.pkl', 'w') as file:
cPickle.dump(models, file)
In [42]:
def plot_boundary(name, rate=100000.):
t = thresholds[name][rate]
prob = models[name].predict_proba(ds_test_signal)[:, 1]
plt.figure(figsize=(10, 8))
plt.scatter(numpy.log(ds_test_signal['ipchi2'].values[prob > t]),
numpy.log(ds_test_signal['pt'].values[prob > t]),
alpha=0.01, label='signal')
plt.scatter(numpy.log(ds_test_signal['ipchi2'].values[prob <= t]),
numpy.log(ds_test_signal['pt'].values[prob <= t]),
alpha=0.01, label='misclassified signal', color='r')
plt.legend(loc='best', fontsize=16, scatterpoints=500)
plt.xlabel('log(ipchi2)', fontsize=16)
plt.ylabel('log(pt)', fontsize=16)
plt.title("Estimator: " + name, fontsize=16)
plt.show()
for name in models:
plot_boundary(name)
In [43]:
# Signal vs background test tracks in the (log ipchi2, log pt) plane.
scatter(numpy.log(ds_test_signal['ipchi2'].values), numpy.log(ds_test_signal['pt'].values), alpha=0.01, label='signal')
scatter(numpy.log(ds_test_bck['ipchi2'].values), numpy.log(ds_test_bck['pt'].values), color='r', alpha=0.01, label='background')
legend(loc='best', fontsize=16, scatterpoints=500)
xlabel('log(ipchi2)', fontsize=16)
ylabel('log(pt)', fontsize=16)
title('Tracks description', fontsize=16)
# plt.savefig('hlt1_init.pdf' , format='pdf')
Out[43]:
In [44]:
def plot_eff(name, rate=100000.):
prob = models[name].predict_proba(ds_test_bck)[:, 1]
t = thresholds[name][rate]
cut_pass = prob > t
matr, b1, b2 = numpy.histogram2d(numpy.log(ds_test_bck['ipchi2'].values),
numpy.log(ds_test_bck['pt'].values), bins=(25, 20),
weights=cut_pass)
matr2, b11, b22 = numpy.histogram2d(numpy.log(ds_test_bck['ipchi2'].values),
numpy.log(ds_test_bck['pt'].values), bins=(25, 20))
figsize(10, 8)
p = plt.pcolor(matr / (matr2 + 10e-5))
plt.colorbar(p)
xlabel('log(pt) bins', fontsize=16)
title('Background track efficiency for ' + name, fontsize=16)
ylabel('log(ipchi2) bins', fontsize=16)
show()
for name in models:
plot_eff(name)
In [45]:
# Per-mode signal efficiencies at the 100 kHz and 50 kHz output rates,
# normalized to the absolute test event counts computed earlier.
train_modes_eff, statistic = result_statistic(models, sig_train_modes_names, ds_test_signal,
thresholds, [100000.], total_test_signal_events)
train_modes_eff_5, statistic_5 = result_statistic(models, sig_train_modes_names, ds_test_signal,
thresholds, [50000.], total_test_signal_events)
In [46]:
# Bar chart of per-mode efficiencies at 100 kHz, sorted by the MatrixNet column.
from rep.plotting import BarComparePlot
BarComparePlot(train_modes_eff, sortby=('MN', 100000.0)).plot(new_plot=True, figsize=(24, 8),
ylabel='efficiency', fontsize=22)
lgd = legend(bbox_to_anchor=(0.5, 1.4), loc='upper center', ncol=2, fontsize=22)
# plt.savefig('track.pdf' , format='pdf', bbox_extra_artists=(lgd,), bbox_inches='tight')
In [47]:
# Same comparison at the tighter 50 kHz rate.
BarComparePlot(train_modes_eff_5, sortby=('MN', 50000.0)).plot(new_plot=True, figsize=(24, 8),
ylabel='efficiency', fontsize=22)
lgd = legend(bbox_to_anchor=(0.5, 1.4), loc='upper center', ncol=2, fontsize=22)
# plt.savefig('track_5.pdf' , format='pdf', bbox_extra_artists=(lgd,), bbox_inches='tight')
In [48]:
# REP classification report comparing all trained models on the common test set.
from rep.data import LabeledDataStorage
from rep.report import ClassificationReport
lds = LabeledDataStorage(test, test['signal'])
report = ClassificationReport(models, lds)
In [49]:
# ROC curves for every model.
report.roc()
Out[49]:
In [50]:
# Event-level ROC curves per model (plot_roc_events from additional.ipynb).
plots = dict()
for key, value in models.items():
plots[key] = plot_roc_events(value, ds_test_signal, ds_test_bck, key)
In [51]:
from rep.plotting import FunctionsPlot
FunctionsPlot(plots).plot(new_plot=True, xlim=(0, 0.2), ylim=(0.5, 0.95))
# Vertical dashed lines: background event retention corresponding to the
# 100 / 50 kHz thresholds.
# NOTE(review): events_pass holds values for the *last* model of the
# thresholds loop above -- verify that is the intended reference here.
plot([1. * events_pass[100000.] / statistic_length(ds_test_bck)['Events']] * 2, [0., 1], 'g--', label='rate: 100 kHz')
plot([1. * events_pass[50000.] / statistic_length(ds_test_bck)['Events']] * 2, [0., 1], 'b--', label='rate: 50 kHz')
lgd = legend(loc='best', fontsize=16)