In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
In [2]:
import numpy
import root_numpy
# import pandas - no pandas today
from astropy.table import Table
from sklearn.metrics import roc_auc_score
from scipy.special import logit
In [3]:
# theano imports
import theano
from theano import tensor as T
from theano.tensor.nnet import softplus
from theano.tensor.extra_ops import bincount
In [4]:
features = [
# track itself
'eta', 'partPt', 'partP',
# track and B
'cos_diff_phi', 'proj', 'diff_eta', 'ptB', 'R_separation', 'proj_T', 'proj_T2',
# PID
'PIDNNe', 'PIDNNk', 'PIDNNm', 'ghostProb',
# IP
'IP', 'IPerr', 'IPs', 'IPPU',
# Other
'veloch', 'partlcs', 'EOverP',
# deleted as probably inappropriate:
# 'phi',
# 'diff_pt', 'nnkrec',
# 'max_PID_mu_e', 'max_PID_mu_k', 'sum_PID_k_e', 'sum_PID_mu_e', 'max_PID_k_e', 'sum_PID_mu_k',
]
In [5]:
# data = Table(root_numpy.root2array('../datasets/MC/csv/WG/Bu_JPsiK/2012/Tracks.root', stop=3000))
data = Table(root_numpy.root2array('../datasets/MC/csv/WG/Bu_JPsiK/2012/Tracks.root', stop=30000000))
data['label'] = (data['signB'] * data['signTrack']) > 0
data['cos_diff_phi'] = numpy.cos(data['diff_phi'])
data['diff_pt'] = data['ptB'] - data['partPt']
data['R_separation'] = numpy.sqrt(data['diff_eta'] ** 2 + (1 - data['cos_diff_phi']) ** 2)
# projection in transverse plane
data['proj_T'] = data['cos_diff_phi'] * data['partPt']
data['proj_T2'] = data['cos_diff_phi'] * data['partPt'] * data['ptB']
data = data[data['ghostProb'] < 0.4]
data = data[numpy.isfinite(data['IPs'])]
In [6]:
groups = data['run'] + data['event'].astype(int) * (data['run'].max() + 1)
_, data['group_column'] = numpy.unique(groups, return_inverse=True)
data = data.group_by('group_column')
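The pairing run + event * (max_run + 1) gives every (run, event) combination a distinct integer, and numpy.unique(..., return_inverse=True) then compresses those integers into consecutive group ids. A toy illustration (made-up values):
_runs = numpy.array([1, 1, 2])
_events = numpy.array([7, 8, 7])
# pairing: 1 + 7*3 = 22, 1 + 8*3 = 25, 2 + 7*3 = 23  ->  group ids [0, 2, 1]
print numpy.unique(_runs + _events * (_runs.max() + 1), return_inverse=True)[1]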
In [7]:
import sys
sys.path.insert(0, '../')
In [8]:
def compute_weights(data, attention):
"""
Weights are normalized within each event: higher attention means higher weight.
"""
assert len(numpy.shape(attention)) == 1
weights = numpy.exp(attention)
sum_weights = numpy.bincount(data['group_column'], weights=weights)
return weights / (sum_weights[data['group_column']] + 1)
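A quick sanity check on a toy table (made-up values): within an event the normalized weights sum to s / (s + 1) with s = sum(exp(attention)), always below 1, so no single event can dominate the training set.
_toy = Table({'group_column': numpy.array([0, 0, 1])})
print compute_weights(_toy, numpy.array([0.0, 0.0, -2.0]))
# event 0: two tracks at 1/3 each; event 1: exp(-2) / (exp(-2) + 1) ~ 0.12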
In [9]:
def compute_auc_with_attention(data, track_proba, track_attention):
assert track_proba.shape == (len(data), 2)
assert len(track_attention) == len(data)
tracks_weights = compute_weights(data, track_attention)
event_predictions = numpy.bincount(
data['group_column'], weights=logit(track_proba[:, 1]) * data['signTrack'] * tracks_weights)
B_signs = data['signB'].group_by(data['group_column']).groups.aggregate(numpy.mean)
return roc_auc_score(B_signs, event_predictions)
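For reference, the event-level discriminant assembled here is d(B) = sum_i w_i * logit(p_i) * q_i over the tracks of an event, where w_i are the attention-derived weights, p_i is the predicted probability that the track and the B have the same sign, and q_i is the track charge (signTrack); the ROC AUC is then measured between d(B) and the true B signs.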
In [10]:
from folding_group import FoldingGroupClassifier
from decisiontrain import DecisionTrainClassifier
In [11]:
base_clf = DecisionTrainClassifier(n_estimators=1000, learning_rate=0.03, n_threads=len(features),
train_features=features, max_features=0.9)
In [12]:
%%time
dt = FoldingGroupClassifier(base_clf, n_folds=2, group_feature='group_column')
_ = dt.fit(data[features + ['group_column']].to_pandas(), data['label'])
In [13]:
# raw quality
B_signs = data['signB'].group_by(data['group_column']).groups.aggregate(numpy.mean)
track_proba = dt.predict_proba(data.to_pandas())
predictions = numpy.bincount(data['group_column'], weights=logit(track_proba[:, 1]) * data['signTrack'])
roc_auc_score(B_signs, predictions)
Out[13]:
In [14]:
compute_auc_with_attention(data,
track_proba=dt.predict_proba(data.to_pandas()),
track_attention=numpy.zeros(len(data)) - 2)
Out[14]:
In [16]:
%%time
_n_tracks = numpy.bincount(data['group_column'])[data['group_column']]
_weights = (_n_tracks > 5) & (_n_tracks < 40)
dt_on_filtered = FoldingGroupClassifier(base_clf, n_folds=2, group_feature='group_column')
_ = dt_on_filtered.fit(data[features + ['group_column']].to_pandas(), data['label'], sample_weight=_weights)
In [17]:
# on filtered dataset
compute_auc_with_attention(data,
track_proba=dt_on_filtered.predict_proba(data.to_pandas()),
track_attention=numpy.zeros(len(data)) - 2)
Out[17]:
In [18]:
sorted(zip(dt.estimators[0].feature_importances_ + dt.estimators[1].feature_importances_, features))
Out[18]:
In [20]:
long_dt = FoldingGroupClassifier(
DecisionTrainClassifier(n_estimators=3000, learning_rate=0.02, max_features=0.9,
n_threads=len(features), train_features=features),
n_folds=2, group_feature='group_column')
In [21]:
_ = long_dt.fit(data[features + ['group_column']].to_pandas(),
data['label'],
sample_weight=compute_weights(data, numpy.zeros(len(data)) - 2.))
In [22]:
for i, p in enumerate(long_dt.staged_predict_proba(data.to_pandas()), 1):
if i % 5 == 0:
print compute_auc_with_attention(data, track_proba=p, track_attention=numpy.zeros(len(data)) - 2.)
In [23]:
weights = compute_weights(data, attention=numpy.zeros(len(data)))
In [24]:
dt_simpleweights = FoldingGroupClassifier(base_clf, n_folds=2, group_feature='group_column')
dt_simpleweights.fit(data[features + ['group_column']].to_pandas(), data['label'], sample_weight=weights);
In [25]:
compute_auc_with_attention(data,
track_proba=dt_simpleweights.predict_proba(data.to_pandas()),
track_attention=numpy.zeros(len(data)))
Out[25]:
In [26]:
n_tracks = numpy.bincount(data['group_column'])
In [27]:
plt.plot(numpy.bincount(n_tracks) * numpy.arange(max(n_tracks) + 1))
plt.xlabel('n_tracks in event')
plt.ylabel('n_tracks in total')
Out[27]:
In [28]:
# plt.plot(* zip(*(
# [i, roc_auc_score(B_signs[n_tracks == i], predictions[n_tracks == i])]
# for i in range(2, 60)
# )))
In [29]:
correctness = logit(track_proba[:, 1]) * (2 * data['label'] - 1)
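correctness is the per-track signed margin: the logit of the predicted same-sign probability times the true label sign, so it is positive exactly when the track classifier voted for the right B sign, and its magnitude reflects confidence.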
In [30]:
for percentile in [50, 60, 70, 80]:
dt_attention = FoldingGroupClassifier(base_clf, n_folds=2, group_feature='group_column')
dt_attention.fit(data[features + ['group_column']].to_pandas(),
correctness > numpy.percentile(correctness, percentile))
attention = logit(dt_attention.predict_proba(data[features + ['group_column']].to_pandas())[:, 1])
if percentile == 70:
stable_attention = attention.copy()
attention_weights = compute_weights(data, attention)
dt_classifier = FoldingGroupClassifier(base_clf, n_folds=2, group_feature='group_column')
dt_classifier.fit(data[features + ['group_column']].to_pandas(),
data['label'], sample_weight=attention_weights)
print percentile, compute_auc_with_attention(data, dt_classifier.predict_proba(data.to_pandas()), attention)
In [31]:
_proba = dt.predict_proba(data.to_pandas())
for alpha in [-3, -2, -1, 0]:
for beta in numpy.linspace(0.5, 1.2, 5):
print alpha, '\t', beta, '\t', compute_auc_with_attention(data, _proba, alpha + beta * stable_attention)
In [32]:
# lazy start: initialize attention at zero
_attention = numpy.zeros(len(data))
_correctness = numpy.zeros(len(data))
for iteration in range(5):
dt_classifier = FoldingGroupClassifier(base_clf, n_folds=2, group_feature='group_column')
dt_classifier.fit(data[features + ['group_column']].to_pandas(),
data['label'], sample_weight=compute_weights(data, _attention))
_correctness = logit(dt_classifier.predict_proba(data.to_pandas())[:, 1]) * (2 * data['label'] - 1)
print compute_auc_with_attention(data, dt_classifier.predict_proba(data.to_pandas()), _attention)
dt_attention = FoldingGroupClassifier(base_clf, n_folds=2, group_feature='group_column')
dt_attention.fit(data[features + ['group_column']].to_pandas(),
_correctness > numpy.percentile(_correctness, 70))
_attention = logit(dt_attention.predict_proba(data.to_pandas())[:, 1])
print compute_auc_with_attention(data, dt_classifier.predict_proba(data.to_pandas()), _attention)
In [33]:
# TODO fast orders
# from astropy.table import Column
# def compute_orders(indices, predictions):
# '''less is better'''
# assert len(indices) == len(predictions)
# result = numpy.zeros(len(predictions))
# for index in numpy.unique(indices):
# result[indices == index] = numpy.argsort(numpy.argsort( - predictions[indices == index]))
# return result
In [34]:
# orders = compute_orders(data['group_column'], correctness)
In [35]:
# for percentile in [60, 70, 80]:
# for n_tracks in [3, 5, 7]:
# # lazy start
# _attention = numpy.zeros(len(data))
# dt_classifier = FoldingGroupClassifier(base_clf, n_folds=2, group_feature='group_column')
# dt_classifier.fit(data[features + ['group_column']].to_pandas(),
# data['label'], sample_weight=compute_weights(data, _attention))
# _correctness = logit(dt_classifier.predict_proba(data.to_pandas())[:, 1]) * (2 * data['label'] - 1)
# dt_attention = FoldingGroupClassifier(base_clf, n_folds=2, group_feature='group_column')
# dt_attention.fit(data[features + ['group_column']].to_pandas(),
# (_correctness > numpy.percentile(_correctness, percentile)) * (orders < n_tracks) )
# _attention = logit(dt_attention.predict_proba(data.to_pandas())[:, 1])
# print n_tracks, '\t', percentile, '\t', \
# compute_auc_with_attention(data, dt_classifier.predict_proba(data.to_pandas()), _attention)
In [36]:
sorted(zip(dt_classifier.estimators[0].feature_importances_, features))
Out[36]:
In [37]:
sorted(zip(dt_attention.estimators[0].feature_importances_, features))
Out[37]:
In [38]:
def loss_function(b_signs, event_indices, d_tracksign, d_is_tagging):
track_contributions = T.exp(d_is_tagging)
track_contributions /= (T.extra_ops.bincount(event_indices, track_contributions) + 1)[event_indices]
d_B = bincount(event_indices, d_tracksign * track_contributions)
return T.mean(T.exp(- b_signs * d_B))
# return T.mean(b_signs * d_B)
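A minimal evaluation sketch (toy values, one two-track event), just to show that the symbolic loss compiles and can be evaluated at flat attention:
_d = T.vector()
_toy_loss = loss_function(b_signs=numpy.array([1.0]), event_indices=numpy.array([0, 0]),
                          d_tracksign=numpy.array([0.5, -0.3]), d_is_tagging=_d)
print theano.function([_d], _toy_loss)(numpy.zeros(2, dtype=theano.config.floatX) - 2)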
In [39]:
from sklearn.base import BaseEstimator
class AttentionLoss(BaseEstimator):
def __init__(self):
pass
def fit(self, X, y, sample_weight=None):
# self.sample_weight = numpy.require(sample_weight, dtype='float32')
self.sample_weight = numpy.ones(len(X))
self.y_signed = numpy.require(2 * y - 1, dtype='float32')
_, first_positions, event_indices = numpy.unique(X['group_column'].values, return_index=True, return_inverse=True)
self.b_signs = numpy.array(X['signB'].values)[first_positions]
d_is_tagging_var = T.vector()
self.Loss = loss_function(
self.b_signs, event_indices,
d_tracksign=numpy.array(X['trackpredictions'].values),
d_is_tagging=d_is_tagging_var
)
self.grad = theano.function([d_is_tagging_var], - T.grad(self.Loss, d_is_tagging_var))
return self
def prepare_tree_params(self, pred):
_grad = numpy.sign(self.grad(pred))
return _grad / numpy.std(_grad), self.sample_weight
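Note the trick in prepare_tree_params: only the sign of the analytic gradient is kept (a crude form of gradient clipping), then rescaled to unit standard deviation, so the boosting step does not depend on the raw gradient magnitudes.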
In [40]:
data2 = data.copy()
data2['trackpredictions'] = logit(dt.predict_proba(data.to_pandas())[:, 1])
In [41]:
# _base = DecisionTrainClassifier(loss=AttentionLoss(), n_estimators=1000,
# learning_rate=0.03, n_threads=16, train_features=features)
# _features = features + ['group_column', 'signB', 'trackpredictions']
# dt_grouping = FoldingGroupClassifier(_base, n_folds=2, group_feature='group_column', train_features=_features)
# dt_grouping.fit(data2[_features].to_pandas(),
# data2['label']);
In [42]:
# attention = logit(dt_grouping.predict_proba(data2.to_pandas())[:, 1])
In [43]:
from rep.metaml import FoldingRegressor
from decisiontrain import DecisionTrainRegressor
In [44]:
simple_weights = compute_weights(data, attention=numpy.zeros(len(data)))
In [45]:
# Try training on the percentile target. So far the most stable option
# folding = FoldingRegressor(DecisionTrainRegressor(n_estimators=1000))
# folding.fit(data2[features].to_pandas(), correctness > numpy.percentile(correctness, 70), sample_weight=simple_weights);
In [46]:
# Training directly on correctness works poorly
# folding = FoldingRegressor(DecisionTrainRegressor(n_estimators=1000))
# folding.fit(data2[features].to_pandas(), correctness / numpy.std(correctness), sample_weight=simple_weights);
In [47]:
# Try training on the order (rank within the event). Better than nothing, worse than the percentile
# folding = FoldingRegressor(DecisionTrainRegressor(n_estimators=1000))
# folding.fit(data2[features].to_pandas(), 1.4 ** - orders, sample_weight=simple_weights);
In [48]:
# Try training on the rank transform of correctness
folding = FoldingRegressor(DecisionTrainRegressor(n_estimators=1000))
folding.fit(data2[features].to_pandas(),
numpy.argsort(numpy.argsort(correctness)) / float(len(correctness)) - 0.5,
sample_weight=simple_weights);
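The double argsort is a rank transform: argsort(argsort(x)) yields the 0-based rank of each element, so the regression target is uniform on [-0.5, 0.5) regardless of how correctness is distributed. For example:
_x = numpy.array([0.3, -1.0, 2.0])
print numpy.argsort(numpy.argsort(_x)) / float(len(_x)) - 0.5   # ranks 1, 0, 2 -> [-0.167, -0.5, 0.167]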
In [49]:
# Try training on the sign of correctness
# folding = FoldingRegressor(DecisionTrainRegressor(n_estimators=1000))
# folding.fit(data2[features].to_pandas(), numpy.sign(correctness), sample_weight=simple_weights);
In [50]:
# _correctness
In [51]:
attention = folding.predict(data2.to_pandas())
In [52]:
plt.hist(attention, bins=40);
In [53]:
compute_auc_with_attention(data, track_proba=dt.predict_proba(data2.to_pandas()),
track_attention=attention - 1.5
)
Out[53]:
In [54]:
compute_auc_with_attention(data, track_proba=dt.predict_proba(data2.to_pandas()),
track_attention=compute_weights(data, attention) - 2
)
Out[54]:
In [55]:
compute_auc_with_attention(data, track_proba=dt.predict_proba(data2.to_pandas()), track_attention=attention * 0 - 3)
Out[55]:
In [56]:
compute_auc_with_attention(data[:10**6],
track_proba=dt.predict_proba(data2.to_pandas())[:10 ** 6],
track_attention=attention[:10 ** 6] * 0 - 3)
Out[56]:
In [12]:
from sklearn.base import BaseEstimator
class DropoutLoss(BaseEstimator):
def __init__(self, p=0.0):
self.p = p
def fit(self, X, y, sample_weight=None):
self.sample_weight = numpy.require(sample_weight, dtype='float32')
self.y_signed = numpy.require(2 * y - 1, dtype='float32')
_, first_positions, self.event_indices = numpy.unique(X['group_column'].values, return_index=True, return_inverse=True)
# self.b_signs = numpy.array(X['signB'].values)[first_positions]
# track_z = - w(track) * sign(track) * sign(B)
self.track_z = - self.sample_weight * self.y_signed
self.event_losses = numpy.ones(len(first_positions))
# just in case: force all sample weights to exactly one
self.sample_weight **= 0.0
return self
def prepare_tree_params(self, pred):
# normally, the event decision is weight1 * pred1 * sign1 + weight2 * pred2 * sign2 + ...
# and the loss is exp(- b_sign * decision)
# with dropout, each track's factor is averaged over keep/drop: p + (1 - p) * exp(pred * track_z)
track_exponents = numpy.exp(pred * self.track_z)
track_multipliers = self.p + (1 - self.p) * track_exponents
self.event_losses[:] = 1
numpy.multiply.at(self.event_losses, self.event_indices, track_multipliers)
grad = - self.event_losses[self.event_indices] / track_multipliers * track_exponents * self.track_z
return grad, self.sample_weight
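A numeric sanity check on a toy frame (hypothetical group ids, p=0; pandas is available anyway via .to_pandas()): without dropout the per-event loss collapses to exp(sum_i pred_i * track_z_i), the plain exponential loss, and at pred = 0 the returned gradient equals y_signed, pushing every track towards its correct sign.
import pandas
_X = pandas.DataFrame({'group_column': [0, 0, 1]})
_l = DropoutLoss(p=0.0).fit(_X, y=numpy.array([1, 0, 1]), sample_weight=numpy.ones(3))
_grad, _ = _l.prepare_tree_params(numpy.zeros(3))
print _grad   # [ 1. -1.  1.] == y_signed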
In [58]:
# x = numpy.bincount(data['group_column'], weights=logit(_p[:, 1]) * (2 * data['label'] - 1) )
In [59]:
# plt.hist(x, range=[-2, 2], bins=21);
In [60]:
for p in [0.0]:
_dropout_dt = DecisionTrainClassifier(loss=DropoutLoss(p=p), n_estimators=1000,
learning_rate=0.03, n_threads=16, train_features=features)
_features = features + ['group_column']
dt_dropout = FoldingGroupClassifier(_dropout_dt, n_folds=2, group_feature='group_column', train_features=_features)
dt_dropout.fit(data[_features].to_pandas(), data['label'])
print None  # separator between dropout settings
for i, _p in enumerate(dt_dropout.staged_predict_proba(data.to_pandas()), 1):
if i % 2 == 0:
print compute_auc_with_attention(data, track_proba=_p, track_attention=numpy.zeros(len(data)) - 2),
print p
In [ ]:
from hep_ml.losses import AdaLossFunction
In [ ]:
_base_ada = DecisionTrainClassifier(loss=AdaLossFunction(), n_estimators=2000,
learning_rate=0.06, n_threads=16, train_features=features)
dt_exploss = FoldingGroupClassifier(_base_ada, n_folds=2, group_feature='group_column', train_features=features + ['group_column'])
_ = dt_exploss.fit(data[features + ['group_column']].to_pandas(), data['label'],
sample_weight=compute_weights(data, numpy.zeros(len(data)) - 2. ) )
In [ ]:
for i, _p in enumerate(dt_exploss.staged_predict_proba(data.to_pandas()), 1):
if i % 3 == 0:
print compute_auc_with_attention(data, track_proba=_p, track_attention=numpy.zeros(len(data)) - 2)
In [ ]:
from scipy.special import logit, expit
print compute_auc_with_attention(data, track_proba=expit(logit(_p)), track_attention=numpy.zeros(len(data)) - 2)
print compute_auc_with_attention(data, track_proba=expit(logit(_p) * 2.), track_attention=numpy.zeros(len(data)) - 2)
print compute_auc_with_attention(data, track_proba=expit(logit(_p) * 0.5), track_attention=numpy.zeros(len(data)) - 2)
In [13]:
from sklearn.base import BaseEstimator
class GroupLogLoss(BaseEstimator):
def __init__(self):
pass
def fit(self, X, y, sample_weight=None):
self.sample_weight = numpy.require(sample_weight, dtype='float32')
self.y_signed = numpy.require(2 * y - 1, dtype='float32')
_, first_positions, self.event_indices = numpy.unique(X['group_column'].values,
return_index=True, return_inverse=True)
self.track_z = - self.sample_weight * self.y_signed
d = T.vector()
d_b = bincount(self.event_indices, weights=self.track_z * d)
self.Loss = T.sum(T.nnet.softplus(d_b)) * 2
self.grad = theano.function([d], -T.grad(self.Loss, d))
return self
def prepare_tree_params(self, pred):
return self.grad(pred), self.sample_weight
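Since track_z = - weight * y_signed, the event score d_b is minus the event margin, so softplus(d_b) = log(1 + exp(-margin)) is exactly the logistic (log) loss per event; the factor of 2 only rescales the gradient.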
In [17]:
_base_dt_log = DecisionTrainClassifier(loss=GroupLogLoss(), n_estimators=2000,
learning_rate=0.05, n_threads=len(features), train_features=features)
dt_logloss = FoldingGroupClassifier(_base_dt_log, n_folds=2, group_feature='group_column',
train_features=features + ['group_column'])
_ = dt_logloss.fit(data[features + ['group_column']].to_pandas(), data['label'],
# sample_weight=compute_weights(data, numpy.zeros(len(data)) - 2. )
)
In [18]:
for i, _p in enumerate(dt_logloss.staged_predict_proba(data.to_pandas()), 1):
if i % 1 == 0:
print compute_auc_with_attention(data, track_proba=_p, track_attention=numpy.zeros(len(data)) - 2)
In [ ]:
assert 0 == 1  # intentional stop: everything below is exploratory scratch
In [ ]:
# filtering did not help
# _n_tracks = numpy.bincount(data['group_column'])[data['group_column']]
# _weights = (_n_tracks > 5) & (_n_tracks < 40)
# dt_logloss_filtered = FoldingGroupClassifier(_base_dt_log, n_folds=2, group_feature='group_column',
# train_features=features + ['group_column'])
# _ = dt_logloss_filtered.fit(data[features + ['group_column']].to_pandas(), data['label'],
# sample_weight=_weights)
In [ ]:
# for i, _p in enumerate(dt_logloss_filtered.staged_predict_proba(data.to_pandas()), 1):
# if i % 2 == 0:
# print compute_auc_with_attention(data, track_proba=_p, track_attention=numpy.zeros(len(data)) - 2)
In [ ]:
# two losses: fine-tune one, then re-fit the other, and alternate
# requires an incremental "continue training" method
In [ ]:
from hep_ml.nnet import MLPClassifier
In [ ]:
nn2 = FoldingGroupClassifier(MLPClassifier(layers=[30, 20], epochs=500, scaler='iron'), n_folds=2, group_feature='group_column')
nn2.fit(data[features + ['group_column']].to_pandas(), data['label'], sample_weight=weights);
In [ ]:
track_proba = nn2.predict_proba(data.to_pandas())
predictions = numpy.bincount(data['group_column'], weights=logit(track_proba[:, 1]) * data['signTrack'] )
roc_auc_score(B_signs, predictions)
In [ ]:
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras.optimizers import RMSprop
In [ ]:
# train-test splitting manually
N = len(data) // 2
train = data[data['group_column'] < numpy.median(data['group_column'])]
test = data[data['group_column'] >= numpy.median(data['group_column'])]
_, test['group_column'] = numpy.unique(test['group_column'], return_inverse=True)
_, train['group_column'] = numpy.unique(train['group_column'], return_inverse=True)
B_signs_test = test.group_by('group_column')['signB'].groups.aggregate(numpy.mean)
In [ ]:
model = Sequential()
model.add(BatchNormalization(input_shape=(len(features),)))
model.add(Dense(200, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(200, activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy',
optimizer=RMSprop(lr=1e-4),
metrics=['accuracy', ])
for _ in range(7):
model.fit(
train[features].to_pandas().values,
to_categorical(train['label']),
nb_epoch=1, verbose=2,
validation_data=[test[features].to_pandas().values, to_categorical(test['label'])]
)
track_proba = model.predict_proba(test[features].to_pandas().values, verbose=2)
predictions = numpy.bincount(test['group_column'],
weights=(logit(track_proba[:, 1]) * test['signTrack']))
print roc_auc_score(B_signs_test, predictions)
In [ ]:
from hep_ml.nnet import MLPClassifier, MLPBase, AbstractNeuralNetworkBase
In [ ]:
AbstractNeuralNetworkBase??
In [ ]:
def prepare(self):
activation = lambda x: x
for i, layer in list(enumerate(self.layers_))[1:]:
W = self._create_matrix_parameter('W' + str(i), self.layers_[i - 1], self.layers_[i])
# act=activation and W_=W are tricks to avoid lambda-capturing
if i == 1:
activation = lambda x, act=activation, W_=W: T.dot(act(x), W_)
else:
activation = lambda x, act=activation, W_=W: T.dot(T.tanh(act(x)), W_)
return activation
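The act=activation, W_=W default arguments are the whole point of the comment above: Python closures capture variables by reference, so without the defaults every layer's lambda would see only the last W. A minimal demonstration of the pitfall:
fns = [lambda: i for i in range(3)]
print [f() for f in fns]            # [2, 2, 2] -- late binding
fns = [lambda i=i: i for i in range(3)]
print [f() for f in fns]            # [0, 1, 2] -- captured via default argument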