In [15]:
%matplotlib inline
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [2]:
data = pd.read_csv("training.csv")
In [3]:
X = data.drop(['EventId', 'Weight', 'Label'], axis=1).values
y = data['Label'].values
w = data['Weight'].values
s_weights = w.sum()
s_s_weights = w[y == 's'].sum()  # total signal weight
s_b_weights = w[y == 'b'].sum()  # total background weight
In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
    X, y, w, test_size=0.2, random_state=0)
In [5]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
clf = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=2),
                         n_estimators=100)
In [6]:
# balanced classification: rescale the weights so that signal and
# background carry equal total weight during training
def balance_weights(w, y):
    w_balanced = w.copy()  # keep the original weights intact for the AMS
    w_balanced[y == 's'] /= s_s_weights
    w_balanced[y == 'b'] /= s_b_weights
    return w_balanced
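As a quick sanity check (a sketch, not part of the original notebook): after balancing, the two classes should carry roughly the same total weight on any random subset, here about 0.8 each for the 80% training split.
In [ ]:
wb = balance_weights(w_train, y_train)
# both sums should be close to the training fraction (about 0.8)
wb[y_train == 's'].sum(), wb[y_train == 'b'].sum()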
In [7]:
%%time
_ = clf.fit(X_train, y_train, sample_weight=balance_weights(w_train, y_train))
In [8]:
from sklearn.metrics import accuracy_score
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred, sample_weight=balance_weights(w_test, y_test))
Out[8]:
The Approximate Median Significance
\begin{equation*}
\text{AMS} = \sqrt{ 2 \left( (s + b + 10) \ln \left( 1 + \frac{s}{b + 10} \right) - s \right) }
\end{equation*}
where $s$ and $b$ are the sums of signal and background weights, respectively, in the selection region.
In [9]:
def AMS(s, b):
    # Approximate Median Significance of a selection region with
    # total signal weight s and total background weight b
    assert s >= 0
    assert b >= 0
    b_reg = 10.  # regularization term
    return math.sqrt(2. * ((s + b + b_reg) * math.log(1. + s / (b + b_reg)) - s))
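For intuition, with made-up round numbers (not taken from the challenge data): a selection region holding 100 units of signal weight on top of 1000 units of background weight scores an AMS of about 3.1.
In [ ]:
AMS(100., 1000.)  # ≈ 3.10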
In [10]:
y_pred_proba = clf.predict_proba(X_test)
We sort the validation indices in increasing order of the signal score (higher score = more signal-like).
In [11]:
tiis = y_pred_proba[:, 1].argsort()
The weights have to be renormalized to the same sum as in the full set.
In [12]:
w_factor = float(len(data)) / len(X_test)
Initializing $s$ and $b$ to the full sums of signal and background weights, respectively, we start with all points in the selection region.
In [13]:
s = w_test[y_test == 's'].sum()
b = w_test[y_test == 'b'].sum()
amss will contain the AMS after each point is moved out of the selection region in the sorted validation set. ams_max will hold the best validation AMS, and threshold will be the smallest score among the selected points. We do len(tiis) iterations, so amss[-1] is the AMS when only the point with the highest score is selected.
In [16]:
amss = np.empty(len(tiis))
ams_max = 0
threshold = 0.0
for ti in range(len(tiis)):
    # don't forget to renormalize the weights to the same sum
    # as in the complete training set
    amss[ti] = AMS(max(0, s * w_factor), max(0, b * w_factor))
    if amss[ti] > ams_max:
        ams_max = amss[ti]
        threshold = y_pred_proba[tiis[ti], 1]
    # move the point with the ti-th lowest score out of the selection region
    if y_test[tiis[ti]] == 's':
        s -= w_test[tiis[ti]]
    else:
        b -= w_test[tiis[ti]]
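The same scan can be written without an explicit Python loop. Here is an equivalent vectorized sketch using cumulative sums; it recomputes the initial sums, since the loop above has consumed s and b, and should reproduce amss exactly.
In [ ]:
s0 = w_test[y_test == 's'].sum()
b0 = w_test[y_test == 'b'].sum()
w_sorted = w_test[tiis]
is_sig = y_test[tiis] == 's'
# cumulative weight moved out of the selection *before* each step
s_out = np.concatenate(([0.], np.cumsum(w_sorted * is_sig)[:-1]))
b_out = np.concatenate(([0.], np.cumsum(w_sorted * ~is_sig)[:-1]))
s_sel = np.maximum((s0 - s_out) * w_factor, 0.)
b_sel = np.maximum((b0 - b_out) * w_factor, 0.)
amss_vec = np.sqrt(2. * ((s_sel + b_sel + 10.) * np.log1p(s_sel / (b_sel + 10.)) - s_sel))
np.allclose(amss_vec, amss)  # should be True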
In [17]:
ams_max
Out[17]:
In [18]:
threshold
Out[18]:
In [19]:
plt.plot(amss)
Out[19]:
In [20]:
test = pd.read_csv("test.csv")
In [21]:
scores = clf.predict_proba(test.drop('EventId', axis=1).values)
In [22]:
test['RankOrder'] = scores[:, 1].argsort().argsort() + 1  # double argsort: 1-based rank of each score
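The double argsort is worth unpacking: the first argsort orders the indices by increasing score, and the second turns that ordering into each element's rank. A tiny illustration with made-up scores:
In [ ]:
demo = np.array([0.2, 0.9, 0.5, 0.7])
demo.argsort()                # array([0, 2, 3, 1]): indices in increasing score order
demo.argsort().argsort() + 1  # array([1, 4, 2, 3]): 1-based rank of each score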
In [23]:
test['Class'] = np.where(scores[:, 1] < threshold, 'b', 's')
In [24]:
test.loc[:, ['EventId', 'RankOrder', 'Class']].to_csv("submission.csv", index=False)
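As a final check (a sketch, not part of the original notebook), the written file should contain exactly the three expected columns:
In [ ]:
pd.read_csv("submission.csv").head()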