In [1]:
import numpy
import pandas
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.utils import column_or_1d
In [2]:
def get_higgs_data(train_file):
    """Read the Higgs training table and normalize event weights per class.

    Parameters
    ----------
    train_file : str or file-like
        Anything accepted by ``pandas.read_csv``. The table must contain the
        columns ``EventId`` (used as the index), ``Label`` and ``Weight``.

    Returns
    -------
    data : pandas.DataFrame
        The table without the ``Label`` and ``Weight`` columns.
    answers : numpy.ndarray of int
        1 where ``Label == 's'``, 0 everywhere else.
    weights : numpy.ndarray of float
        Copy of ``Weight`` rescaled so that the mean weight inside each
        class (0 and 1) equals 1.
    """
    data = pandas.read_csv(train_file, index_col='EventId')
    # Builtin ``int`` instead of the ``numpy.int`` alias, which newer NumPy
    # releases no longer provide.
    answers = (numpy.asarray(data['Label']) == 's').astype(int)
    # Explicit float copy: the in-place normalization below must neither
    # write through to the DataFrame nor run integer division.
    weights = numpy.array(data['Weight'], dtype=float)
    data = data.drop(['Label', 'Weight'], axis=1)
    for label in [0, 1]:
        mask = answers == label
        # A class with no events has nothing to normalize (and its mean
        # would be NaN), so it is skipped.
        if mask.any():
            weights[mask] /= weights[mask].mean()
    return data, answers, weights
In [3]:
def compute_ams_on_cuts(answers, predictions, sample_weight):
    """Evaluate the AMS radicand at every threshold produced by ``roc_curve``.

    Predictions are probabilities.

    Returns a pair ``(thresholds, radicands)`` of equally long arrays; the
    AMS at a given threshold is the square root of its radicand.
    """
    # normalization constants applied to the signal / background rates
    real_s = 691.988607712
    real_b = 410999.847322
    # constant added to the background term
    br = 10.

    fpr, tpr, thresholds = roc_curve(answers, predictions, sample_weight=sample_weight)
    s = tpr * real_s
    b = fpr * real_b

    total = s + b + br
    log_term = numpy.log(1.0 + s / (b + br))
    radicands = 2 * (total * log_term - s)
    return thresholds, radicands
def optimal_AMS(answers, predictions, sample_weight):
    """Return the largest AMS over all cuts. Predictions are probabilities.

    The radicands come from ``compute_ams_on_cuts``; the result is the
    square root of the largest one.
    """
    _thresholds, radicands = compute_ams_on_cuts(answers, predictions, sample_weight)
    best_radicand = numpy.max(radicands)
    return numpy.sqrt(best_radicand)
def precisionAt15(answers, predictions, sample_weight, percent=0.15):
    """Weighted precision among the highest-scored ``percent`` of events.

    Parameters
    ----------
    answers : array-like of 0/1 labels.
    predictions : array-like of scores; higher means more signal-like.
    sample_weight : array-like of per-event weights (not modified).
    percent : fraction of events that pass the selection.

    Returns
    -------
    The weighted average of ``answers`` over the ``int(len(answers) * percent)``
    events with the highest predictions. Before averaging, weights are
    rescaled so that class 1 has mean weight 1 and class 0 has mean weight
    ``RATIO``.
    """
    # Coerce to ndarrays so that lists and pandas objects are indexed by
    # position and by boolean mask, as the code below expects.
    answers = numpy.asarray(answers)
    predictions = numpy.asarray(predictions)
    # Float copy: keeps the caller's array untouched and guarantees true
    # division even when integer weights are passed.
    weight = numpy.array(sample_weight, dtype=float)

    n_passed = int(len(answers) * percent)
    RATIO = 50
    is_background = answers == 0
    is_signal = answers == 1
    weight[is_background] /= weight[is_background].mean() / RATIO
    weight[is_signal] /= weight[is_signal].mean()

    # Negate the scores so that argsort puts the highest first.
    order = numpy.argsort(-predictions)
    passed = order[:n_passed]
    return numpy.average(answers[passed], weights=weight[passed])
In [4]:
def print_control_metrics(proba_test, proba_train):
    """Print ROC AUC, optimal AMS and precision for test and train scores.

    One line is printed per metric: ``<name> <test value> <train value>``.

    Labels and weights are not parameters: they are read from the notebook
    globals ``testY``/``testW`` and ``trainY``/``trainW``, which must exist
    before this function is called.
    """
    for name, metrics in [('ROC', roc_auc_score), ('AMS', optimal_AMS), ('precision', precisionAt15)]:
        test_value = metrics(testY, proba_test, sample_weight=testW)
        train_value = metrics(trainY, proba_train, sample_weight=trainW)
        # A single parenthesized string prints the same line under the
        # Python 2 print statement and the Python 3 print function.
        print('%s %s %s' % (name, test_value, train_value))
In [5]:
# Load the training table: features, 0/1 labels and per-class normalized weights.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
X, y, weights = get_higgs_data('/mnt/w76/notebook/datasets/higgs/training.csv')
In [6]:
# Split features, labels and weights consistently (train_size=0.51, fixed random_state).
# print_control_metrics reads testY/testW/trainY/trainW from these globals.
trainX, testX, trainY, testY, trainW, testW = train_test_split(X, y, weights, train_size=0.51, random_state=42)
In [7]:
from rep.classifiers import EventFilterClassifier
In [8]:
# Classifier from rep.classifiers, configured with a host name and 10000 iterations.
# NOTE(review): url_base looks like an internal host — presumably training runs on a remote service; confirm availability.
ef = EventFilterClassifier(url_base='w80.h.cern.yandex.net', iterations=10000)
In [9]:
# Train on the train part of the split, passing the per-event weights.
ef.fit(trainX, trainY, sample_weight=trainW)
Out[9]:
In [10]:
# Optimal AMS on the test part for each staged prediction (step=10); column 1 of each
# staged probability array is used as the score (presumably the signal class — confirm).
# NOTE(review): ylim/plot/grid are not imported in this notebook — presumably provided by %pylab; confirm.
ylim(3.5, 4.0)
plot([optimal_AMS(testY, p[:, 1], sample_weight=testW) for p in ef.staged_predict_proba(testX, step=10)])
grid()
In [11]:
# precisionAt15 (default percent=0.15) on the test part for each staged prediction (step=10).
# NOTE(review): plot/grid are not imported in this notebook — presumably provided by %pylab; confirm.
plot([precisionAt15(testY, p[:, 1], sample_weight=testW, ) for p in ef.staged_predict_proba(testX, step=10)])
grid()
In [12]:
# Weighted ROC AUC on the test part for each staged prediction (step=10).
# NOTE(review): plot/grid are not imported in this notebook — presumably provided by %pylab; confirm.
plot([roc_auc_score(testY, p[:, 1], sample_weight=testW) for p in ef.staged_predict_proba(testX, step=10)])
grid()
In [13]:
# Control metrics of the fitted classifier; column 1 of predict_proba is used as the score
# for both the test and the train part.
print_control_metrics(ef.predict_proba(testX)[:, 1], ef.predict_proba(trainX)[:, 1])
In [14]:
# Write the classifier's formula_mx attribute to formula.mx in binary mode.
with open('formula.mx', 'wb') as mx:
    mx.write(ef.formula_mx)
In [22]:
from _matrixnetapplier import MatrixnetClassifier
from StringIO import StringIO
In [23]:
# Read the saved formula back from formula.mx in binary mode.
with open('formula.mx', 'rb') as mx:
    formula = mx.read()
In [24]:
# Build a standalone applier from the formula contents; StringIO exposes them as an
# in-memory file-like object.
clf = MatrixnetClassifier(StringIO(formula))
In [25]:
# Score both parts of the split with the standalone applier.
# NOTE(review): unlike predict_proba above, no column is selected from apply()'s output —
# presumably it already returns one score per event; confirm.
test_predictions = clf.apply(testX)
train_predictions = clf.apply(trainX)
In [26]:
# Same control metrics as printed for `ef`, now for the scores of the applier built from formula.mx.
print_control_metrics(test_predictions, train_predictions)