In [1]:
import numpy
import pandas

# NOTE(review): sklearn.cross_validation was deprecated in 0.18 and removed
# in 0.20 — on modern scikit-learn use sklearn.model_selection instead.
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.utils import column_or_1d

In [2]:
def get_higgs_data(train_file):
    """Read the Higgs-challenge training CSV and normalize event weights.

    Parameters
    ----------
    train_file : str or file-like
        CSV with an 'EventId' index column, a 'Label' column
        ('s' = signal, 'b' = background) and a 'Weight' column.

    Returns
    -------
    data : pandas.DataFrame
        Feature columns only ('Label' and 'Weight' are dropped).
    answers : numpy.ndarray of int
        1 for signal events, 0 for background events.
    weights : numpy.ndarray of float
        Event weights divided by the mean weight of their own class,
        so each class ends up with mean weight 1.
    """
    data = pandas.read_csv(train_file, index_col='EventId')
    answers_bs = numpy.ravel(data.Label)
    # cast to float so the in-place division below works even if the
    # Weight column happens to be integer-typed
    weights = numpy.ravel(data.Weight).astype(float)
    data = data.drop(['Label', 'Weight'], axis=1)
    # numpy.int was removed in NumPy 1.20; the builtin int is equivalent here
    answers = numpy.zeros(len(answers_bs), dtype=int)
    answers[answers_bs == 's'] = 1
    # normalize weights separately inside each class
    for label in [0, 1]:
        weights[answers == label] /= weights[answers == label].mean()
    return data, answers, weights

In [3]:
def compute_ams_on_cuts(answers, predictions, sample_weight):
    """Squared AMS value at every threshold of the weighted ROC curve.

    Predictions are probabilities. Returns (thresholds, radicands) where
    sqrt(radicand) is the AMS at the corresponding cut.
    """
    # roc_curve returns (fpr, tpr, thresholds): background efficiency,
    # signal efficiency, and the cut values producing them
    bg_eff, sig_eff, thresholds = roc_curve(answers, predictions,
                                            sample_weight=sample_weight)
    # scale efficiencies up to the expected event counts of the full dataset
    real_s = 691.988607712
    real_b = 410999.847322
    s = sig_eff * real_s
    b = bg_eff * real_b
    br = 10.  # regularization term of the AMS formula
    radicands = 2 * ((s + b + br) * numpy.log(1.0 + s / (b + br)) - s)
    return thresholds, radicands

def optimal_AMS(answers, predictions, sample_weight):
    """Best achievable AMS over all probability cuts.

    Predictions are probabilities.
    """
    _, radicands = compute_ams_on_cuts(answers, predictions, sample_weight)
    best_radicand = numpy.max(radicands)
    return numpy.sqrt(best_radicand)


def precisionAt15(answers, predictions, sample_weight, percent=0.15, ratio=50):
    """Weighted precision among the `percent` most signal-like events.

    Weights are rescaled per class so that the background class has mean
    weight `ratio` and the signal class has mean weight 1, i.e. background
    is counted `ratio` times heavier when computing the precision.

    Parameters
    ----------
    answers : numpy.ndarray
        True labels (1 = signal, 0 = background).
    predictions : numpy.ndarray
        Predicted signal probabilities.
    sample_weight : numpy.ndarray
        Event weights; not modified (a copy is rescaled internally).
    percent : float
        Fraction of the highest-probability events to evaluate on.
    ratio : float
        Background-to-signal reweighting factor (previously a hard-coded
        constant of 50; default preserves the old behavior).

    Returns
    -------
    float
        Weighted mean of `answers` among the selected top events.
    """
    n_passed = int(len(answers) * percent)
    weight = sample_weight.copy()
    # background mean weight -> ratio, signal mean weight -> 1
    weight[answers == 0] /= weight[answers == 0].mean() / ratio
    weight[answers == 1] /= weight[answers == 1].mean()
    # indices ordered by decreasing predicted probability
    order = numpy.argsort(-predictions)
    passed = order[:n_passed]
    return numpy.average(answers[passed], weights=weight[passed])

In [4]:
def print_control_metrics(proba_test, proba_train):
    """Print ROC AUC, optimal AMS and precision@15% for test and train
    predictions, one metric per line as: name, test value, train value.

    NOTE(review): relies on the notebook-global split variables
    (testY, testW, trainY, trainW) and on Python 2 print statements.
    """
    for name, metrics in [('ROC', roc_auc_score), ('AMS', optimal_AMS), ('precision', precisionAt15)]:
        print name,
        print metrics(testY, proba_test, sample_weight=testW), 
        print metrics(trainY, proba_train, sample_weight=trainW)

Читаем данные


In [5]:
# load features, 0/1 labels and per-class-normalized event weights
X, y, weights = get_higgs_data('/mnt/w76/notebook/datasets/higgs/training.csv')

In [6]:
# 51%/49% train/test split; fixed random_state keeps the split reproducible
trainX, testX, trainY, testY, trainW, testW = train_test_split(X, y, weights, train_size=0.51, random_state=42)

Тренируем классификатор


In [7]:
from rep.classifiers import EventFilterClassifier

In [8]:
# remote EventFilter (MatrixNet) classifier, trained via the given service URL
ef = EventFilterClassifier(url_base='w80.h.cern.yandex.net', iterations=10000)

In [9]:
# fit on the training half using the normalized event weights
ef.fit(trainX, trainY, sample_weight=trainW)


Out[9]:
EventFilterClassifier(baseline=None, boarder=0.5, command_line_params=None,
           dataset_name='dataset-notebook_196a6188-5883-43e3-b702-523a9b18034d--851684',
           dump_filename=None,
           features=['Feature_0', 'Feature_1', 'Feature_2', 'Feature_3', 'Feature_4', 'Feature_5', 'Feature_6', 'Feature_7', 'Feature_8', 'Feature_9', 'Feature_10', 'Feature_11', 'Feature_12', 'Feature_13', 'Feature_14', 'Feature_15', 'Feature_16', 'Feature_17', 'Feature_18', 'Feature_19', 'Feature_20', 'Feature_21', 'Feature_22', 'Feature_23', 'Feature_24', 'Feature_25', 'Feature_26', 'Feature_27', 'Feature_28', 'Feature_29'],
           features_sample_rate_per_iteration=1.0, formula_name='EF',
           intervals=64, iterations=10000, max_features_per_iteration=6,
           password=None, regularization=0.01, sync=True,
           training_fraction=0.5, url_base='w80.h.cern.yandex.net',
           user=None)

In [10]:
# test-set AMS vs. boosting stage (every 10th iteration)
# NOTE(review): ylim/plot/grid are pylab globals — presumably a %pylab
# inline magic ran before this cell; confirm when re-running
ylim(3.5, 4.0)
plot([optimal_AMS(testY, p[:, 1], sample_weight=testW) for p in ef.staged_predict_proba(testX, step=10)])
grid()



In [11]:
# test-set precision@15% vs. boosting stage (every 10th iteration)
plot([precisionAt15(testY, p[:, 1], sample_weight=testW, ) for p in ef.staged_predict_proba(testX, step=10)])
grid()



In [12]:
# test-set ROC AUC vs. boosting stage (every 10th iteration)
plot([roc_auc_score(testY, p[:, 1], sample_weight=testW) for p in ef.staged_predict_proba(testX, step=10)])
grid()



In [13]:
# control numbers computed with the in-memory (remote) classifier
print_control_metrics(ef.predict_proba(testX)[:, 1], ef.predict_proba(trainX)[:, 1])


ROC 0.93583436349 0.961031817164
AMS 3.71746365006 5.41899018231
precision 0.266316147498 0.394846517557

Сохраняем обученный классификатор


In [14]:
# dump the trained formula as MatrixNet .mx binary
with open('formula.mx', 'wb') as mx:
    mx.write(ef.formula_mx)

Загружаем формулу


In [22]:
from _matrixnetapplier import MatrixnetClassifier
from StringIO import StringIO

In [23]:
# read the serialized formula back from disk
with open('formula.mx', 'rb') as mx:
    formula = mx.read()

In [24]:
# standalone applier reconstructed from the .mx bytes (no service needed)
clf = MatrixnetClassifier(StringIO(formula))

In [25]:
# predictions from the restored formula
test_predictions = clf.apply(testX)
train_predictions = clf.apply(trainX)

Проверяем контрольные цифры


In [26]:
# sanity check: the restored formula should reproduce the control numbers
# printed for the in-memory classifier
print_control_metrics(test_predictions, train_predictions)


ROC 0.935834364399 0.961031816114
AMS 3.71746365006 5.41899018231
precision 0.266316147498 0.394846517557