In [48]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['clf']
`%matplotlib` prevents importing * from pylab and numpy

In [1]:
import pandas
from sklearn.utils import column_or_1d
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.cross_validation import train_test_split

In [2]:
from utils import *

In [3]:
X, y, weights = get_higgs_data('../../../datasets/higgs/training.csv')

In [4]:
trainX, testX, trainY, testY, trainW, testW = train_test_split(X, y, weights, train_size=0.51, random_state=42)

Loading the formula


In [24]:
from _matrixnetapplier import MatrixnetClassifier
from StringIO import StringIO
with open('formula.mx', 'rb') as mx:
    formula_mx = mx.read()
    
clf = MatrixnetClassifier(StringIO(formula_mx))

Re-representing the formula as a list of tuples


In [25]:
list_classifier = utils.convert_mx_to_list(formula_mx)

In [8]:
def print_quality(predictor):
    print_control_metrics(trainY, predictor(trainX.astype('float32')), trainW,
                          testY, predictor(testX.astype('float32')), testW)

Checking the control metrics (columns: train, test)


In [26]:
print_quality(list_classifier.decision_function)


ROC 0.935834364399 0.961031907788
AMS 3.71746365006 5.41910368034
precision 0.266316147498 0.394854326874
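
For reference, AMS is the Approximate Median Significance used in the HiggsML challenge. A minimal sketch of the formula — the `ams` helper below is illustrative, not a function from `utils`:

In [ ]:
import numpy

def ams(s, b, b_reg=10.):
    # Approximate Median Significance from the HiggsML challenge:
    # AMS = sqrt(2 * ((s + b + b_reg) * ln(1 + s / (b + b_reg)) - s)),
    # where s and b are the weighted true/false positive sums and
    # b_reg = 10 is the regularization term fixed by the challenge.
    return numpy.sqrt(2 * ((s + b + b_reg) * numpy.log(1 + s / (b + b_reg)) - s))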

Testing different loss functions


In [10]:
import sys
sys.path.insert(0, '..')

In [38]:
from pruning import simple_pruner, utils
reload(simple_pruner)


Out[38]:
<module 'pruning.simple_pruner' from '../pruning/simple_pruner.py'>

In [39]:
from hep_ml.losses import BinomialDevianceLossFunction, AdaLossFunction, CompositeLossFunction

In [41]:
losses = {}
losses['exploss'] = AdaLossFunction()
losses['logloss'] = BinomialDevianceLossFunction()
losses['composite'] = CompositeLossFunction()
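
The three losses are the exponential (AdaBoost) loss, the binomial deviance (logistic) loss, and hep_ml's composite loss, which mixes the two (one term per class). For intuition, a minimal sketch of the first two on signed margins — illustrative helpers, with y in {-1, +1} and y_pred the decision function:

In [ ]:
import numpy

def exp_loss(y, y_pred):
    # exponential (AdaBoost) loss: exp(-y * y_pred)
    return numpy.exp(-y * y_pred)

def log_loss(y, y_pred):
    # binomial deviance (logistic) loss: log(1 + exp(-y * y_pred))
    return numpy.log1p(numpy.exp(-y * y_pred))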

In [75]:
%%time
new_obdts = {}
for loss_name, loss_function in losses.items():
    for regularization in [25., 50.]:
        name = '{} {}'.format(loss_name, regularization)
        new_obdts[name] = simple_pruner.select_trees(
            trainX, trainY, trainW,
            initial_mx_formula=formula_mx,
            loss_function=loss_function,
            iterations=100, n_candidates=100,
            learning_rate=0.1, regularization=regularization)


CPU times: user 26min 44s, sys: 24 ms, total: 26min 44s
Wall time: 26min 44s
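
Conceptually (judging by the parameter names), select_trees performs gradient-boosting-style greedy pruning: at every iteration it samples n_candidates trees from the original formula, regularizes their leaf values, and keeps the tree that best decreases the loss on the training sample. A schematic, self-contained sketch of that greedy loop with a plain exponential loss — a hypothetical greedy_select helper, not the actual simple_pruner implementation:

In [ ]:
import numpy

def greedy_select(tree_predictions, y, w, iterations=100, learning_rate=0.1):
    # Schematic greedy pruning (an illustration, not simple_pruner itself):
    # tree_predictions is an [n_trees, n_samples] array of per-tree outputs,
    # y is in {-1, +1}, w are sample weights. At each step the tree whose
    # addition minimizes the weighted exponential loss is appended.
    pred = numpy.zeros(tree_predictions.shape[1])
    chosen = []
    for _ in range(iterations):
        losses = [w.dot(numpy.exp(-y * (pred + learning_rate * t)))
                  for t in tree_predictions]
        best = int(numpy.argmin(losses))
        chosen.append(best)
        pred += learning_rate * tree_predictions[best]
    return chosen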

In [76]:
from rep.report import ClassificationReport
from rep.data import LabeledDataStorage

In [77]:
from rep.data import DataStorage
features = new_obdts.values()[0].features
lds = LabeledDataStorage(DataStorage(pandas.DataFrame(testX, columns=features)),
                         labels=testY, sample_weight=testW)

In [78]:
report = ClassificationReport(new_obdts, lds=lds)

In [79]:
from rep.report.metrics import OptimalAMS, RocAuc

In [80]:
report.learning_curve(metric=RocAuc())


Out[80]:
[learning curve plot: ROC AUC vs. number of selected trees, one curve per loss/regularization]
In [81]:
report.learning_curve(metric=OptimalAMS())


Out[81]:
[learning curve plot: optimal AMS vs. number of selected trees, one curve per loss/regularization]
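
Besides learning curves, the same report can overlay the ROC curves of all pruned models on the test sample. A one-liner sketch, assuming REP's ClassificationReport.roc() plotting method:

In [ ]:
# Overlay the test-sample ROC curves of all pruned models
# (assumes REP's ClassificationReport exposes roc(), returning a plotter).
report.roc().plot()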

In [ ]: