In [48]:
# NOTE(review): %pylab is deprecated — it does `from numpy import *` and
# `from matplotlib.pylab import *`, polluting the global namespace. Prefer
# `%matplotlib inline` plus explicit imports. Execution count 48 also shows
# the notebook was not run top-to-bottom; re-run with Restart & Run All.
%pylab inline
In [1]:
# Core imports. NOTE(review): `sklearn.cross_validation` is the pre-0.18
# module path (removed in 0.20); modern scikit-learn exposes
# `train_test_split` from `sklearn.model_selection`. Left as-is because the
# rest of the notebook (StringIO, reload) targets the legacy Python 2 stack.
import pandas
from sklearn.utils import column_or_1d
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.cross_validation import train_test_split
In [2]:
# Star import — presumably provides get_higgs_data and print_control_metrics
# used below. NOTE(review): `import *` hides where names come from; prefer
# explicit imports.
from utils import *
In [3]:
# Load the Higgs challenge training set: feature matrix X, labels y, and
# per-event sample weights.
X, y, weights = get_higgs_data('../../../datasets/higgs/training.csv')
In [4]:
# Weighted train/test split; train_size=0.51 keeps roughly half the events
# for training. Fixed random_state makes the split reproducible.
trainX, testX, trainY, testY, trainW, testW = train_test_split(X, y, weights, train_size=0.51, random_state=42)
In [24]:
# Read the serialized MatrixNet formula (raw bytes) and wrap it in the
# applier. NOTE(review): StringIO here is the Python 2 module; on Python 3
# this would need io.BytesIO since the file is opened in binary mode.
from _matrixnetapplier import MatrixnetClassifier
from StringIO import StringIO
with open('formula.mx', 'rb') as mx:
    formula_mx = mx.read()
clf = MatrixnetClassifier(StringIO(formula_mx))
In [25]:
# Convert the binary MatrixNet formula into a plain list-of-trees classifier.
# NOTE(review): the name `utils` is only bound by the LATER cell In[38]
# (`from pruning import simple_pruner, utils`) — `from utils import *` above
# does not bind the module name itself. This cell therefore fails under
# Restart & Run All; move the sys.path tweak (In[10]) and the pruning
# imports (In[38]) above this cell.
list_classifier = utils.convert_mx_to_list(formula_mx)
In [8]:
def print_quality(predictor):
print_control_metrics(trainY, predictor(trainX.astype('float32')), trainW,
testY, predictor(testX.astype('float32')), testW,)
In [26]:
print_quality(lambda X: list_classifier.decision_function(X))
In [10]:
# Make the parent directory importable — it contains the `pruning` package.
import sys
sys.path.insert(0, '..')
In [38]:
from pruning import simple_pruner, utils
# Pick up live edits to simple_pruner during development; `reload` is the
# Python 2 builtin. Harmless on a fresh kernel.
reload(simple_pruner)
Out[38]:
In [39]:
from hep_ml.losses import BinomialDevianceLossFunction, AdaLossFunction, CompositeLossFunction
In [41]:
losses = {}
losses['exploss'] = AdaLossFunction()
losses['logloss'] = BinomialDevianceLossFunction()
losses['composite'] = CompositeLossFunction()
In [75]:
%%time
new_obdts = {}
for loss_name, loss_function in losses.items():
for regularization in [25., 50.]:
new_obdts['{} {}'.format(loss_name, regularization)] = simple_pruner.select_trees(trainX, trainY, trainW, initial_mx_formula=formula,
loss_function=loss_function,
iterations=100, n_candidates=100,
learning_rate=0.1, regularization=regularization, )
In [76]:
from rep.report import ClassificationReport
from rep.data import LabeledDataStorage
In [77]:
from rep.data import DataStorage
# Wrap the test half in REP's storage, naming the columns after the feature
# list of one pruned model. NOTE(review): `new_obdts.values()[0]` is
# Python 2 syntax (dict views are not indexable on Python 3) and assumes all
# pruned models share the same feature list — TODO confirm.
lds = LabeledDataStorage(DataStorage(pandas.DataFrame(testX, columns=new_obdts.values()[0].features)), labels=testY, sample_weight=testW)
In [78]:
# Joint classification report over all pruned models, evaluated on the
# held-out half wrapped above.
report = ClassificationReport(new_obdts, lds=lds)
In [79]:
from rep.report.metrics import OptimalAMS, RocAuc
In [80]:
# Quality as a function of the number of trees: ROC AUC ...
report.learning_curve(metric=RocAuc())
Out[80]:
In [81]:
# ... and the Higgs challenge's Approximate Median Significance at the
# optimal decision threshold.
report.learning_curve(metric=OptimalAMS())
Out[81]:
In [ ]: