In [4]:
#https://github.com/yandexdataschool/flavours-of-physics-start/blob/master/flatness_boosting.ipynb
import pandas
import evaluation
# FIX: the hep_ml module is `hep_ml.gradientboosting` (all lowercase) --
# `hep_ml.ugradientBoosting` does not exist and raises ModuleNotFoundError.
# The spelling below matches the explicit import used later in this notebook.
import hep_ml.gradientboosting
In [1]:
#https://www.kaggle.com/benhamner/flavours-of-physics/rf-xgboost-example/code
# NOTE(review): dead cell -- the RF + XGBoost baseline below is disabled by wrapping
# it in a triple-quoted string literal, so nothing here executes. It is kept only as
# a provenance reference for the original Kaggle starter kernel; consider deleting it,
# since cells of commented-out code obscure the notebook's narrative.
'''
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
print("Load the training/test data using pandas")
train = pd.read_csv("../input/training.csv")
test = pd.read_csv("../input/test.csv")
print("Eliminate SPDhits, which makes the agreement check fail")
features = list(train.columns[1:-5])
print("Train a Random Forest model")
rf = RandomForestClassifier(n_estimators=100, random_state=1)
rf.fit(train[features], train["signal"])
print("Train a XGBoost model")
params = {"objective": "binary:logistic",
"eta": 0.3,
"max_depth": 5,
"min_child_weight": 3,
"silent": 1,
"subsample": 0.7,
"colsample_bytree": 0.7,
"seed": 1}
num_trees=250
gbm = xgb.train(params, xgb.DMatrix(train[features], train["signal"]), num_trees)
print("Make predictions on the test set")
test_probs = (rf.predict_proba(test[features])[:,1] +
gbm.predict(xgb.DMatrix(test[features])))/2
submission = pd.DataFrame({"id": test["id"], "prediction": test_probs})
submission.to_csv("rf_xgboost_submission.csv", index=False)
'''
Out[1]:
In [1]:
import numpy as np
import pandas as pd
from hep_ml.losses import BinFlatnessLossFunction
from hep_ml.gradientboosting import UGradientBoostingClassifier
In [2]:
# https://www.kaggle.com/benhamner/flavours-of-physics/flatness-boosting-example/files
import os  # local import: lets the data directory be configured without editing the cell

print("Load the training/test data using pandas")
# FIX: avoid a hardcoded absolute local path. The data directory can now be overridden
# via the FLAVOUR_DATA_DIR environment variable; the original author's path remains
# the default so existing behavior is unchanged.
folder = os.environ.get("FLAVOUR_DATA_DIR",
                        "/home/bakuda/ageekrepo/kaggle/flavor-of-physics")
train = pd.read_csv(folder + "/training.csv")
test = pd.read_csv(folder + "/test.csv")

print("Eliminate SPDhits, which makes the agreement check fail")
# Drop the first column (presumably the event id -- see the `test["id"]` usage below)
# and the last five columns (which include SPDhits, per the print above); the rest
# are the training features.
features = list(train.columns[1:-5])

print("Train a UGradientBoostingClassifier")
# Flatness loss: penalizes non-uniformity of the classifier output along 'mass'
# (15 bins) for events with label 0, per hep_ml's BinFlatnessLossFunction.
loss = BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0)
clf = UGradientBoostingClassifier(loss=loss, n_estimators=40, subsample=0.1,
                                  max_depth=7, min_samples_leaf=10,
                                  learning_rate=0.1, train_features=features, random_state=11)
# 'mass' is passed alongside the features so the flatness loss can bin on it, but it
# is excluded from the actual training inputs via train_features=features.
clf.fit(train[features + ['mass']], train['signal'])
Out[2]:
In [3]:
print("Make predictions on the test set")
# Column 1 of predict_proba is the probability of the positive ("signal") class.
proba_matrix = clf.predict_proba(test[features])
test_probs = proba_matrix[:, 1]

# Two-column submission: event id plus predicted signal probability, written
# without the DataFrame index.
submission_columns = {"id": test["id"], "prediction": test_probs}
submission = pd.DataFrame(submission_columns)
submission.to_csv("flatness_boosting_submission.csv", index=False)
In [ ]: