In [4]:
#https://github.com/yandexdataschool/flavours-of-physics-start/blob/master/flatness_boosting.ipynb

import pandas
# `evaluation` is a repo-local module from the yandexdataschool repo linked
# above (evaluation.py must sit next to this notebook) — it is not on PyPI.
import evaluation
# Fixed module name: hep_ml exposes `gradientboosting` (all lowercase),
# not `ugradientBoosting` — see the working import later in this notebook.
import hep_ml.gradientboosting


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-4-60124f44e744> in <module>()
      2 
      3 import pandas
----> 4 import evaluation
      5 import hep_ml.ugradientBoosting

ImportError: No module named evaluation

In [1]:
#https://www.kaggle.com/benhamner/flavours-of-physics/rf-xgboost-example/code
# NOTE(review): this entire cell is dead code kept inside a triple-quoted
# string literal. The string is the cell's last expression, so the whole
# source dump is echoed as the cell's Out[] value (see the output below).
# Consider deleting this cell, or moving the reference code into a markdown
# cell, instead of disabling it via a string literal.
'''
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

print("Load the training/test data using pandas")
train = pd.read_csv("../input/training.csv")
test  = pd.read_csv("../input/test.csv")

print("Eliminate SPDhits, which makes the agreement check fail")
features = list(train.columns[1:-5])

print("Train a Random Forest model")
rf = RandomForestClassifier(n_estimators=100, random_state=1)
rf.fit(train[features], train["signal"])

print("Train a XGBoost model")
params = {"objective": "binary:logistic",
          "eta": 0.3,
          "max_depth": 5,
          "min_child_weight": 3,
          "silent": 1,
          "subsample": 0.7,
          "colsample_bytree": 0.7,
          "seed": 1}
num_trees=250
gbm = xgb.train(params, xgb.DMatrix(train[features], train["signal"]), num_trees)

print("Make predictions on the test set")
test_probs = (rf.predict_proba(test[features])[:,1] +
              gbm.predict(xgb.DMatrix(test[features])))/2
submission = pd.DataFrame({"id": test["id"], "prediction": test_probs})
submission.to_csv("rf_xgboost_submission.csv", index=False)
                
'''


Out[1]:
'\nimport numpy as np\nimport pandas as pd\nfrom sklearn.ensemble import RandomForestClassifier\nimport xgboost as xgb\n\nprint("Load the training/test data using pandas")\ntrain = pd.read_csv("../input/training.csv")\ntest  = pd.read_csv("../input/test.csv")\n\nprint("Eliminate SPDhits, which makes the agreement check fail")\nfeatures = list(train.columns[1:-5])\n\nprint("Train a Random Forest model")\nrf = RandomForestClassifier(n_estimators=100, random_state=1)\nrf.fit(train[features], train["signal"])\n\nprint("Train a XGBoost model")\nparams = {"objective": "binary:logistic",\n          "eta": 0.3,\n          "max_depth": 5,\n          "min_child_weight": 3,\n          "silent": 1,\n          "subsample": 0.7,\n          "colsample_bytree": 0.7,\n          "seed": 1}\nnum_trees=250\ngbm = xgb.train(params, xgb.DMatrix(train[features], train["signal"]), num_trees)\n\nprint("Make predictions on the test set")\ntest_probs = (rf.predict_proba(test[features])[:,1] +\n              gbm.predict(xgb.DMatrix(test[features])))/2\nsubmission = pd.DataFrame({"id": test["id"], "prediction": test_probs})\nsubmission.to_csv("rf_xgboost_submission.csv", index=False)\n                \n'

In [1]:
import numpy as np
import pandas as pd
from hep_ml.losses import BinFlatnessLossFunction
from hep_ml.gradientboosting import UGradientBoostingClassifier

In [2]:
# https://www.kaggle.com/benhamner/flavours-of-physics/flatness-boosting-example/files
import os

print("Load the training/test data using pandas")
# Data location is configurable via the DATA_DIR environment variable; the
# original hardcoded absolute path is kept as the default for backward
# compatibility, but avoid machine-specific paths in shared notebooks.
folder = os.environ.get("DATA_DIR", "/home/bakuda/ageekrepo/kaggle/flavor-of-physics")
train = pd.read_csv(os.path.join(folder, "training.csv"))
test  = pd.read_csv(os.path.join(folder, "test.csv"))

print("Eliminate SPDhits, which makes the agreement check fail")
# Drops the first column (id) and the last five (which include SPDhits and
# the control/label columns) — assumes the Kaggle training.csv column order.
features = list(train.columns[1:-5])

print("Train a UGradientBoostingClassifier")
# BinFlatnessLossFunction penalizes non-uniformity of the prediction along
# 'mass' for the background class (uniform_label=0), per the hep_ml docs.
loss = BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0)
clf = UGradientBoostingClassifier(loss=loss, n_estimators=40, subsample=0.1, 
                                  max_depth=7, min_samples_leaf=10,
                                  learning_rate=0.1, train_features=features, random_state=11)
# 'mass' must be present in the fit frame for the flatness loss, even though
# it is excluded from train_features used by the trees.
clf.fit(train[features + ['mass']], train['signal'])


Load the training/test data using pandas
Eliminate SPDhits, which makes the agreement check fail
Train a UGradientBoostingClassifier
Out[2]:
UGradientBoostingClassifier(learning_rate=0.1,
              loss=BinFlatnessLossFunction(allow_wrong_signs=True, fl_coefficient=3.0, n_bins=15,
            power=2.0, uniform_features=['mass'], uniform_label=array([0])),
              max_depth=7, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=10, min_samples_split=2, n_estimators=40,
              random_state=<mtrand.RandomState object at 0xb7aa86c>,
              splitter='best', subsample=0.1,
              train_features=['LifeTime', 'dira', 'FlightDistance', 'FlightDistanceError', 'IP', 'IPSig', 'VertexChi2', 'pt', 'DOCAone', 'DOCAtwo', 'DOCAthree', 'IP_p0p2', 'IP_p1p2', 'isolationa', 'isolationb', 'isolationc', 'isolationd', 'isolatione', 'isolationf', 'iso', 'CDF1', 'CDF2', 'CDF3', 'ISO_SumBDT', 'p...IPSig', 'p2_IPSig', 'p0_pt', 'p1_pt', 'p2_pt', 'p0_p', 'p1_p', 'p2_p', 'p0_eta', 'p1_eta', 'p2_eta'],
              update_tree=True)

In [3]:
# Score the held-out set with the trained flatness-boosted classifier and
# write a Kaggle-style submission file with columns (id, prediction).
print("Make predictions on the test set")
proba = clf.predict_proba(test[features])
submission = pd.DataFrame({"id": test["id"], "prediction": proba[:, 1]})
submission.to_csv("flatness_boosting_submission.csv", index=False)


Make predictions on the test set

In [ ]: