In [1]:
%%bash
# Download the benchm-ml training (1M rows) and test CSV files into
# /mldb_data/data, where the experiment cell reads them via file:// URLs.
#
# -e/-u: abort the cell on the first failing command or unset variable, so a
#        broken download surfaces here instead of as a confusing training error.
set -eu

mkdir -p /mldb_data/data

# -f: treat HTTP errors (403/404/5xx) as failures instead of saving the error
#     page as if it were the CSV.
# -sS: hide the progress meter but still print real error messages.
# -o: write straight to the target file.
curl -fsS "https://s3.amazonaws.com/benchm-ml--main/train-1m.csv" -o /mldb_data/data/train-1m.csv
curl -fsS "https://s3.amazonaws.com/benchm-ml--main/test.csv" -o /mldb_data/data/test.csv
In [3]:
# Client handle for MLDB; the experiment cell uses it for every REST call
# (mldb.v1.procedures(...)).
# NOTE(review): no host is passed, so pymldb's default applies -- presumably an
# MLDB instance on the same machine as this notebook; confirm before running
# elsewhere.
from pymldb import Connection
mldb = Connection()
In [18]:
%%time
mldb.v1.procedures("benchmark").put_json({
"type": "classifier.experiment",
"params": {
"experimentName": "benchm_ml",
"training_dataset": {
"type": "text.csv.tabular",
"params": { "dataFileUrl": "file:///mldb_data/data/train-1m.csv" }
},
"testing_dataset": {
"type": "text.csv.tabular",
"params": { "dataFileUrl": "file:///mldb_data/data/test.csv" }
},
"configuration": {
"type": "bagging",
"num_bags": 100,
"validation_split": 0.50,
"weak_learner": {
"type": "decision_tree",
"max_depth": 19,
"random_feature_propn": 0.5
}
},
"modelFileUrlPattern": "file://tmp/models/benchml_$runid.cls",
"label": "dep_delayed_15min = 'Y'",
"select": "* EXCLUDING(dep_delayed_15min)",
"mode": "boolean"
}
})
# Start a run of the "benchmark" procedure; the response carries the
# experiment status, including per-fold evaluation results.
result = mldb.v1.procedures("benchmark").runs.post_json({})

# Report the AUC of the first fold. The parenthesised print with a single
# argument produces the same output on Python 2 and Python 3, whereas the bare
# print statement is a SyntaxError on Python 3.
auc = result.json()["status"]["folds"][0]["results"]["auc"]
print("\n\nAUC = %0.4f\n\n" % auc)
In [ ]: