In [1]:
%%bash
# Download the benchmark CSVs (train-1m.csv and test.csv) into /mldb_data/data.
# Stop at the first failing command instead of carrying on with a bad download.
set -euo pipefail

mkdir -p /mldb_data/data

# --fail               exit non-zero on HTTP errors instead of saving the
#                      server's error page as if it were the CSV
# --silent --show-error  hide the progress meter but still report real errors
curl --fail --silent --show-error \
     -o /mldb_data/data/train-1m.csv \
     "https://s3.amazonaws.com/benchm-ml--main/train-1m.csv"
curl --fail --silent --show-error \
     -o /mldb_data/data/test.csv \
     "https://s3.amazonaws.com/benchm-ml--main/test.csv"

In [3]:
from pymldb import Connection
# Client handle used by the later cells to issue REST calls to MLDB.
# No arguments are passed, so pymldb's default endpoint is used
# (presumably the MLDB instance hosting this notebook; confirm).
mldb = Connection()

In [18]:
%%time

mldb.v1.procedures("benchmark").put_json({
    "type": "classifier.experiment",
    "params": {
        "experimentName": "benchm_ml",
        "training_dataset": {
            "type": "text.csv.tabular",
            "params": { "dataFileUrl": "file:///mldb_data/data/train-1m.csv" }
        },
        "testing_dataset": {
            "type": "text.csv.tabular",
            "params": { "dataFileUrl": "file:///mldb_data/data/test.csv" }
        },
        "configuration": {
            "type": "bagging",
            "num_bags": 100,
            "validation_split": 0.50,
            "weak_learner": {
                "type": "decision_tree",
                "max_depth": 19,
                "random_feature_propn": 0.5
            }
        },
        "modelFileUrlPattern": "file://tmp/models/benchml_$runid.cls",
        "label": "dep_delayed_15min = 'Y'",
        "select": "* EXCLUDING(dep_delayed_15min)",
        "mode": "boolean"
    }
})

result = mldb.v1.procedures("benchmark").runs.post_json({})
print "\n\nAUC = %0.4f\n\n" % result.json()["status"]["folds"][0]["results"]["auc"]



AUC = 0.7417


CPU times: user 4.68 ms, sys: 4.62 ms, total: 9.3 ms
Wall time: 16.4 s

In [ ]: