A simple regression training using LightGBM through Fairing


In [1]:
import os
from time import gmtime, strftime
from kubeflow import fairing
from kubeflow.fairing.frameworks import lightgbm

# Output containers are stored in Google Container Registry (GCR).
# Any other Docker container registry can be used instead of GCR.
GCP_PROJECT = fairing.cloud.gcp.guess_project_name()
DOCKER_REGISTRY = "gcr.io/{}/fairing-job".format(GCP_PROJECT)

Set up Docker credentials for AppendBuilder


In [ ]:
import subprocess

# Configure Docker credentials through gcloud; --quiet suppresses prompts.
subprocess.check_call(["gcloud", "auth", "configure-docker", "--quiet"])

# When a service-account key file is set in the environment, activate that
# account as well, passing the key file path to gcloud.
if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    subprocess.check_call([
        "gcloud", "auth", "activate-service-account",
        "--key-file={}".format(os.getenv("GOOGLE_APPLICATION_CREDENTIALS")),
        "--quiet",
    ])

Launch a LightGBM train task


In [ ]:
# Create a bucket that the trained model is copied to; the prediction output
# later in this notebook is written under the same bucket.
# To reuse an existing bucket, set gcs_bucket to that bucket's name instead.
# NOTE(review): presumably `gsutil mb` reports an error when the bucket already
# exists (e.g. on a re-run) -- confirm; the cell itself does not check for it.
gcs_bucket = "gs://{}-fairing".format(GCP_PROJECT)
!gsutil mb {gcs_bucket}

In [ ]:
# LightGBM configuration for the training task.
params = {
    # Task, booster and objective.
    "task": "train",
    "boosting_type": "gbdt",
    "objective": "regression",
    "metric": "l2",
    "metric_freq": 1,
    # Tree shape, learning rate and sampling.
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "n_estimators": 10,
    "is_training_metric": "true",
    # Input data sets read from GCS.
    "valid_data": "gs://fairing-lightgbm/regression-example/regression.test",
    "train_data": "gs://fairing-lightgbm/regression-example/regression.train",
    # Logging verbosity.
    "verbose": 1,
    "verbose_eval": 1,
    # The model file name carries a UTC timestamp (second resolution), so
    # runs started at different times write to different objects.
    "model_output": "{}/lightgbm/example/model_{}.txt".format(
        gcs_bucket,
        strftime("%Y_%m_%d_%H_%M_%S", gmtime()),
    ),
    # Distributed-training settings (see the LightGBM parameter docs for
    # `num_machines` and `tree_learner`).
    "num_machines": 3,
    "tree_learner": "feature",
}

In [ ]:
# Launch the LightGBM training task described by `params`.
lightgbm.execute(
    config=params,
    docker_registry=DOCKER_REGISTRY,  # registry that stores the output containers
    cores_per_worker=2,               # 2 CPU cores per worker instance
    memory_per_worker=0.5,            # 0.5GB of memory per worker instance
    stream_log=True,
)

Let's look at the trained model


In [ ]:
url = params['model_output']
model_name = os.path.split(url)[1]
!gsutil cp {url} /tmp/{model_name}
!head /tmp/{model_name}

Running a prediction task using the trained model


In [ ]:
# LightGBM configuration for the prediction task.
predict_params = {
    "task": "predict",
    "metric": "l2",
    "data": "gs://fairing-lightgbm/regression-example/regression.test",
    # Score with the model file written by the training task.
    "input_model": params["model_output"],
    # Results go to the same bucket; the object name embeds the model file name.
    "output_result": "{}/lightgbm/example/prediction_result_{}.txt".format(
        gcs_bucket, model_name
    ),
}

In [ ]:
# Launch the prediction task described by `predict_params`.
lightgbm.execute(
    config=predict_params,
    docker_registry=DOCKER_REGISTRY,
)

In [ ]:
url = predict_params['output_result']
file_name = os.path.split(url)[1]
!gsutil cp {url} /tmp/{file_name}

In [ ]:
import pandas as pd

# Load the downloaded prediction file; header=None tells pandas the file has
# no header row, so columns are labelled 0, 1, ...
predictions = pd.read_csv("/tmp/{}".format(file_name), header=None)

# Report the mean and the non-null count of column 0.
print(
    "Prediction mean: {}, count: {}".format(
        predictions.mean()[0],
        predictions.count()[0],
    )
)

In [ ]: