In [1]:
# 1. magic for inline plot
# 2. magic to print version
# 3. magic so that the notebook will reload external python modules
# 4. magic to enable retina (high resolution) plots
# https://gist.github.com/minrk/3301035
%matplotlib inline
%load_ext watermark
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format='retina'

import os
import time
import numpy as np
import pandas as pd
import lightgbm as lgb
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing

# prevent scientific notation in pandas output
pd.set_option('display.float_format', lambda x: '%.3f' % x)

%watermark -a 'Ethen' -d -t -v -p numpy,pandas,sklearn,lightgbm


Ethen 2020-03-17 17:29:50 

CPython 3.6.4
IPython 7.9.0

numpy 1.18.1
pandas 0.25.0
sklearn 0.21.2
lightgbm 2.2.4
/Users/mingyuliu/anaconda3/lib/python3.6/site-packages/lightgbm/__init__.py:46: UserWarning: Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.3) compiler.
This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
  "You can install the OpenMP library by the following command: ``brew install libomp``.", UserWarning)

Tree Model Deployment

We'll keep the data preparation, feature engineering, and model training parts as short as possible, as the main focus of this repo is to build a service on top of the model.

Model Training

Loads the dataset.


In [2]:
cal_housing = fetch_california_housing()
print('feature names:', cal_housing.feature_names)
print('data shape: ', cal_housing.data.shape)

print('description:')
print(cal_housing.DESCR)


feature names: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
data shape:  (20640, 8)
description:
.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bureau publishes sample data (a block group typically has a population
of 600 to 3,000 people).

It can be downloaded/loaded using the
:func:`sklearn.datasets.fetch_california_housing` function.

.. topic:: References

    - Pace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions,
      Statistics and Probability Letters, 33 (1997) 291-297
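
To get a feel for the raw numbers, we can peek at the first few rows as a DataFrame. This is also where the pandas float_format option set earlier pays off, keeping the display out of scientific notation.


In [ ]:
# assemble the feature matrix into a DataFrame for a quick look
df = pd.DataFrame(cal_housing.data, columns=cal_housing.feature_names)
df.head()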

A quick train/test split.


In [3]:
test_size = 0.2
random_state = 123

X_train, X_test, y_train, y_test = train_test_split(
    cal_housing.data,
    cal_housing.target,
    test_size=test_size,
    random_state=random_state)

In [4]:
print(cal_housing.feature_names)


['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']

Following the LightGBM Python Quickstart to train the model.


In [5]:
dtrain = lgb.Dataset(X_train, y_train,
                     feature_name=cal_housing.feature_names,
                     free_raw_data=False)
dtest = lgb.Dataset(X_test, y_test,
                    feature_name=cal_housing.feature_names,
                    free_raw_data=False)
dtrain


Out[5]:
<lightgbm.basic.Dataset at 0x107a544a8>

In [6]:
params = {
    'nthread': 6,
    'seed': 0,
    'metric': 'rmse',
    'eta': 0.1,  # alias for learning_rate
    'max_depth': 5
}
 
evals_result = {}
model = lgb.train(
    params, dtrain,
    valid_sets=[dtrain, dtest],
    evals_result=evals_result,
    num_boost_round=1000,
    early_stopping_rounds=10,
    verbose_eval=50)


Training until validation scores don't improve for 10 rounds.
[50]	training's rmse: 0.480561	valid_1's rmse: 0.506189
[100]	training's rmse: 0.429389	valid_1's rmse: 0.475466
[150]	training's rmse: 0.40234	valid_1's rmse: 0.464791
[200]	training's rmse: 0.382479	valid_1's rmse: 0.458266
[250]	training's rmse: 0.367124	valid_1's rmse: 0.45328
[300]	training's rmse: 0.353168	valid_1's rmse: 0.449646
[350]	training's rmse: 0.34103	valid_1's rmse: 0.446907
Early stopping, best iteration is:
[354]	training's rmse: 0.34038	valid_1's rmse: 0.446728
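
Since training stopped early, the booster records the best iteration, which .predict falls back to by default when one exists. A quick sanity check against the evals_result history we collected above:


In [ ]:
# the 1-indexed boosting round that achieved the best validation rmse
print('best iteration:', model.best_iteration)
print('valid rmse:', evals_result['valid_1']['rmse'][model.best_iteration - 1])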

Quick evaluation of our regression model.


In [7]:
def mape_score(y_true, y_score):
    """Mean Absolute Percentage Error (MAPE)."""
    mask = y_true != 0
    y_true = y_true[mask]
    y_score = y_score[mask]
    mape = np.abs(y_true - y_score) / np.abs(y_true)
    return np.mean(mape)


def compute_score(model, dataset, verbose=True):
    """
    Computes the model evaluation score (r2, rmse, mape) for the
    input model and dataset.
    """
    y_true = dataset.get_label()
    y_score = model.predict(dataset.get_data())

    r2 = round(metrics.r2_score(y_true, y_score), 3)
    rmse = round(np.sqrt(metrics.mean_squared_error(y_true, y_score)), 3)
    mape = round(mape_score(y_true, y_score), 3)
    if verbose:
        print('r2: ', r2)
        print('rmse: ', rmse)
        print('mape: ', mape)

    return r2, rmse, mape

In [8]:
r2, rmse, mape = compute_score(model, dtest)


r2:  0.85
rmse:  0.447
mape:  0.166

Saves the trained model under the app folder.


In [9]:
os.makedirs('app', exist_ok=True)
save_path = os.path.join('app', 'model.txt')
model.save_model(save_path, num_iteration=model.best_iteration)
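
Note that passing num_iteration=model.best_iteration persists only the trees up to the early-stopped best iteration, so the saved file matches the model we actually validated.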

Ensure the predictions from the in-memory model and the saved model match. Here we pass in the whole test set.


In [10]:
predictions = model.predict(dtest.get_data())
predictions


Out[10]:
array([2.2418686 , 1.00175827, 1.48855899, ..., 0.75053102, 1.98354469,
       3.65037742])

In [11]:
model_loaded = lgb.Booster(model_file=save_path)
predictions = model_loaded.predict(dtest.get_data())
predictions


Out[11]:
array([2.2418686 , 1.00175827, 1.48855899, ..., 0.75053102, 1.98354469,
       3.65037742])
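
Rather than eyeballing the two arrays, we can also assert that they match numerically, a minimal sanity check using numpy's testing utility:


In [ ]:
# raises an AssertionError if the two prediction arrays diverge
np.testing.assert_allclose(
    model.predict(dtest.get_data()),
    model_loaded.predict(dtest.get_data()))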

We can also perform prediction for a single record. The caveat here is that .predict expects a 2D array, hence for single-record prediction, we need to reshape the record to 2D first.


In [12]:
row = dtest.get_data()[0].reshape(1, -1)
row


Out[12]:
array([[ 3.79170000e+00,  4.00000000e+01,  4.95979899e+00,
         1.03015075e+00,  1.03900000e+03,  2.61055276e+00,
         3.82400000e+01, -1.22640000e+02]])

In [13]:
model.predict(row)


Out[13]:
array([2.2418686])

Calling the API

Before proceeding to this section, we need to create the service first. Either follow the Docker Container section in the README to host the service locally through a container, or power through the Azure Kubernetes Cluster section to host it on an Azure Kubernetes cluster.

Once the service is up, we can test it using the requests library.


In [14]:
import json
import requests

In [15]:
# data = {
#   "MedInc": 0,
#   "HouseAge": 0,
#   "AveRooms": 0,
#   "AveBedrms": 0,
#   "Population": 0,
#   "AveOccup": 0,
#   "Latitude": 0,
#   "Longitude": 0
# }

data = {feature_name: value for feature_name, value in zip(cal_housing.feature_names, dtest.get_data()[0])}
data


Out[15]:
{'MedInc': 3.7917,
 'HouseAge': 40.0,
 'AveRooms': 4.959798994974874,
 'AveBedrms': 1.0301507537688441,
 'Population': 1039.0,
 'AveOccup': 2.6105527638190953,
 'Latitude': 38.24,
 'Longitude': -122.64}

Change the url accordingly, and pass our features as a JSON body.


In [16]:
# e.g. for local deployment
# url = 'http://127.0.0.1:8000/predict'

# e.g. for local docker deployment
# url = 'http://0.0.0.0:80/predict'

# e.g. for azure kubernetes cluster deployment
url = 'http://13.91.195.109:80/predict'

raw_response = requests.post(url, data=json.dumps(data))
raw_response.raise_for_status()
response = json.loads(raw_response.text)
response


Out[16]:
{'score': 2.2418686032176747}
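
As a side note, requests can serialize the body for us through its json parameter, which also sets the Content-Type: application/json header. Assuming the service accepts that header, the call below is equivalent to the data=json.dumps(...) version above:


In [ ]:
# let requests handle the json serialization and headers
raw_response = requests.post(url, json=data)
raw_response.raise_for_status()
raw_response.json()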

In [17]:
%%timeit
# speed benchmark of the model
model.predict(row)[0]


157 µs ± 10.3 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)

In [18]:
%%timeit
# speed benchmark of the model hosted as a service
raw_response = requests.post(url, data=json.dumps(data))
raw_response.raise_for_status()
response = json.loads(raw_response.text)
response


38.3 ms ± 882 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
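
At roughly 38 ms per call versus 157 µs for in-process prediction, the service latency is dominated by the network round trip and serialization overhead rather than the model itself.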

We've also implemented an endpoint that supports batch calls, i.e. getting the scores for multiple records in a single call.


In [19]:
payloads = []
for data in dtest.get_data()[:3]:
    payload = {feature_name: value for feature_name, value in zip(cal_housing.feature_names, data)}
    payloads.append(payload)

payloads


Out[19]:
[{'MedInc': 3.7917,
  'HouseAge': 40.0,
  'AveRooms': 4.959798994974874,
  'AveBedrms': 1.0301507537688441,
  'Population': 1039.0,
  'AveOccup': 2.6105527638190953,
  'Latitude': 38.24,
  'Longitude': -122.64},
 {'MedInc': 4.0217,
  'HouseAge': 9.0,
  'AveRooms': 5.804577464788732,
  'AveBedrms': 1.0,
  'Population': 1749.0,
  'AveOccup': 3.079225352112676,
  'Latitude': 36.09,
  'Longitude': -119.05},
 {'MedInc': 4.0882,
  'HouseAge': 12.0,
  'AveRooms': 5.36036036036036,
  'AveBedrms': 1.0705705705705706,
  'Population': 3321.0,
  'AveOccup': 4.986486486486487,
  'Latitude': 32.85,
  'Longitude': -116.98}]

In [20]:
url = 'http://13.91.195.109:80/batch/predict'
raw_response = requests.post(url, data=json.dumps(payloads))
raw_response.raise_for_status()
response = json.loads(raw_response.text)
response


Out[20]:
{'scores': [2.2418686032176747, 1.001758270797447, 1.4885589912546886]}
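
The three scores match the single-record predictions we obtained from the in-memory model earlier.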

In [21]:
%%timeit
# speed benchmark of the model hosted as a service using the batch endpoint
raw_response = requests.post(url, data=json.dumps(payloads))
raw_response.raise_for_status()
response = json.loads(raw_response.text)
response


39.9 ms ± 1.65 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
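
The batch call takes roughly the same wall time as a single-record call (39.9 ms versus 38.3 ms), so batching amortizes the per-request overhead across all the records in the payload.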