In [0]:
#@title Copyright 2020 Google LLC. { display-mode: "form" }
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Introduction

Logistic regression

Logistic regression is a classical machine learning method to estimate the probability of an event occurring (sometimes called the "risk"). Specifically, the probability is modeled as a sigmoid function of a linear combination of inputs. This can be implemented as a very simple neural network with a single trainable layer.
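
As a reminder of the underlying model, here is a minimal NumPy sketch with made-up weights (not part of the Earth Engine workflow): the predicted probability is the sigmoid of a linear combination of the inputs.

import numpy as np

def sigmoid(z):
  # The logistic function maps any real number into (0, 1).
  return 1.0 / (1.0 + np.exp(-z))

# Hypothetical coefficients (w), intercept (b), and predictor vector (x).
w = np.array([0.8, -1.2, 0.5])
b = 0.1
x = np.array([0.3, 0.7, 0.2])

# Probability of the event: sigmoid(w . x + b).
print(sigmoid(np.dot(w, x) + b))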

Here, the event being modeled is deforestation in 2016. If a pixel is labeled as deforestation in 2016 according to the Hansen Global Forest Change dataset, the event occurred with probability 1. The probability is zero otherwise. The input variables (i.e. the predictors of this event) are the pixel values of two Landsat 8 surface reflectance median composites, from 2015 and 2017, assumed to represent before and after conditions.

The model will be hosted on Google AI Platform and used in Earth Engine for interactive prediction with ee.Model.fromAiPlatformPredictor. See this example notebook for background on hosted models.

Running this demo may incur charges to your Google Cloud Account!

Setup software libraries

Import software libraries and/or authenticate as necessary.

Authenticate to Colab and Cloud

This should be the same account you use to log in to Earth Engine.


In [0]:
from google.colab import auth
auth.authenticate_user()

Authenticate to Earth Engine

This should be the same account you used to log in to Cloud previously.


In [0]:
import ee
ee.Authenticate()
ee.Initialize()

Test the TensorFlow installation


In [0]:
import tensorflow as tf
print(tf.__version__)

Test the Folium installation


In [0]:
import folium
print(folium.__version__)

Define variables


In [0]:
# REPLACE WITH YOUR CLOUD PROJECT!
PROJECT = 'your-project'

# Output bucket for trained models.  You must be able to write into this bucket.
OUTPUT_BUCKET = 'your-bucket'

# Cloud Storage bucket with training and testing datasets.
DATA_BUCKET = 'ee-docs-demos'

# Training and testing dataset file names in the Cloud Storage bucket.
TRAIN_FILE_PREFIX = 'logistic_demo_training'
TEST_FILE_PREFIX = 'logistic_demo_testing'
file_extension = '.tfrecord.gz'
TRAIN_FILE_PATH = 'gs://' + DATA_BUCKET + '/' + TRAIN_FILE_PREFIX + file_extension
TEST_FILE_PATH = 'gs://' + DATA_BUCKET + '/' + TEST_FILE_PREFIX + file_extension

# The labels, consecutive integer indices starting from zero, are stored in
# this property, set on each point.
LABEL = 'loss16'
# Number of label values, i.e. number of classes in the classification
# (binary here: deforestation or no deforestation).
N_CLASSES = 2

# Use Landsat 8 surface reflectance data for predictors.
L8SR = ee.ImageCollection('LANDSAT/LC08/C01/T1_SR')
# Use these bands for prediction.
OPTICAL_BANDS = ['B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7']
THERMAL_BANDS = ['B10', 'B11']
BEFORE_BANDS = OPTICAL_BANDS + THERMAL_BANDS
AFTER_BANDS = [str(s) + '_1' for s in BEFORE_BANDS]
BANDS = BEFORE_BANDS + AFTER_BANDS

# Forest loss in 2016 is what we want to predict.
IMAGE = ee.Image('UMD/hansen/global_forest_change_2018_v1_6')
LOSS16 = IMAGE.select('lossyear').eq(16).rename(LABEL)

# Study area.  Mostly Brazil.
GEOMETRY = ee.Geometry.Polygon(
        [[[-71.96531166607349, 0.24565390557980268],
          [-71.96531166607349, -17.07400853625319],
          [-40.32468666607349, -17.07400853625319],
          [-40.32468666607349, 0.24565390557980268]]], None, False)

# These names are used to specify properties in the export of training/testing
# data and to define the mapping between names and data when reading from
# the TFRecord file into a tf.data.Dataset.
FEATURE_NAMES = list(BANDS)
FEATURE_NAMES.append(LABEL)

# List of fixed-length features, all of which are float32.
columns = [
  tf.io.FixedLenFeature(shape=[1], dtype=tf.float32) for k in FEATURE_NAMES
]

# Dictionary with feature names as keys, fixed-length features as values.
FEATURES_DICT = dict(zip(FEATURE_NAMES, columns))

# Where to save the trained model.
MODEL_DIR = 'gs://' + OUTPUT_BUCKET + '/logistic_demo_model'
# Where to save the EEified model.
EEIFIED_DIR = 'gs://' + OUTPUT_BUCKET + '/logistic_demo_eeified'

# Name of the AI Platform model to be hosted.
MODEL_NAME = 'logistic_demo_model'
# Version of the AI Platform model to be hosted.
VERSION_NAME = 'v0'

Generate training data

This is a multi-step process. First, export the image that contains the prediction bands. When that export completes (several hours in this example), it can be reloaded and sampled to generate training and testing datasets. The second step is to export the training and testing tables to TFRecord files in Cloud Storage (also several hours).


In [0]:
# Cloud masking function for Landsat 8 surface reflectance.
def maskL8sr(image):
  # Bits 3 and 5 of 'pixel_qa' are cloud shadow and cloud, respectively.
  cloudShadowBitMask = ee.Number(2).pow(3).int()
  cloudsBitMask = ee.Number(2).pow(5).int()
  qa = image.select('pixel_qa')
  # Keep pixels flagged as neither cloud shadow nor cloud.
  mask1 = qa.bitwiseAnd(cloudShadowBitMask).eq(0).And(
    qa.bitwiseAnd(cloudsBitMask).eq(0))
  # Keep pixels where every band is unmasked.
  mask2 = image.mask().reduce('min')
  # Keep pixels with in-range optical reflectance values.
  mask3 = image.select(OPTICAL_BANDS).gt(0).And(
          image.select(OPTICAL_BANDS).lt(10000)).reduce('min')
  mask = mask1.And(mask2).And(mask3)
  # Scale optical bands to [0, 1] reflectance.  Convert thermal bands
  # (stored as Kelvin scaled by 10) to Celsius, then scale to roughly [0, 1].
  return image.select(OPTICAL_BANDS).divide(10000).addBands(
          image.select(THERMAL_BANDS).divide(10).clamp(273.15, 373.15)
            .subtract(273.15).divide(100)).updateMask(mask)

# Make "before" and "after" composites.
composite1 = L8SR.filterDate(
    '2015-01-01', '2016-01-01').map(maskL8sr).median()
composite2 = L8SR.filterDate(
    '2016-12-31', '2017-12-31').map(maskL8sr).median()

stack = composite1.addBands(composite2).float()

export_image = 'projects/google/logistic_demo_image'

image_task = ee.batch.Export.image.toAsset(
  image = stack, 
  description = 'logistic_demo_image', 
  assetId = export_image, 
  region = GEOMETRY,
  scale = 30,
  maxPixels = 1e10
)

First, export the image stack that contains the predictors.


In [0]:
image_task.start()
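
If you want the notebook to block until the export finishes, a sketch that polls the task status is shown below. The export can take several hours, so checking the Tasks tab in the Code Editor may be more practical.

import time

# Poll the export task until it is no longer active, then report its state.
while image_task.active():
  print('Polling for task (id: {}).'.format(image_task.id))
  time.sleep(30)
print('Image export finished with state:', image_task.status()['state'])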

Wait until the image export is completed, then sample the exported image.


In [0]:
sample = ee.Image(export_image).addBands(LOSS16).stratifiedSample(
  numPoints = 10000,
  classBand = LABEL,
  region = GEOMETRY,
  scale = 30,
  tileScale = 8
)

randomized = sample.randomColumn()
training = randomized.filter(ee.Filter.lt('random', 0.7))
testing = randomized.filter(ee.Filter.gte('random', 0.7))

train_task = ee.batch.Export.table.toCloudStorage(
  collection = training,
  description = TRAIN_FILE_PREFIX,
  bucket = OUTPUT_BUCKET,
  fileFormat = 'TFRecord'
)

test_task = ee.batch.Export.table.toCloudStorage(
  collection = testing,
  description = TEST_FILE_PREFIX,
  bucket = OUTPUT_BUCKET,
  fileFormat = 'TFRecord'
)

Export the training and testing tables. This also takes a few hours.


In [0]:
train_task.start()
test_task.start()

Parse the exported datasets

Now we need to make a parsing function for the data in the TFRecord files. Each record stores the predictor bands and the label as named, fixed-length float features. The parsing function reads data from a serialized Example proto (i.e. example.proto) into a dictionary in which the keys are the feature names and the values are the tensors storing the value of the features for that example. (Learn more about parsing Example protocol buffer messages.)


In [0]:
def parse_tfrecord(example_proto):
  """The parsing function.

  Read a serialized example into the structure defined by FEATURES_DICT.

  Args:
    example_proto: a serialized Example.

  Returns:
    A tuple of the predictors dictionary and the label, cast to an `int32`.
  """
  parsed_features = tf.io.parse_single_example(example_proto, FEATURES_DICT)
  labels = parsed_features.pop(LABEL)
  return parsed_features, tf.cast(labels, tf.int32)


def to_tuple(inputs, label):
  """ Convert inputs to a tuple.

  Note that the inputs must be a tuple of tensors in the right shape.

  Args:
    dict: a dictionary of tensors keyed by input name.
    label: a tensor storing the response variable.

  Returns:
    A tuple of tensors: (predictors, label).
  """
  # Values in the tensor are ordered by the list of predictors.
  predictors = [inputs.get(k) for k in BANDS]
  return (tf.expand_dims(tf.transpose(predictors), 1),
          tf.expand_dims(tf.expand_dims(label, 1), 1))

In [0]:
# Load datasets from the files.
train_dataset = tf.data.TFRecordDataset(TRAIN_FILE_PATH, compression_type='GZIP')
test_dataset = tf.data.TFRecordDataset(TEST_FILE_PATH, compression_type='GZIP')

# Compute the size of the shuffle buffer by counting records.  We can get
# away with this because it's a small dataset, but watch out with larger
# datasets.
train_size = sum(1 for _ in train_dataset)

batch_size = 8

# Map the functions over the datasets to parse and convert to tuples.
train_dataset = train_dataset.map(parse_tfrecord, num_parallel_calls=4)
train_dataset = train_dataset.map(to_tuple, num_parallel_calls=4)
train_dataset = train_dataset.shuffle(train_size).batch(batch_size)

test_dataset = test_dataset.map(parse_tfrecord, num_parallel_calls=4)
test_dataset = test_dataset.map(to_tuple, num_parallel_calls=4)
test_dataset = test_dataset.batch(batch_size)

# Print the first parsed record to check.
from pprint import pprint
pprint(next(iter(train_dataset)))

Note that each element of the parsed (and batched) dataset is a tuple. The first element of the tuple is a tensor of the predictor values, stacked in the order given by BANDS; the second element is the class label, which in this case is an indicator variable that is one if deforestation happened and zero otherwise.
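
To confirm the shapes the model will receive, you can also inspect the dataset's element_spec (a quick optional check, not part of the original workflow):

# Each batched element is (predictors, label):
#   predictors: float32, shape (None, 1, 1, len(BANDS))
#   label:      int32,   shape (None, 1, 1, 1)
print(train_dataset.element_spec)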

Create the Keras model

This model is intended to represent traditional logistic regression, the parameters of which are estimated through maximum likelihood. Specifically, the probability of an event is represented as the sigmoid of a linear function of the predictors. Training or fitting the model consists of finding the parameters of the linear function that maximize the likelihood function. This is implemented in Keras by defining a model with a single trainable layer, a sigmoid activation on the output, and a cross-entropy loss function. Note that the only trainable layer is convolutional, with a 1x1 kernel, so that Earth Engine can apply the model to each pixel. To fit the model, a Stochastic Gradient Descent (SGD) optimizer is used. This differs somewhat from traditional fitting of logistic regression models in that stochasticity is introduced by using mini-batches to estimate the gradient.


In [0]:
from tensorflow import keras

# Define the layers in the model.
model = tf.keras.models.Sequential([
  tf.keras.layers.Input((1, 1, len(BANDS))),
  tf.keras.layers.Conv2D(1, (1,1), activation='sigmoid')
])

# Compile the model with the specified loss function.
model.compile(optimizer=tf.keras.optimizers.SGD(momentum=0.9),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Fit the model to the training data.
model.fit(x=train_dataset, 
          epochs=20,
          validation_data=test_dataset)
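
As an optional check (a sketch, not part of the original workflow), you can look at the fitted parameters. Because the model is a single 1x1 convolution with a sigmoid output, its kernel and bias are exactly the logistic regression coefficients and intercept.

# The Conv2D kernel has shape (1, 1, len(BANDS), 1); flatten it to get one
# coefficient per predictor band.
kernel, bias = model.layers[-1].get_weights()
for band, coefficient in zip(BANDS, kernel.reshape(-1)):
  print(band, float(coefficient))
print('intercept:', float(bias[0]))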

Save the trained model

Save the trained model in TensorFlow SavedModel format to your Cloud Storage bucket.


In [0]:
model.save(MODEL_DIR, save_format='tf')
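
As an optional sanity check (assuming the Colab runtime can read gs:// paths through TensorFlow), the SavedModel can be reloaded from the bucket to confirm the export is readable:

# Reload the SavedModel from Cloud Storage and print a summary.
reloaded = tf.keras.models.load_model(MODEL_DIR)
reloaded.summary()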

EEification

The first part of the code gets the input and output tensor names from the model's serving signature; these are then mapped to the names Earth Engine will use. Keep the input name 'array', which is how you'll pass data into the model (as an array image).


In [0]:
from tensorflow.python.tools import saved_model_utils

meta_graph_def = saved_model_utils.get_meta_graph_def(MODEL_DIR, 'serve')
inputs = meta_graph_def.signature_def['serving_default'].inputs
outputs = meta_graph_def.signature_def['serving_default'].outputs

# Just get the first input and output from the serving signature def, since
# this model has only a single input and a single output.
input_name = None
for k,v in inputs.items():
  input_name = v.name
  break

output_name = None
for k,v in outputs.items():
  output_name = v.name
  break

# Make dictionaries that map the model's input and output tensor names to
# the names Earth Engine will use ('array' and 'output').
import json
input_dict = "'" + json.dumps({input_name: "array"}) + "'"
output_dict = "'" + json.dumps({output_name: "output"}) + "'"
print(input_dict)
print(output_dict)

Run the EEifier

Use the command line to set your Cloud project and then run the EEifier.


In [0]:
!earthengine set_project {PROJECT}
!earthengine model prepare --source_dir {MODEL_DIR} --dest_dir {EEIFIED_DIR} --input {input_dict} --output {output_dict}

Deploy and host the EEified model on AI Platform

If you change anything about the model, you'll need to re-EEify it and create a new version!


In [0]:
!gcloud ai-platform models create {MODEL_NAME} --project {PROJECT}
!gcloud ai-platform versions create {VERSION_NAME} \
  --project {PROJECT} \
  --model {MODEL_NAME} \
  --origin {EEIFIED_DIR} \
  --framework "TENSORFLOW" \
  --runtime-version=2.1 \
  --python-version=3.7

Connect to the hosted model from Earth Engine

Now that the model is hosted on AI Platform, point Earth Engine to it and make predictions. These predictions can be thresholded for a rudimentary deforestation detector. Visualize the 'after' imagery, the reference data, and the predictions.


In [0]:
# Turn into an array image for input to the model.
array_image = stack.select(BANDS).float().toArray()

# Point to the model hosted on AI Platform.
model = ee.Model.fromAiPlatformPredictor(
    projectName=PROJECT,
    modelName=MODEL_NAME,
    version=VERSION_NAME,
    # Can be anything, but don't make it too big.
    inputTileSize=[8, 8],
    # Keep this the same as your training data.
    proj=ee.Projection('EPSG:4326').atScale(30),
    fixInputProj=True,
    # Note the names here need to match what you specified in the
    # output dictionary you passed to the EEifier.
    outputBands={'output': {
        'type': ee.PixelType.float(),
        'dimensions': 1
      }
    },
)

# Output probability.
predictions = model.predictImage(array_image).arrayGet([0])

# Back-of-the-envelope decision rule.
predicted = predictions.gt(0.7).selfMask()

# Training data for comparison.
reference = LOSS16.selfMask()

# Get map IDs for display in folium.
probability_vis = {'min': 0, 'max': 1}
probability_mapid = predictions.getMapId(probability_vis)

predicted_vis = {'palette': 'red'}
predicted_mapid = predicted.getMapId(predicted_vis)

reference_vis = {'palette': 'orange'}
reference_mapid = reference.getMapId(reference_vis)

image_vis = {'bands': ['B4', 'B3', 'B2'], 'min': 0, 'max': 0.3}
image_mapid = composite2.getMapId(image_vis)

# Visualize the input imagery and the predictions.
map = folium.Map(location=[-9.1, -62.3], zoom_start=11)
folium.TileLayer(
  tiles=image_mapid['tile_fetcher'].url_format,
  attr='Map Data &copy; <a href="https://earthengine.google.com/">Google Earth Engine</a>',
  overlay=True,
  name='image',
).add_to(map)
folium.TileLayer(
  tiles=probability_mapid['tile_fetcher'].url_format,
  attr='Map Data &copy; <a href="https://earthengine.google.com/">Google Earth Engine</a>',
  overlay=True,
  name='probability',
).add_to(map)
folium.TileLayer(
  tiles=predicted_mapid['tile_fetcher'].url_format,
  attr='Map Data &copy; <a href="https://earthengine.google.com/">Google Earth Engine</a>',
  overlay=True,
  name='predicted',
).add_to(map)
folium.TileLayer(
  tiles=reference_mapid['tile_fetcher'].url_format,
  attr='Map Data &copy; <a href="https://earthengine.google.com/">Google Earth Engine</a>',
  overlay=True,
  name='reference',
).add_to(map)
map.add_child(folium.LayerControl())
map