This notebook demonstrates how to use Open Data Cube (ODC) utilities to cluster geospatial data. It builds a cloud-free geomedian composite of Landsat surface reflectance over an area of interest, applies k-means clustering with several cluster counts, and exports the results as GeoTIFFs.
In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import datacube
import datetime as dt
import xarray as xr
import numpy as np
from utils.data_cube_utilities.data_access_api import DataAccessApi
from utils.data_cube_utilities.plotter_utils import figure_ratio
In [2]:
api = DataAccessApi()
dc = api.dc
Examine available products
In [3]:
# Get available products
products_info = dc.list_products()
In [4]:
# List LANDSAT 7 products
print("LANDSAT 7 Products:")
products_info[["platform", "name"]][products_info.platform == "LANDSAT_7"]
Out[4]:
In [5]:
# List LANDSAT 8 products
print("LANDSAT 8 Products:")
products_info[["platform", "name"]][products_info.platform == "LANDSAT_8"]
Out[5]:
Choose product and platform
In [6]:
product = "ls8_usgs_sr_scene"
platform = "LANDSAT_8"
In [7]:
from utils.data_cube_utilities.dc_load import get_product_extents
full_lat, full_lon, min_max_dates = get_product_extents(api, platform, product)
print("{}:".format(platform))
print("Lat bounds:", full_lat)
print("Lon bounds:", full_lon)
print("Time bounds:", min_max_dates)
In [8]:
from utils.data_cube_utilities.dc_display_map import display_map
# Display the total shared area available for these datacube products.
display_map(latitude=full_lat, longitude=full_lon)
Out[8]:
Specify the start and end dates
In [9]:
# from datetime import datetime
# start_date, end_date = (datetime(2010, 1, 1), datetime(2011, 1, 1))
# start_date, end_date = dt.datetime(2014, 1, 1), dt.datetime(2016, 1, 1)
start_date, end_date = dt.datetime(2014, 9, 1), dt.datetime(2015, 3, 1)
date_range = (start_date, end_date)
Specify an area to analyze
In [10]:
# Specify latitude and longitude bounds of an interesting area within the full extents
# Vietnam
# lat_small = (9.8, 9.85) # Area #1
# lon_small = (105.1, 105.15) # Area #1
# Ghana
# Weija Reservoir - North
lat_small = (5.5974, 5.6270)
lon_small = (-0.3900, -0.3371)
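As an optional sanity check, we can confirm the selected sub-area lies within the full product extents queried earlier (this assumes full_lat and full_lon are (min, max) tuples, as the printout above suggests):
# Optional check: the selected bounds should fall within the full product extents.
assert full_lat[0] <= lat_small[0] <= lat_small[1] <= full_lat[1], "Latitude bounds outside product extents"
assert full_lon[0] <= lon_small[0] <= lon_small[1] <= full_lon[1], "Longitude bounds outside product extents"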
Visualize the selected area
In [11]:
display_map(latitude=lat_small, longitude=lon_small)
Out[11]:
Create geographic chunks for efficient processing
In [12]:
from utils.data_cube_utilities.dc_chunker import create_geographic_chunks
geographic_chunks = create_geographic_chunks(
    latitude=lat_small,
    longitude=lon_small,
    geographic_chunk_size=0.05)
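To verify what the chunker produced, the chunk list can be inspected directly; this relies only on the 'latitude' and 'longitude' keys that the load loop below already uses:
print("Number of geographic chunks:", len(geographic_chunks))
for i, chunk in enumerate(geographic_chunks):
    print("Chunk {}: lat={}, lon={}".format(i, chunk['latitude'], chunk['longitude']))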
Create a geomedian composite
In [13]:
from utils.data_cube_utilities.clean_mask import landsat_qa_clean_mask
from utils.data_cube_utilities.dc_mosaic import create_hdmedians_multiple_band_mosaic
measurements = ['blue', 'green', 'red', 'nir', 'swir1', 'swir2', 'pixel_qa']
product_chunks = []
for index, chunk in enumerate(geographic_chunks):
    data = dc.load(measurements=measurements,
                   time=date_range,
                   platform=platform,
                   product=product,
                   longitude=chunk['longitude'],
                   latitude=chunk['latitude'])
    # Mask out clouds and scan lines.
    clean_mask = landsat_qa_clean_mask(data, platform)
    # Create the mosaic.
    product_chunks.append(create_hdmedians_multiple_band_mosaic(data, clean_mask=clean_mask, dtype=np.float32))
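For intuition, the geomedian computed above is a multi-band generalization of the median: per pixel, it chooses a single vector of band values minimizing the summed distance to all observations, so band values stay spectrally consistent. A simpler per-band median composite can be sketched with plain xarray for comparison only (this reuses data and clean_mask from the last chunk processed above, and unlike the geomedian it can mix values from different dates across bands):
# Per-band median composite (comparison sketch, last chunk only).
masked = data[['blue', 'green', 'red', 'nir', 'swir1', 'swir2']].where(clean_mask)
per_band_median = masked.median(dim='time', skipna=True)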
Combine the chunks to produce the final mosaic
In [14]:
from utils.data_cube_utilities.dc_chunker import combine_geographic_chunks
final_composite = combine_geographic_chunks(product_chunks)
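The combined result is a single xarray.Dataset covering the full selected area; as a quick optional inspection, its dimension sizes can be printed directly:
print("Composite dimensions:", dict(final_composite.dims))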
In [15]:
from utils.data_cube_utilities.dc_rgb import rgb
fig = plt.figure(figsize=figure_ratio(final_composite, fixed_width=8, fixed_height=8))
rgb(final_composite, bands=['red', 'green', 'blue'], fig=fig,
    use_data_min=True, use_data_max=True)
plt.title('True Color Geomedian Composite', fontsize=16)
plt.show()
In [16]:
fig = plt.figure(figsize=figure_ratio(final_composite, fixed_width=8, fixed_height=8))
rgb(final_composite, bands=['swir1', 'nir', 'red'], fig=fig,
    use_data_min=True, use_data_max=True)
plt.title('False Color Geomedian Composite', fontsize=16)
plt.show()
In [17]:
final_composite.swir1.plot(figsize=figure_ratio(final_composite, fixed_width=10,
                                                fixed_height=10), cmap='magma')
plt.title('SWIR1 Composite', fontsize=16)
plt.show()
In [18]:
from utils.data_cube_utilities.import_export import export_slice_to_geotiff
import os
geotiff_dir = 'output/geotiffs/Clustering_Notebook'
if not os.path.exists(geotiff_dir):
    os.makedirs(geotiff_dir)
export_slice_to_geotiff(final_composite, '{}/final_composite.tif'.format(geotiff_dir))
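If rasterio is available in the environment (an assumption, since this notebook does not use it elsewhere), the exported GeoTIFF can be reopened to verify its band count and pixel dimensions:
import rasterio
with rasterio.open('{}/final_composite.tif'.format(geotiff_dir)) as src:
    print("Bands: {}, size: {} x {} pixels".format(src.count, src.width, src.height))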
In [19]:
from utils.data_cube_utilities.dc_load import xr_scale_res
# Note: xr_scale_res (resolution rescaling) is imported here but not used below.
In [20]:
from utils.data_cube_utilities.dc_clustering import kmeans_cluster_dataset, get_frequency_counts
# Bands used for clustering
cluster_bands = ['red', 'green', 'blue', 'swir1']
classification_4 = kmeans_cluster_dataset(final_composite, cluster_bands, n_clusters=4)
freq_counts_4 = get_frequency_counts(classification_4)
classification_8 = kmeans_cluster_dataset(final_composite, cluster_bands, n_clusters=8)
freq_counts_8 = get_frequency_counts(classification_8)
classification_12 = kmeans_cluster_dataset(final_composite, cluster_bands, n_clusters=12)
freq_counts_12 = get_frequency_counts(classification_12)
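Conceptually, clustering a composite flattens the selected bands into an (n_pixels, n_bands) feature matrix, fits k-means, and reshapes the cluster labels back onto the image grid. The following is a minimal sketch of that idea using scikit-learn, under the assumption that kmeans_cluster_dataset works roughly this way; like the utility, it assigns -1 to pixels with missing data:
from sklearn.cluster import KMeans

# Flatten the chosen bands into an (n_pixels, n_bands) feature matrix.
features = np.stack([final_composite[band].values.ravel() for band in cluster_bands], axis=1)
valid = np.all(np.isfinite(features), axis=1)  # rows with no missing data

# Cluster only the valid rows; mark missing-data pixels with -1.
labels = np.full(features.shape[0], -1, dtype=np.int32)
labels[valid] = KMeans(n_clusters=4).fit_predict(features[valid])
label_image = labels.reshape(final_composite[cluster_bands[0]].shape)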
In [21]:
# Define standard formatting.
def get_figsize_geospatial(fixed_width=8, fixed_height=14,
                           num_cols=1, num_rows=1):
    return figure_ratio(final_composite,
                        fixed_width=fixed_width, fixed_height=fixed_height,
                        num_cols=num_cols, num_rows=num_rows)
xarray_imshow_params = dict(use_colorbar=False, use_legend=True,
                            fig_kwargs=dict(dpi=120, figsize=get_figsize_geospatial()))
In [22]:
from utils.data_cube_utilities.plotter_utils import xarray_imshow
for class_num, freq, fractional_freq in freq_counts_4:
    # The `*_cluster_dataset()` functions set -1 as the cluster number for "rows" with missing data.
    class_num, freq = int(class_num), int(freq)
    class_mem_str = "in class {:d}".format(class_num) if class_num != -1 else "that had missing data"
    print("There were {:d} data points {}, comprising {:.2%} "
          "of all data points.".format(freq, class_mem_str, fractional_freq))
legend_labels = {v: "Cluster {}".format(v) if v != -1 else "Missing Data" for v in np.unique(classification_4)}
xarray_imshow(classification_4, **xarray_imshow_params, legend_labels=legend_labels)
plt.show()
In [23]:
for class_num, freq, fractional_freq in freq_counts_8:
    # The `*_cluster_dataset()` functions set -1 as the cluster number for "rows" with missing data.
    class_num, freq = int(class_num), int(freq)
    class_mem_str = "in class {:d}".format(class_num) if class_num != -1 else "that had missing data"
    print("There were {:d} data points {}, comprising {:.2%} "
          "of all data points.".format(freq, class_mem_str, fractional_freq))
legend_labels = {v: "Cluster {}".format(v) if v != -1 else "Missing Data" for v in np.unique(classification_8)}
xarray_imshow(classification_8, **xarray_imshow_params, legend_labels=legend_labels)
plt.show()
In [24]:
for class_num, freq, fractional_freq in freq_counts_12:
    # The `*_cluster_dataset()` functions set -1 as the cluster number for "rows" with missing data.
    class_num, freq = int(class_num), int(freq)
    class_mem_str = "in class {:d}".format(class_num) if class_num != -1 else "that had missing data"
    print("There were {:d} data points {}, comprising {:.2%} "
          "of all data points.".format(freq, class_mem_str, fractional_freq))
legend_labels = {v: "Cluster {}".format(v) if v != -1 else "Missing Data" for v in np.unique(classification_12)}
xarray_imshow(classification_12, **xarray_imshow_params, legend_labels=legend_labels)
plt.show()
In [25]:
from utils.data_cube_utilities.import_export import export_slice_to_geotiff
if not os.path.exists(geotiff_dir):
    os.makedirs(geotiff_dir)
output_kmeans_cluster4_file_path = os.path.join(geotiff_dir, "cluster4_kmeans.tif")
output_kmeans_cluster8_file_path = os.path.join(geotiff_dir, "cluster8_kmeans.tif")
output_kmeans_cluster12_file_path = os.path.join(geotiff_dir, "cluster12_kmeans.tif")
export_slice_to_geotiff(classification_4.to_dataset(name='classification'),
                        output_kmeans_cluster4_file_path)
export_slice_to_geotiff(classification_8.to_dataset(name='classification'),
                        output_kmeans_cluster8_file_path)
export_slice_to_geotiff(classification_12.to_dataset(name='classification'),
                        output_kmeans_cluster12_file_path)
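As a final check, we can confirm the exported files exist and report their sizes on disk:
for path in [output_kmeans_cluster4_file_path,
             output_kmeans_cluster8_file_path,
             output_kmeans_cluster12_file_path]:
    print("{}: {:.1f} KB".format(path, os.path.getsize(path) / 1024))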