In [ ]:
from sys import path
path.append("../")
%matplotlib inline

Read in the data


In [ ]:
import pandas as pd

In [ ]:
df = pd.read_csv('data.csv')
df.head()

See Label Distribution


In [ ]:
df.groupby("LandUse").size()

In [ ]:
from matplotlib import pyplot
import seaborn as sns

fig, ax = pyplot.subplots(figsize=(15,5))
sns.countplot(x="LandUse",data=df, palette="Greens_d");

Classification Re-org

The following classes have too few training examples to support reliable classification:

  • Forestry
  • Fruittrees
  • Nativeforest
  • Stubble
  • Water

My suggestion is to reduce the label set to the following:

  • Natural Grassland
  • Summercrops
  • Prairie

as well as grouping Forestry, Fruittrees, and Nativeforest under:

  • Forest

then grouping other labels under a class called:

  • Misc

Rebuild Classes


In [ ]:
def combine_classes_on_dataframe(_df, classes = (), combine_to = "default", column_name = "LandUse"):
    """Return a copy of `_df` in which every label in `classes` is replaced by `combine_to`."""
    df_new = _df.copy()  # fixed: previously copied the global `df`, ignoring the `_df` argument
    df_new[column_name] = df_new[column_name].map(lambda x: combine_to if x in classes else x)
    return df_new

In [ ]:
# Group the tree classes under "Forest", then everything outside the kept
# labels under "Misc", using the helper defined above.
df_new = combine_classes_on_dataframe(df, classes = ["Forestry", "Fruittrees", "Nativeforest"], combine_to = "Forest")
keep_labels = ["Forest", "Prairie", "Summercrops", "Naturalgrassland"]
misc_labels = [label for label in df_new["LandUse"].unique() if label not in keep_labels]
df_new = combine_classes_on_dataframe(df_new, classes = misc_labels, combine_to = "Misc")

In [ ]:
df_new.groupby("LandUse").size()

In [ ]:
fig, ax = pyplot.subplots(figsize=(15,5))
sns.countplot(x="LandUse",data=df_new, palette="Greens_d");

Import Training Data


In [ ]:
import datacube
dc = datacube.Datacube(config = "/home/localuser/.datacube.conf")

Listing products with 'uruguay' in their name


In [ ]:
dc.list_products()[dc.list_products()['name'].str.contains("uruguay")]

Load the data


In [ ]:
params = dict(platform = 'LANDSAT_8',
              product   = 'ls8_lasrc_uruguay',
              latitude = (-34.44988376, -34.096445),
              longitude = (-56.29119062, -55.24653668),
              time = ('2016-01-01', '2017-01-01'), 
              measurements = ['red', 'green', 'blue', 'nir', 'swir1', 'swir2', 'pixel_qa'])

In [ ]:
dataset = dc.load(**params)

Composite Data


In [ ]:
import numpy as np
import datetime 
from utils.data_cube_utilities.dc_mosaic import ls8_unpack_qa
from utils.data_cube_utilities.dc_mosaic import create_median_mosaic

def clean_mask_ls8(ds):
    # Keep pixels flagged as either water or clear in the pixel_qa band
    water_mask = ls8_unpack_qa(ds.pixel_qa, cover_type = "water")
    clear_mask = ls8_unpack_qa(ds.pixel_qa, cover_type = "clear")
    return np.logical_or(water_mask, clear_mask)

def compress_on_time(_ds, date_range):
    subset = _ds.sel(time = slice(*date_range))  # select a smaller date-range
    return create_median_mosaic(subset, clean_mask = clean_mask_ls8(subset))

In [ ]:
dataset_composite = compress_on_time(dataset, ('2016-01-01', '2017-01-01'))

In [ ]:
dataset_composite

Normalization


In [ ]:
dataset_composite = dataset_composite.drop('pixel_qa')

In [ ]:
dataset_composite_normalized = dataset_composite.copy(deep = True)
for band_name in dataset_composite_normalized.data_vars:
    arr = dataset_composite_normalized[band_name].values
    arr[arr > 10000] = 10000  # clamp saturated surface-reflectance values
    arr[arr < 0] = 0          # clamp negative (nodata) values; was set to 10000, likely a typo

In [ ]:
dataset_composite_normalized = dataset_composite_normalized * (1/10000)  # scale to [0, 1]

In [ ]:
dataset_composite_normalized

Feature Engineering

The default features for a Landsat based classifier are:

  • red
  • green
  • blue
  • nir
  • swir1
  • swir2

Higher-order relationships between these features can also be used as inputs to a random forest classifier. The following indices are already common in Landsat analysis and correlate with several land-cover signals, such as water, chlorophyll, and sediment/topsoil:

  • NDVI
  • NDVI Covariance
  • NDWI
  • NDBI
  • SCI
  • CCCI
  • CVI
  • PNDVI

These are not necessarily the best features to include, but they are easy to source. The code below should serve as a template for adding new features to the random forest classifier; a generic helper capturing the common pattern is sketched right after this paragraph.
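
Most of the indices below follow the same normalized-difference pattern, (a - b) / (a + b). A minimal helper like the following (hypothetical, not part of the notebook's utilities) can serve as the template for new features:

In [ ]:
def normalized_difference(band_a, band_b, name):
    # Generic normalized-difference index over two xarray DataArrays
    return ((band_a - band_b) / (band_a + band_b)).rename(name)

# e.g. NDVI(dataset) below is equivalent to:
#   normalized_difference(dataset.nir, dataset.red, "NDVI")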


Plotting Utils

Plotting utilities to help visualize features.


In [ ]:
import matplotlib.pyplot as plt

def plot_xarray(data_array):
    fig = plt.figure(figsize = (15,5))
    plt.axis('equal')
    data_array.plot()

In [ ]:
def select_pixels_from_dataarray_using_pandas_array(ds,
                                                    pandas_array = None,
                                                    latitude_name = None,
                                                    longitude_name = None,
                                                    land_cover_name = None):
    """Sample the DataArray `ds` at each (lat, lon) point in the DataFrame
    `pandas_array`, grouping the sampled pixel values by land-cover class."""
    landcover_names = list(pandas_array.groupby(land_cover_name).size().index)
    training_dict   = {name: [] for name in landcover_names}
    pandas_rows     = [x[1] for x in pandas_array.iterrows()]

    for point_data in pandas_rows:
        training_dict[point_data[land_cover_name]].append(
            float(ds.sel(latitude  = point_data[latitude_name],
                         longitude = point_data[longitude_name],
                         method = 'nearest')))
    data_list = []
    target_list = []
    ordered_class_names = []
    for i, name in enumerate(training_dict.keys()):
        ordered_class_names.append(name)
        data_list += training_dict[name]
        target_list += [i]*len(training_dict[name])

    classes = list(map(lambda class_name_index: ordered_class_names[class_name_index], target_list))
    
    return pd.DataFrame({"data": data_list,
                         "landcover": classes})

from functools import partial    

select_pixels = partial(select_pixels_from_dataarray_using_pandas_array,
                        pandas_array   = df_new,
                        latitude_name  = "Latitude",
                        longitude_name = "Longitude",
                        land_cover_name = "LandUse")

In [ ]:
import seaborn as sns  

def jitter_plot(ds, name):
    data = select_pixels(ds)
    data = data.rename(index = str, columns={"data":name})
    plt.figure(figsize = (15,5))
    ax = sns.stripplot(x=name, y="landcover", data=data , jitter=True, linewidth=1)

Features

NDVI


In [ ]:
def NDVI(dataset):
    # parentheses fixed: .rename() previously applied only to the denominator
    return ((dataset.nir - dataset.red)/(dataset.nir + dataset.red)).rename("NDVI")

In [ ]:
plot_xarray( NDVI(dataset_composite) )

In [ ]:
jitter_plot(NDVI(dataset_composite), name = "NDVI")

NDVI Covariance


In [ ]:
import xarray as xr  

def covariance(da: xr.DataArray):
    # NOTE: this is the coefficient of variation (std/mean); the notebook
    # refers to it as "covariance" throughout.
    return da.std(dim = "time")/da.mean(dim = "time")

In [ ]:
def covariance_NDVI(ds, mask = None):
    ds_ndvi = NDVI(ds)
    if mask is not None:  # the mask is optional; .where(None) would fail
        ds_ndvi = ds_ndvi.where(mask)
    return covariance(ds_ndvi)

In [ ]:
import numpy as np

def finite_histogram(data_array, *args, **kwargs):
    # Histogram only the finite values; np.isfinite also drops NaNs
    x = data_array.values.copy()
    x = x[np.isfinite(x)]
    plt.hist(x, *args, **kwargs)

In [ ]:
ndvi_cov = covariance_NDVI(dataset,
                           mask = clean_mask_ls8(dataset))

NDVI coefficient of variation histogram


In [ ]:
fig = plt.figure(figsize = (15,5))
plt.imshow(ndvi_cov.values,
           vmin=0,
           vmax=1)

In [ ]:
plt.figure(figsize = (15,5))
finite_histogram(ndvi_cov, bins = 2000, range = (0,1))

In [ ]:
jitter_plot(ndvi_cov, name = "coefficient_of_variance_NDVI")

NBR


In [ ]:
def NBR(dataset):
    return ((dataset.nir - dataset.swir2) / (dataset.swir2 + dataset.nir)).rename("NBR")

In [ ]:
plot_xarray(  NBR(dataset_composite)  )

In [ ]:
jitter_plot( NBR(dataset_composite), name = "NBR")

NDBI


In [ ]:
def NDBI(dataset):
    return (( dataset.swir1 - dataset.nir ) / (dataset.swir1 + dataset.nir)).rename("NDBI")

In [ ]:
plot_xarray( NDBI(dataset_composite) )

In [ ]:
jitter_plot( NDBI(dataset_composite), name = "NDBI")

NDWI


In [ ]:
def NDWI_1(dataset):
    # parentheses fixed: .rename() previously applied only to the denominator
    return ((dataset.nir - dataset.swir1)/(dataset.nir + dataset.swir1)).rename("NDWI_1")

In [ ]:
plot_xarray( NDWI_1(dataset_composite) )

In [ ]:
jitter_plot( NDWI_1(dataset_composite), name = "NDWI_1")

In [ ]:
def NDWI_2(dataset):
    # parentheses fixed: .rename() previously applied only to the denominator
    return ((dataset.green - dataset.nir)/(dataset.green + dataset.nir)).rename("NDWI_2")

In [ ]:
plot_xarray( NDWI_2(dataset_composite) )

In [ ]:
jitter_plot( NDWI_2(dataset_composite), name = "NDWI_2")

SCI


In [ ]:
def SCI(dataset):
    return ((dataset.swir1 - dataset.nir)/(dataset.swir1 + dataset.nir)).rename("SCI")

In [ ]:
plot_xarray( SCI(dataset_composite) )

In [ ]:
jitter_plot( SCI(dataset_composite), name = "SCI")

NBR2


In [ ]:
def NBR2(dataset):
    return ((dataset.swir1 - dataset.swir2)/(dataset.swir1 + dataset.swir2)).rename("NBR2")

In [ ]:
plot_xarray( NBR2(dataset_composite) )

In [ ]:
jitter_plot( NBR2(dataset_composite), name = "NBR2")

CCCI


In [ ]:
def CCCI(dataset):
    # NOTE: as written this is identical to NDVI; the canonical CCCI requires
    # a red-edge band, which Landsat 8 does not provide.
    return ((dataset.nir - dataset.red)/(dataset.nir + dataset.red)).rename("CCCI")

In [ ]:
plot_xarray( CCCI(dataset_composite) )

In [ ]:
jitter_plot( CCCI(dataset_composite), name = "CCCI")

CVI


In [ ]:
def CVI(dataset):
    return (dataset.nir * (dataset.red / (dataset.green * dataset.green))).rename("CVI")

In [ ]:
plot_xarray( CVI(dataset_composite) )

In [ ]:
jitter_plot( CVI(dataset_composite), name = "CVI")

PNDVI


In [ ]:
def PNDVI(dataset):
    
    nir = dataset.nir
    green = dataset.green
    blue = dataset.blue
    red = dataset.red
    
    return ((nir - (green + red + blue))/(nir + (green + red + blue))).rename("PNDVI")

In [ ]:
plot_xarray( PNDVI(dataset_composite) )

In [ ]:
jitter_plot( PNDVI(dataset_composite), name = "PNDVI")

Raw Spectral Bands


In [ ]:
for key in dataset_composite.data_vars:
    jitter_plot( dataset_composite[key], name = key)
    plt.show()

Normalize and merge features into a single dataset


In [ ]:
import xarray as xr

def normalize(ds: xr.Dataset) -> xr.Dataset:
    # Min-max scale each variable to [0, 1]
    return (ds - ds.min())/(ds.max() - ds.min())

In [ ]:
pndvi  = PNDVI(  dataset_composite).to_dataset(name = "PNDVI")
ndbi   = NDBI(   dataset_composite).to_dataset(name = "NDBI")
nbr    = NBR(    dataset_composite).to_dataset(name = "NBR")
nbr2   = NBR2(   dataset_composite).to_dataset(name = "NBR2")
cvi    = CVI(    dataset_composite).to_dataset(name = "CVI")
ccci   = CCCI(   dataset_composite).to_dataset(name = "CCCI")
sci    = SCI(    dataset_composite).to_dataset(name = "SCI")
ndwi1  = NDWI_1( dataset_composite).to_dataset(name = "NDWI1")
ndwi2  = NDWI_2( dataset_composite).to_dataset(name = "NDWI2")
ndvi   = NDVI(   dataset_composite).to_dataset(name = "NDVI")
cov_ndvi = ndvi_cov.to_dataset(name = "NDVI_Covariance")
features = xr.merge([normalize(pndvi),
                     normalize(cvi),
                     normalize(nbr),
                     normalize(nbr2),
                     normalize(ndbi),
                     normalize(ccci),
                     normalize(sci),
                     normalize(ndwi1),
                     normalize(ndwi2),
                     normalize(ndvi),
                     cov_ndvi,
                     dataset_composite_normalized]
                   )

In [ ]:
features

In [ ]:
features.red.size


Build Training Data Set

This section formats the data for training a random forest classifier. The approach is not well polished and may unnecessarily complicate the process; a more compact alternative is sketched below.
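
In [ ]:
# Sketch of a more compact alternative (illustrative only; the loop-based
# approach below is what the rest of the notebook uses). xarray supports
# vectorized pointwise selection over coordinate DataArrays:
lat_idx = xr.DataArray(df_new["Latitude"].values, dims = "points")
lon_idx = xr.DataArray(df_new["Longitude"].values, dims = "points")
pixels = features.sel(latitude = lat_idx, longitude = lon_idx, method = 'nearest')
pixel_matrix = pixels.to_dataframe()[list(features.data_vars)].values  # (n_points, n_features)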


In [ ]:
landcover_names = list(df_new.groupby("LandUse").size().index)

In [ ]:
training_dict = {name: [] for name in landcover_names}
training_dict

In [ ]:
pandas_rows = [x[1] for x in df_new.iterrows()]

In [ ]:
# Example of first 2 elements
pandas_rows[:2]

In [ ]:
def xarray_pixel_to_array(_dataset):
    return [float(_dataset[variable_name].values) for variable_name in _dataset.data_vars]

for point_data in pandas_rows:
    training_dict[point_data["LandUse"]].append(
        xarray_pixel_to_array(
            features.sel(latitude = point_data["Latitude"],
                         longitude = point_data["Longitude"],
                         method = 'nearest')))

In [ ]:
data_list = []
target_list = []
ordered_class_names = []
for i, name in enumerate(training_dict.keys()):
    ordered_class_names.append(name)
    data_list += training_dict[name]
    target_list += [i]*len(training_dict[name])


Train/test split


In [ ]:
percentage_split = .33
random_number_seed = 44

In [ ]:
from sklearn.model_selection import train_test_split

in_train, in_test, out_train, out_test = train_test_split(data_list,
                                                          target_list,
                                                          test_size = percentage_split,
                                                          random_state = random_number_seed)
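
The split above is a single hold-out split rather than k-fold cross validation. For an actual k-fold estimate, scikit-learn's cross_val_score can be used (a quick sketch; a smaller n_estimators keeps it fast):

In [ ]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

# 5-fold cross validation over the full point set
cv_scores = cross_val_score(RandomForestClassifier(max_depth = 10, n_estimators = 100),
                            data_list, target_list, cv = 5)
cv_scores.mean()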

Examine training-data distribution


In [ ]:
plt.figure(figsize = (15,3))
plt.hist(out_train)

Examine testing-data distribution


In [ ]:
plt.figure(figsize = (15,3))
plt.hist(out_test)


Training a Random Forest Classifier


In [ ]:
from sklearn.ensemble import RandomForestClassifier

In [ ]:
clf = RandomForestClassifier(max_depth=10,
                             n_estimators = 1000,
                             random_state=random_number_seed)

In [ ]:
clf.fit(in_train, out_train)

In [ ]:
clf.score(in_test,
          out_test,
          sample_weight=None)

Linear Search for Optimum Depth


In [ ]:
max_depth = 40
random_number_seed = 43

In [ ]:
hyper_param_list = []
for i in range(2, max_depth):
    clf = RandomForestClassifier(max_depth = i,
                                 n_estimators = 1000,
                                 random_state = random_number_seed)
    clf.fit(in_train, out_train)
    hyper_param_list.append([i, clf.score(in_test, out_test)])

In [ ]:
x, y = zip(*hyper_param_list)

In [ ]:
plt.figure(figsize = (15,5))
plt.plot(x,y)

In [ ]:
max(hyper_param_list, key = lambda x: x[1])

In [ ]:
optimal_depth = max(hyper_param_list, key = lambda x: x[1])[0]

Validation

Rebuild Model


In [ ]:
clf = RandomForestClassifier(max_depth = optimal_depth,  # use the depth found above
                             n_estimators = 1000,
                             random_state = random_number_seed)

clf.fit(in_train, out_train)

Confusion Matrix


In [ ]:
from sklearn.metrics import confusion_matrix
y_true = out_test
y_pred = clf.predict(in_test)

cm = confusion_matrix(y_true, y_pred)

In [ ]:
cm

F1 Score


In [ ]:
y_true = np.array(out_test)
y_pred = np.array(clf.predict(in_test))

In [ ]:
def f1(tp,tn,fp,fn):
    _precision = precision(tp, fp)
    _recall    = recall(tp,fn)
    return 2*(_recall * _precision) / (_recall + _precision)
    
def precision(tp, fp):
    return tp/(tp + fp)

def recall(tp, fn):
    return tp / (tp + fn)

def evaluate(_x,_y, class_name):
    x = np.array(_x)
    y = np.array(_y)
    
    tp_mask = np.logical_and((x == class_name) , (y == class_name))
    fp_mask = np.logical_and((x == class_name) , (y != class_name))
    
    tn_mask = np.logical_and((x != class_name) , (y != class_name))
    fn_mask = np.logical_and((x != class_name) , (y == class_name))
    
    tp = sum(tp_mask)
    fp = sum(fp_mask)
    
    tn = sum(tn_mask)
    fn = sum(fn_mask)
    
    return tp, fp, tn, fn

In [ ]:
for index, label in enumerate(ordered_class_names):
    tp, fp, tn, fn = evaluate(y_pred, y_true, index)
    print("F1 score {} :      {}".format(label, f1(tp,tn,fp,fn)))

Plotting the Confusion Matrix


In [ ]:
import matplotlib.pyplot as plt
import numpy as np
import itertools

def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """
    Citation
    ---------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    """
    
    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    plt.figure(figsize=(10, 8))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    title = title if normalize == False else title + " (Normalized)"
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]


    thresh = cm.max() / 1.125 if normalize else cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        if normalize:
            plt.text(j, i, "{:0.2f}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="red" if cm[i, j] > thresh else "black")
        else:
            plt.text(j, i, "{:,}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")


    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))

    plt.show()



In [ ]:
plot_confusion_matrix(cm, ordered_class_names, title='Uruguay Confusion Matrix', cmap=plt.cm.Blues, normalize=False)



In [ ]:
plot_confusion_matrix(cm, ordered_class_names, title='Uruguay Confusion Matrix', cmap=plt.cm.Blues, normalize=True)


Classifier Sensitivity


In [ ]:
def sensitivity(_cm, labels):
    # scikit-learn confusion matrices have true labels on rows and predictions
    # on columns, so sensitivity (recall) divides the diagonal by row sums.
    _cm = np.array(_cm).astype(np.float64)
    totals = np.sum(_cm, axis = 1)  # fixed: was axis 0 (column sums)
    diag_percentage = [_cm[i][i]/total for i, total in enumerate(totals)]  # fixed: used global cm
    replace_nans = lambda x: 0 if np.isnan(x) else x
    diag_percentage = map(replace_nans, diag_percentage)
    return { a:b for a, b in zip(labels, diag_percentage) }



In [ ]:
sensitivity(cm, ordered_class_names)


Classifier Precision


In [ ]:
def precision(_cm, labels):
    # Precision divides the diagonal by column sums (total predicted per class).
    # NOTE: this shadows the scalar precision() helper defined earlier.
    _cm = np.array(_cm).astype(np.float64)
    totals = np.sum(_cm, axis = 0)  # fixed: was axis 1 (row sums)
    diag_percentage = [_cm[i][i]/total for i, total in enumerate(totals)]  # fixed: used global cm
    replace_nans = lambda x: 0 if np.isnan(x) else x
    diag_percentage = map(replace_nans, diag_percentage)
    return { a:b for a, b in zip(labels, diag_percentage) }



In [ ]:
precision(cm, ordered_class_names)
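
For a consolidated per-class summary of precision, recall (sensitivity), and F1, scikit-learn's classification_report covers the same ground:

In [ ]:
from sklearn.metrics import classification_report
print(classification_report(y_true, y_pred, target_names = ordered_class_names))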
