In [ ]:
from sys import path
path.append("../")
%matplotlib inline
In [ ]:
import pandas as pd
In [ ]:
df = pd.read_csv('data.csv')
df.head()
In [ ]:
df.groupby("LandUse").size()
In [ ]:
from matplotlib import pyplot
import seaborn as sns
fig, ax = pyplot.subplots(figsize=(15,5))
sns.countplot(x="LandUse",data=df, palette="Greens_d");
The count plot above shows the full label set, including Forestry, Fruittrees, Nativeforest, Stubble, and Water. My suggestion is reducing the feature/label-set to the following:
Naturalgrassland, Summercrops, and Prairie
as well as a grouping of Forestry, Fruittrees, and Nativeforest under:
Forest
then grouping the remaining labels under a class called:
Misc
In [ ]:
def combine_classes_on_dataframe(_df, classes = [], combine_to = "default", column_name = "LandUse"):
    # Replace every value of `column_name` that appears in `classes` with the
    # single label `combine_to`, returning a new dataframe.
    df_new = _df.copy()
    df_new[column_name].update(df_new[column_name].map(lambda x: combine_to if x in classes else x))
    return df_new
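For reference, the manual regrouping performed in the next cell could equivalently be written with this helper:
In [ ]:
df_new = combine_classes_on_dataframe(df, classes = ["Forestry", "Fruittrees", "Nativeforest"], combine_to = "Forest")
keep = ["Forest", "Prairie", "Summercrops", "Naturalgrassland"]
df_new = combine_classes_on_dataframe(df_new, classes = [x for x in df_new["LandUse"].unique() if x not in keep], combine_to = "Misc")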
In [ ]:
df_new = df.copy()
df_new['LandUse'].update(df_new['LandUse'].map(lambda x: "Forest" if x in ["Forestry","Fruittrees","Nativeforest"] else x ))
df_new['LandUse'].update(df_new['LandUse'].map(lambda x: "Misc" if x not in ["Forest","Prairie","Summercrops","Naturalgrassland"] else x ))
In [ ]:
df_new.groupby("LandUse").size()
In [ ]:
fig, ax = pyplot.subplots(figsize=(15,5))
sns.countplot(x="LandUse",data=df_new, palette="Greens_d");
In [ ]:
import datacube
dc = datacube.Datacube(config = "/home/localuser/.datacube.conf")
Listing products with 'uruguay' in their name
In [ ]:
dc.list_products()[dc.list_products()['name'].str.contains("uruguay")]
Load the data
In [ ]:
params = dict(platform = 'LANDSAT_8',
product = 'ls8_lasrc_uruguay',
latitude = (-34.44988376, -34.096445),
longitude = (-56.29119062, -55.24653668),
time = ('2016-01-01', '2017-01-01'),
measurements = ['red', 'green', 'blue', 'nir', 'swir1', 'swir2', 'pixel_qa'])
In [ ]:
dataset = dc.load(**params)
Composite Data
In [ ]:
import numpy as np
import datetime
from utils.data_cube_utilities.dc_mosaic import ls8_unpack_qa
from utils.data_cube_utilities.dc_mosaic import create_median_mosaic
def clean_mask_ls8(ds):
    # Combine the water and clear bit-masks from pixel_qa into one boolean mask.
    water_mask = ls8_unpack_qa(ds.pixel_qa, cover_type = "water")
    clear_mask = ls8_unpack_qa(ds.pixel_qa, cover_type = "clear")
    clean_mask = np.logical_or(water_mask, clear_mask)
    return clean_mask

def compress_on_time(_ds, date_range):
    subset = _ds.sel(time = slice(*date_range)) # select a smaller date-range
    clean_mask = clean_mask_ls8(subset)
    return create_median_mosaic(subset, clean_mask = clean_mask)
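ls8_unpack_qa tests bit flags in the pixel_qa band. A minimal sketch of the idea, assuming the Landsat Collection 1 pixel_qa layout (bit 1 = clear, bit 2 = water); treat the utility itself as the source of truth for the actual bit positions:
In [ ]:
def unpack_bit(qa_array, bit):
    # True wherever the given QA bit is set (assumed bit layout, see above).
    return (qa_array.astype(np.uint16) & (1 << bit)) > 0
# A "clean" mask equivalent in spirit to clean_mask_ls8:
# np.logical_or(unpack_bit(dataset.pixel_qa, 1), unpack_bit(dataset.pixel_qa, 2))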
In [ ]:
dataset_composite = compress_on_time(dataset, ('2016-01-01', '2017-01-01'))
In [ ]:
dataset_composite
Normalization
In [ ]:
dataset_composite = dataset_composite.drop('pixel_qa')
In [ ]:
dataset_composite_normalized = dataset_composite.copy(deep = True)
for band_name in dataset_composite_normalized.data_vars:
    arr = dataset_composite_normalized[band_name].values
    # Clamp surface reflectance to the valid range [0, 10000].
    arr[arr > 10000] = 10000
    arr[arr < 0] = 0
In [ ]:
dataset_composite_normalized = dataset_composite_normalized * (1/10000)
In [ ]:
dataset_composite_normalized
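As a quick sanity check, every band should now lie in [0, 1]:
In [ ]:
for band_name in dataset_composite_normalized.data_vars:
    band = dataset_composite_normalized[band_name]
    print(band_name, float(band.min()), float(band.max()))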
The default features for a Landsat-based classifier are:
red, green, blue, nir, swir1, swir2
Higher-order relationships between these features can also be used as inputs to a random forest classifier. The following indices are already used in Landsat analysis and correlate with the existence of several land classes like water, chlorophyll, and sediment/top-soil:
NDVI, NDVI Covariance, NDWI, NDBI, SCI, CCCI, CVI, PNDVI
These are not necessarily the best features to include, but they are easy to source. The code below should serve as a template for adding new features to the random forest classifier.
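As an illustration of that template, a hypothetical extra index, GNDVI (the green-band analogue of NDVI, not part of the feature set used below), would follow the same pattern:
In [ ]:
def GNDVI(dataset):
    # Green NDVI: a scale-invariant band ratio, so it works on either
    # the raw or the normalized composite.
    return ((dataset.nir - dataset.green)/(dataset.nir + dataset.green)).rename("GNDVI")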
In [ ]:
import matplotlib.pyplot as plt
def plot_xarray(data_array):
    fig = plt.figure(figsize = (15,5))
    plt.axis('equal')
    data_array.plot()
In [ ]:
def select_pixels_from_dataarray_using_pandas_array(ds,
                                                    pandas_array = None,
                                                    latitude_name = None,
                                                    longitude_name = None,
                                                    land_cover_name = None):
    # For each labelled point in `pandas_array`, sample the nearest pixel of `ds`
    # and return a dataframe pairing the sampled value with its land-cover label.
    landcover_names = list(pandas_array.groupby(land_cover_name).size().index)
    training_dict = {name: [] for name in landcover_names}
    pandas_rows = [x[1] for x in pandas_array.iterrows()]
    for point_data in pandas_rows:
        training_dict[point_data[land_cover_name]].append(
            float(ds.sel(latitude = point_data[latitude_name],
                         longitude = point_data[longitude_name],
                         method = 'nearest')))
    data_list = []
    target_list = []
    ordered_class_names = []
    for i, name in enumerate(training_dict.keys()):
        ordered_class_names.append(name)
        data_list += training_dict[name]
        target_list += [i]*len(training_dict[name])
    classes = list(map(lambda class_name_index: ordered_class_names[class_name_index], target_list))
    return pd.DataFrame({"data": data_list,
                         "landcover": classes})
from functools import partial
select_pixels = partial(select_pixels_from_dataarray_using_pandas_array,
pandas_array = df_new,
latitude_name = "Latitude",
longitude_name = "Longitude",
land_cover_name = "LandUse")
In [ ]:
import seaborn as sns
def jitter_plot(ds, name):
    data = select_pixels(ds)
    data = data.rename(index = str, columns={"data": name})
    plt.figure(figsize = (15,5))
    ax = sns.stripplot(x=name, y="landcover", data=data, jitter=True, linewidth=1)
NDVI
In [ ]:
def NDVI(dataset):
    # Parenthesize the full ratio so .rename applies to the result,
    # not just the denominator.
    return ((dataset.nir - dataset.red)/(dataset.nir + dataset.red)).rename("NDVI")
In [ ]:
plot_xarray( NDVI(dataset_composite) )
In [ ]:
jitter_plot(NDVI(dataset_composite), name = "NDVI")
NDVI Covariance
In [ ]:
import xarray as xr
def covariance(da: xr.DataArray):
    # Despite the name, this computes the coefficient of variation
    # (std/mean) over time.
    return da.std(dim = "time")/da.mean(dim = "time")
In [ ]:
def covariance_NDVI(ds, mask = None):
    ## TODO: Lift masking out of function
    # mask = np.ones(ds[list(ds.data_vars.keys())[0]].values.shape).astype(bool) if mask is None else mask
    ds_ndvi = NDVI(ds)
    ds_ndvi = ds_ndvi.where(mask)
    return covariance(ds_ndvi)
In [ ]:
import numpy as np
def finite_histogram(data_array, *args, **kwargs):
    # Drop NaNs and infinities before histogramming.
    x = data_array.values.copy()
    x = x[~np.isnan(x)]
    x = x[np.isfinite(x)]
    data = plt.hist(x, *args, **kwargs)
In [ ]:
ndvi_cov = covariance_NDVI(dataset,
mask = clean_mask_ls8(dataset))
In [ ]:
fig = plt.figure(figsize = (15,5))
plt.imshow(ndvi_cov.values,
vmin=0,
vmax=1)
In [ ]:
plt.figure(figsize = (15,5))
finite_histogram(ndvi_cov, bins = 2000, range = (0,1))
In [ ]:
jitter_plot(ndvi_cov, name = "coefficient_of_variance_NDVI")
NBR
In [ ]:
def NBR(dataset):
    return ((dataset.nir - dataset.swir2) / (dataset.swir2 + dataset.nir)).rename("NBR")
In [ ]:
plot_xarray( NBR(dataset_composite) )
In [ ]:
jitter_plot( NBR(dataset_composite), name = "NBR")
NDBI
In [ ]:
def NDBI(dataset):
    return ((dataset.swir1 - dataset.nir) / (dataset.swir1 + dataset.nir)).rename("NDBI")
In [ ]:
plot_xarray( NDBI(dataset_composite) )
In [ ]:
jitter_plot( NDBI(dataset_composite), name = "NDBI")
NDWI
In [ ]:
def NDWI_1(dataset):
    # Parenthesized so .rename applies to the whole ratio.
    return ((dataset.nir - dataset.swir1)/(dataset.nir + dataset.swir1)).rename("NDWI_1")
In [ ]:
plot_xarray( NDWI_1(dataset_composite) )
In [ ]:
jitter_plot( NDWI_1(dataset_composite), name = "NDWI_1")
In [ ]:
def NDWI_2(dataset):
    # Parenthesized so .rename applies to the whole ratio.
    return ((dataset.green - dataset.nir)/(dataset.green + dataset.nir)).rename("NDWI_2")
In [ ]:
plot_xarray( NDWI_2(dataset_composite) )
In [ ]:
jitter_plot( NDWI_2(dataset_composite), name = "NDWI_2")
SCI
In [ ]:
def SCI(dataset):
    # Note: as defined here, SCI is identical to NDBI above.
    return ((dataset.swir1 - dataset.nir)/(dataset.swir1 + dataset.nir)).rename("SCI")
In [ ]:
plot_xarray( SCI(dataset_composite) )
In [ ]:
jitter_plot( SCI(dataset_composite), name = "SCI")
NBR2
In [ ]:
def NBR2(dataset):
    # Renamed for consistency with the other index functions.
    return ((dataset.swir1 - dataset.swir2)/(dataset.swir1 + dataset.swir2)).rename("NBR2")
In [ ]:
plot_xarray( NBR2(dataset_composite) )
In [ ]:
jitter_plot( NBR2(dataset_composite), name = "NBR2")
CCCI
In [ ]:
def CCCI(dataset):
    # Note: as written this is the same ratio as NDVI; true CCCI requires a
    # red-edge band, which Landsat 8 does not provide.
    return ((dataset.nir - dataset.red)/(dataset.nir + dataset.red)).rename("CCCI")
In [ ]:
plot_xarray( CCCI(dataset_composite) )
In [ ]:
jitter_plot( CCCI(dataset_composite), name = "CCCI")
CVI
In [ ]:
def CVI(dataset):
    return (dataset.nir * (dataset.red / (dataset.green * dataset.green))).rename("CVI")
In [ ]:
plot_xarray( CVI(dataset_composite) )
In [ ]:
jitter_plot( CVI(dataset_composite), name = "CVI")
PNDVI
In [ ]:
def PNDVI(dataset):
    nir = dataset.nir
    green = dataset.green
    blue = dataset.blue
    red = dataset.red
    return ((nir - (green + red + blue))/(nir + (green + red + blue))).rename("PNDVI")
In [ ]:
plot_xarray( PNDVI(dataset_composite) )
In [ ]:
jitter_plot( PNDVI(dataset_composite), name = "PNDVI")
In [ ]:
for key in dataset_composite.data_vars:
    jitter_plot(dataset_composite[key], name = key)
    plt.show()
Normalize and merge features into a single data-set
In [ ]:
import xarray as xr
def normalize(da: xr.DataArray) -> xr.DataArray:
    # Min-max scale to [0, 1]; also works variable-by-variable on a Dataset.
    return (da - da.min())/(da.max() - da.min())
In [ ]:
pndvi = PNDVI( dataset_composite).to_dataset(name = "PNDVI")
ndbi = NDBI( dataset_composite).to_dataset(name = "NDBI")
nbr = NBR( dataset_composite).to_dataset(name = "NBR")
nbr2 = NBR2( dataset_composite).to_dataset(name = "NBR2")
cvi = CVI( dataset_composite).to_dataset(name = "CVI")
ccci = CCCI( dataset_composite).to_dataset(name = "CCCI")
sci = SCI( dataset_composite).to_dataset(name = "SCI")
ndwi1 = NDWI_1( dataset_composite).to_dataset(name = "NDWI1")
ndwi2 = NDWI_2( dataset_composite).to_dataset(name = "NDWI2")
ndvi = NDVI( dataset_composite).to_dataset(name = "NDVI")
cov_ndvi = ndvi_cov.to_dataset(name = "NDVI_Covariance")
features = xr.merge([normalize(pndvi),
normalize(cvi),
normalize(nbr),
normalize(nbr2),
normalize(ndbi),
normalize(ccci),
normalize(sci),
normalize(ndwi1),
normalize(ndwi2),
normalize(ndvi),
cov_ndvi,
dataset_composite_normalized]
)
In [ ]:
features
In [ ]:
features.red.size
This section formats the data for random forest classifier training. The approach is not well polished and may unnecessarily complicate the process.
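A sketch of a possibly simpler alternative, using xarray's vectorized point indexing to sample all labelled points in one sel call:
In [ ]:
# Index all labelled points at once along a shared "points" dimension.
lat_points = xr.DataArray(df_new["Latitude"].values, dims = "points")
lon_points = xr.DataArray(df_new["Longitude"].values, dims = "points")
samples = features.sel(latitude = lat_points, longitude = lon_points, method = 'nearest')
X = np.stack([samples[name].values for name in samples.data_vars], axis = 1)  # (n_points, n_features)
y = df_new["LandUse"].values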
In [ ]:
landcover_names = list(df_new.groupby("LandUse").size().index)
In [ ]:
training_dict = {name: [] for name in landcover_names}
training_dict
In [ ]:
pandas_rows = [x[1] for x in df_new.iterrows()]
In [ ]:
# Example of first 2 elements
pandas_rows[:2]
In [ ]:
def xarray_pixel_to_array(_dataset):
    # Flatten one pixel of a Dataset into a plain list of feature values.
    return [float(_dataset[variable_name].values) for variable_name in _dataset.data_vars]

for point_data in pandas_rows:
    training_dict[point_data["LandUse"]].append(
        xarray_pixel_to_array(
            features.sel(latitude = point_data["Latitude"],
                         longitude = point_data["Longitude"],
                         method = 'nearest')))
In [ ]:
data_list = []
target_list = []
ordered_class_names = []
for i, name in enumerate(training_dict.keys()):
    ordered_class_names.append(name)
    data_list += training_dict[name]
    target_list += [i]*len(training_dict[name])
In [ ]:
percentage_split = .33
random_number_seed = 44
In [ ]:
from sklearn.model_selection import train_test_split
in_train, in_test, out_train, out_test = train_test_split(data_list,
target_list,
test_size = percentage_split,
random_state = random_number_seed)
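Since the class counts are skewed, an optional stratified split keeps the class proportions equal across the train and test sets:
In [ ]:
# Optional: stratify on the labels so each split mirrors the class distribution.
in_train, in_test, out_train, out_test = train_test_split(data_list,
                                                          target_list,
                                                          test_size = percentage_split,
                                                          random_state = random_number_seed,
                                                          stratify = target_list)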
Examine training-data distribution
In [ ]:
plt.figure(figsize = (15,3))
plt.hist(out_train)
Examine testing-data distribution
In [ ]:
plt.figure(figsize = (15,3))
plt.hist(out_test)
In [ ]:
from sklearn.ensemble import RandomForestClassifier
In [ ]:
clf = RandomForestClassifier(max_depth=10,
n_estimators = 1000,
random_state=random_number_seed)
In [ ]:
clf.fit(in_train, out_train)
In [ ]:
clf.score(in_test,
out_test,
sample_weight=None)
In [ ]:
max_depth = 40
random_number_seed = 43
In [ ]:
hyper_param_list = []
# Sweep tree depth and record held-out accuracy for each setting.
for i in range(2, max_depth):
    clf = RandomForestClassifier(max_depth=i,
                                 n_estimators = 1000,
                                 random_state=random_number_seed)
    clf.fit(in_train, out_train)
    hyper_param_list.append([i, clf.score(in_test, out_test)])
In [ ]:
x, y = zip(*hyper_param_list)
In [ ]:
plt.figure(figsize = (15,5))
plt.plot(x,y)
In [ ]:
max(hyper_param_list, key = lambda x: x[1])
In [ ]:
optimal_depth = max(hyper_param_list, key = lambda x: x[1])[0]
Rebuild Model
In [ ]:
clf = RandomForestClassifier(max_depth=optimal_depth,
n_estimators = 1000,
random_state=random_number_seed)
clf.fit(in_train, out_train)
Confusion Matrix
In [ ]:
from sklearn.metrics import confusion_matrix
y_true = out_test
y_pred = clf.predict(in_test)
cm = confusion_matrix(y_true, y_pred)
In [ ]:
cm
F1 Score
In [ ]:
y_true = np.array(out_test)
y_pred = np.array(clf.predict(in_test))
In [ ]:
def f1(tp, tn, fp, fn):
    _precision = precision(tp, fp)
    _recall = recall(tp, fn)
    return 2*(_recall * _precision) / (_recall + _precision)

def precision(tp, fp):
    return tp/(tp + fp)

def recall(tp, fn):
    return tp / (tp + fn)

def evaluate(_x, _y, class_name):
    # _x: predictions, _y: ground truth; count per-class confusion entries.
    x = np.array(_x)
    y = np.array(_y)
    tp_mask = np.logical_and((x == class_name), (y == class_name))
    fp_mask = np.logical_and((x == class_name), (y != class_name))
    tn_mask = np.logical_and((x != class_name), (y != class_name))
    fn_mask = np.logical_and((x != class_name), (y == class_name))
    tp = sum(tp_mask)
    fp = sum(fp_mask)
    tn = sum(tn_mask)
    fn = sum(fn_mask)
    return tp, fp, tn, fn
In [ ]:
for index, label in enumerate(ordered_class_names):
    tp, fp, tn, fn = evaluate(y_pred, y_true, index)
    print("F1 score {} : {}".format(label, f1(tp, tn, fp, fn)))
Plotting the Confusion Matrix
In [ ]:
import matplotlib.pyplot as plt
import numpy as np
import itertools
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """
    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    """
    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    plt.figure(figsize=(10, 8))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    title = title if normalize == False else title + " (Normalized)"
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    thresh = cm.max() / 1.125 if normalize else cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        if normalize:
            plt.text(j, i, "{:0.2f}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="red" if cm[i, j] > thresh else "black")
        else:
            plt.text(j, i, "{:,}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.show()
In [ ]:
plot_confusion_matrix(cm, ordered_class_names, title='Uruguay Confusion Matrix', cmap=plt.cm.Blues, normalize=False)
In [ ]:
plot_confusion_matrix(cm, ordered_class_names, title='Uruguay Confusion Matrix', cmap=plt.cm.Blues, normalize=True)
Classifier Sensitivity
In [ ]:
def sensitivity(_cm, labels):
    # Per-class recall: diagonal count over the row total (true instances),
    # given sklearn's convention of rows = true labels, columns = predictions.
    _cm = np.array(_cm).astype(np.float64)
    totals = np.sum(_cm, axis = 1)
    diag_percentage = [_cm[i][i]/total for i, total in enumerate(totals)]
    replace_nans = lambda x: 0 if np.isnan(x) else x
    diag_percentage = map(replace_nans, diag_percentage)
    return {a: b for a, b in zip(labels, diag_percentage)}
In [ ]:
sensitivity(cm, ordered_class_names)
Classifier Precision
In [ ]:
def precision(_cm, labels):
    # Per-class precision: diagonal count over the column total (predicted instances).
    _cm = np.array(_cm).astype(np.float64)
    totals = np.sum(_cm, axis = 0)
    diag_percentage = [_cm[i][i]/total for i, total in enumerate(totals)]
    replace_nans = lambda x: 0 if np.isnan(x) else x
    diag_percentage = map(replace_nans, diag_percentage)
    return {a: b for a, b in zip(labels, diag_percentage)}
In [ ]:
precision(cm, ordered_class_names)
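These values can be cross-checked against scikit-learn's built-in precision_score and recall_score:
In [ ]:
from sklearn.metrics import precision_score, recall_score
# average=None returns one score per class, in label order.
print(dict(zip(ordered_class_names, precision_score(y_true, y_pred, average = None))))
print(dict(zip(ordered_class_names, recall_score(y_true, y_pred, average = None))))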