Using landsat to detect coastline changes

In [37]:
# reading files 
import os
# matching filenames
import re
import logging

# reading tiffs
import osgeo.gdal
# computation
import numpy as np
# tables
import pandas as pd 

# segmentation
import skimage.segmentation

# plotting
import matplotlib.pyplot as plt
%matplotlib inline
Let's read in some landsat channels. These are already mapped in the same grid and have channel and date embedded in the name.

In [2]:
# read from this directory
datadir = '/Users/baart_f/data/landsat/zm'

# match the filename with this pattern
pattern = re.compile(r'^all\-(?P<date>\d+\-\d+\-\d+)\.(?P<channel>[\w\d]+)\.tif$')

# loop over all the files
files = []
for f in os.listdir(datadir):
    match = pattern.match(f)
    if match:
        record = match.groupdict()
        record['file'] = f
df = pd.DataFrame.from_records(files)
# ignore the temp and BQA channels
table = df[~np.in1d(, ('temp', 'BQA'))].pivot(index='channel', columns='date')
# these are the files we'll process

date 2013-07-19 2013-08-04 2013-08-13 2013-08-20 2013-09-05 2013-09-30 2013-10-07 2013-12-03 2013-12-10 2013-12-19 ... 2014-08-23 2014-09-01 2014-09-08 2014-09-17 2014-10-03 2014-10-10 2014-11-11 2014-12-06 2014-12-29 2015-01-07
blue ...
blue2 all-2013-07-19.blue2.tif all-2013-08-04.blue2.tif all-2013-08-13.blue2.tif all-2013-08-20.blue2.tif all-2013-09-05.blue2.tif all-2013-09-30.blue2.tif all-2013-10-07.blue2.tif all-2013-12-03.blue2.tif all-2013-12-10.blue2.tif all-2013-12-19.blue2.tif ... all-2014-08-23.blue2.tif all-2014-09-01.blue2.tif all-2014-09-08.blue2.tif all-2014-09-17.blue2.tif all-2014-10-03.blue2.tif all-2014-10-10.blue2.tif all-2014-11-11.blue2.tif all-2014-12-06.blue2.tif all-2014-12-29.blue2.tif all-2015-01-07.blue2.tif
green ...
nir all-2013-07-19.nir.tif all-2013-08-04.nir.tif all-2013-08-13.nir.tif all-2013-08-20.nir.tif all-2013-09-05.nir.tif all-2013-09-30.nir.tif all-2013-10-07.nir.tif all-2013-12-03.nir.tif all-2013-12-10.nir.tif all-2013-12-19.nir.tif ... all-2014-08-23.nir.tif all-2014-09-01.nir.tif all-2014-09-08.nir.tif all-2014-09-17.nir.tif all-2014-10-03.nir.tif all-2014-10-10.nir.tif all-2014-11-11.nir.tif all-2014-12-06.nir.tif all-2014-12-29.nir.tif all-2015-01-07.nir.tif
pan all-2013-07-19.pan.tif all-2013-08-04.pan.tif all-2013-08-13.pan.tif all-2013-08-20.pan.tif all-2013-09-05.pan.tif all-2013-09-30.pan.tif all-2013-10-07.pan.tif all-2013-12-03.pan.tif all-2013-12-10.pan.tif all-2013-12-19.pan.tif ... all-2014-08-23.pan.tif all-2014-09-01.pan.tif all-2014-09-08.pan.tif all-2014-09-17.pan.tif all-2014-10-03.pan.tif all-2014-10-10.pan.tif all-2014-11-11.pan.tif all-2014-12-06.pan.tif all-2014-12-29.pan.tif all-2015-01-07.pan.tif
red ...
swir1 all-2013-07-19.swir1.tif all-2013-08-04.swir1.tif all-2013-08-13.swir1.tif all-2013-08-20.swir1.tif all-2013-09-05.swir1.tif all-2013-09-30.swir1.tif all-2013-10-07.swir1.tif all-2013-12-03.swir1.tif all-2013-12-10.swir1.tif all-2013-12-19.swir1.tif ... all-2014-08-23.swir1.tif all-2014-09-01.swir1.tif all-2014-09-08.swir1.tif all-2014-09-17.swir1.tif all-2014-10-03.swir1.tif all-2014-10-10.swir1.tif all-2014-11-11.swir1.tif all-2014-12-06.swir1.tif all-2014-12-29.swir1.tif all-2015-01-07.swir1.tif
swir2 all-2013-07-19.swir2.tif all-2013-08-04.swir2.tif all-2013-08-13.swir2.tif all-2013-08-20.swir2.tif all-2013-09-05.swir2.tif all-2013-09-30.swir2.tif all-2013-10-07.swir2.tif all-2013-12-03.swir2.tif all-2013-12-10.swir2.tif all-2013-12-19.swir2.tif ... all-2014-08-23.swir2.tif all-2014-09-01.swir2.tif all-2014-09-08.swir2.tif all-2014-09-17.swir2.tif all-2014-10-03.swir2.tif all-2014-10-10.swir2.tif all-2014-11-11.swir2.tif all-2014-12-06.swir2.tif all-2014-12-29.swir2.tif all-2015-01-07.swir2.tif

8 rows × 35 columns

In [3]:
# recompute to 0-1
def normalize(arr):
    """normalize to 0-1.0"""
    arr = arr.astype('float64')
    return (arr - arr.min())/(arr.max() - arr.min())

# loop over all the bands, group by date and channel
allbands = []
for channel, row in table.iterrows():
    bands = []
    for date in table['file'].columns:
        filename = row['file'][date]
        ds = osgeo.gdal.Open(os.path.join(datadir, filename))
        band = ds.ReadAsArray(0)
    # add an extra channel to the end for the time     
# concatenate everything into 1 array    
data = np.concatenate(allbands, axis=-1)
# should be in the shape of space x time x channels 
# time is just another spatial dimension for the segmentation

(591, 971, 35, 8)

In [4]:
# segment all the images, it seems to convert floats back to ints, so we'll multiply with 255, check what's going on....
# We're using multichannels (n=8), no lab space as we don't have rgb
# n_segments should be quite large otherwise slic connectivity will force superpixels in first and last images together
# distances in time are 10x distance in space. TODO: compute variance in time and space and choose something more sophisticated. 
labels = skimage.segmentation.slic(
# we should have labels in space x time

(591, 971, 35)

In [11]:
# plot the segments
fig, axes = plt.subplots(7, 5, figsize=(20, 20))
colors = np.arange(labels.max())
# shuffle the colors so we see some patterns
# this should show a homogenous distribution over time and space
for i, ax in enumerate(axes.flat):
    ax.imshow(colors[labels[:,:,i]-1], cmap='Accent')