In this notebook the reader finds code to read GeoTiff files, single- or multi-band, from HDFS. The notebook reads GeoTiffs for two phenology products: a phenology model and phenology remote sensing.
Each GeoTiff is read as a ByteArray and then held in memory using MemoryFile from the rasterio Python package. The same package is then used to plot a GeoTiff's band, or multiple bands using sub-plots, histograms, etc.
With this example the user can load GeoTiffs from HDFS and then explore all the features of Python packages such as rasterio. It also shows how to run k-means from scikit-learn, as sketched in the cells below. The k-means part of this notebook is inspired by Unsupervised classification of imagery using scikit-learn.
In [2]:
# Add all dependencies to the Python path
import sys
sys.path.append("/usr/lib/spark/python")
sys.path.append("/usr/lib/spark/python/lib/py4j-0.10.4-src.zip")
sys.path.append("/usr/lib/python3/dist-packages")

# Define environment variables
import os
os.environ["HADOOP_CONF_DIR"] = "/etc/hadoop/conf"
os.environ["PYSPARK_PYTHON"] = "python3"
os.environ["PYSPARK_DRIVER_PYTHON"] = "ipython"

# Load PySpark to connect to a Spark cluster
from pyspark import SparkConf, SparkContext
#from osgeo import gdal

# To read GeoTiffs as a ByteArray
from io import BytesIO
import rasterio
from rasterio.io import MemoryFile
import numpy as np
import matplotlib.pyplot as plt
import sklearn.cluster
In [3]:
appName = "plot_GeoTiff"
masterURL="spark://emma0.emma.nlesc.nl:7077"
#A context needs to be created if it does not already exist
try:
sc.stop()
except NameError:
print("A new Spark Context will be created.")
sc = SparkContext(conf = SparkConf().setAppName(appName).setMaster(masterURL))
In [ ]:
data_dir = "hdfs:///user/hadoop/avhrr/SOST/"
start_year = 1989
end_year = 2014
years = list(range(start_year, end_year+1))
file_paths = []
sat_data = np.ndarray(shape=(30388736, len(years)), dtype=float , order='C')
for f in range (len(years)):
file_path = data_dir + "/av_SOST" + str(years[f]) + "v4_SIx.tif"
print(file_path)
data = sc.binaryFiles(file_path).take(1)
dataByteArray = bytearray(data[0][1])
memfile = MemoryFile(dataByteArray)
data_set = memfile.open()
# Convert to NumPy array the 1st band
data_array = np.asarray(data_set.read())[0]
# Replace NaN
#data_array[np.isnan(data_array)] = -1
#Remove NaN
data_array = data_array[~np.isnan(data_array)]
sat_data[:,f] = data_array
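The plotting mentioned in the introduction is not shown in this section. As a minimal sketch (not part of the original cells), the last opened `data_set` can be visualized directly with matplotlib; the figure size and bin count are arbitrary choices.
In [ ]:
# Sketch: plot the 1st band of the last opened GeoTiff and a histogram
# of its valid pixels (figure size and bin count are arbitrary)
band = data_set.read(1)
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.imshow(band)
plt.title("SOST, band 1")
plt.subplot(1, 2, 2)
plt.hist(band[~np.isnan(band)], bins=50)
plt.title("Histogram of valid pixels")
plt.show()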
In [ ]:
data_dir = "hdfs:///user/hadoop/spring-index/BloomFinal"
start_year = 1989
end_year = 2014
years = list(range(start_year, end_year+1))
file_paths = []
mod_data = np.ndarray(shape=(30388736, len(years)), dtype=float , order='C')
for f in range (len(years)):
file_path = data_dir + "/" + str(years[f]) + ".tif"
print(file_path)
data = sc.binaryFiles(file_path).take(1)
dataByteArray = bytearray(data[0][1])
memfile = MemoryFile(dataByteArray)
data_set = memfile.open()
# Convert to NumPy array the 1st band
data_array = np.asarray(data_set.read())[0]
# Replace NaN
#data_array[np.isnan(data_array)] = -1
#Remove NaN
data_array = data_array[~np.isnan(data_array)]
mod_data[:,f] = data_array
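The k-means step from the introduction is also not shown here. A minimal sketch, assuming each row of `sat_data` is one pixel's 1989-2014 time series; the cluster count is an arbitrary choice:
In [ ]:
# Sketch: unsupervised classification of the pixel time series with k-means.
# n_clusters=5 is arbitrary; fitting all 30M samples may take a while.
kmeans = sklearn.cluster.KMeans(n_clusters=5, random_state=0)
labels = kmeans.fit_predict(sat_data)
# Inspect cluster sizes
print(np.bincount(labels))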
In [4]: