In [1]:
# Make the Spark/py4j Python sources and the system and JupyterHub package
# directories importable. These appends must run before `pyspark` is imported.
import sys
sys.path.append("/usr/lib/spark/python")
sys.path.append("/usr/lib/spark/python/lib/py4j-0.10.4-src.zip")
sys.path.append("/usr/lib/python3/dist-packages")
sys.path.append("/data/local/jupyterhub/modules/python")
# Environment variables: Hadoop configuration directory and the Python
# executables named for PySpark workers and driver.
import os
os.environ["HADOOP_CONF_DIR"] = "/etc/hadoop/conf"
os.environ["PYSPARK_PYTHON"] = "python3"
os.environ["PYSPARK_DRIVER_PYTHON"] = "ipython"
import subprocess
# PySpark entry points used to connect to the Spark cluster.
from pyspark import SparkConf, SparkContext
from hdfs import InsecureClient
from tempfile import TemporaryFile
#from osgeo import gdal
# In-memory helpers: MemoryFile is used below to open GeoTiff bytes with rasterio.
from io import BytesIO
from rasterio.io import MemoryFile
# numpy is imported under both names; the cells below use `numpy` and `np`.
import numpy
import numpy as np
import pandas
import datetime
import matplotlib.pyplot as plt
import rasterio
from rasterio import plot
from os import listdir
from os.path import isfile, join
# scipy.linalg.norm is used by detemineNorm.
import scipy.linalg
In [2]:
# Debug flag; no other cell visible in this notebook reads it.
debugMode = True
In [3]:
appName = "plot_GeoTiff"
masterURL = "spark://emma0.emma.nlesc.nl:7077"

# When this cell is re-run, the context from the previous run is stopped
# first. On a fresh kernel `sc` is not defined yet, so the NameError is
# caught and only a notice is printed.
try:
    sc.stop()
except NameError:
    print("A new Spark Context will be created.")

# Build the configuration first, then create the context from it.
sparkConf = SparkConf().setAppName(appName).setMaster(masterURL)
sc = SparkContext(conf=sparkConf)
conf = sc.getConf()
In [4]:
def getModeAsArray(filePath):
    """Load the first band of a GeoTiff as a flat float64 array without NaNs.

    The file is fetched through the module-level Spark context ``sc`` and
    decoded in memory with rasterio's ``MemoryFile``.

    Parameters
    ----------
    filePath : str
        Path handed to ``sc.binaryFiles`` (the notebook passes ``hdfs://``
        URIs). Only the first file returned is used.

    Returns
    -------
    numpy.ndarray
        One-dimensional ``float64`` array with the values of the first band,
        NaN entries removed.

    Raises
    ------
    FileNotFoundError
        If ``sc.binaryFiles(filePath)`` returns no file.
    """
    data = sc.binaryFiles(filePath).take(1)
    if not data:
        # Fail with a clear message instead of an IndexError on data[0].
        raise FileNotFoundError("No file found at: " + str(filePath))
    # Each element is a (path, content) pair; only the content is needed.
    byteArray = bytearray(data[0][1])
    # Context managers close the dataset and the in-memory file even when
    # reading raises.
    with MemoryFile(byteArray) as memfile:
        with memfile.open() as dataset:
            # read(1) decodes only the first band rather than all bands.
            band = dataset.read(1)
    array = np.array(band, dtype=np.float64).ravel()
    return array[~np.isnan(array)]
In [5]:
def detemineNorm(array1, array2):
    """Return the distance between two arrays, ignoring an overall sign flip.

    The notebook compares SVD modes, which are only defined up to sign, so
    both ``||array1 - array2||`` and ``||array1 + array2||`` are computed and
    the smaller one is returned. Taking the minimum (rather than switching
    to the sum only when the difference exceeds 1) gives the sign-invariant
    distance for vectors of any magnitude.

    Parameters
    ----------
    array1, array2 : numpy.ndarray
        Arrays of identical shape.

    Returns
    -------
    float
        ``min(norm(array1 - array2), norm(array1 + array2))``. If the shapes
        differ, an error line is printed and ``0`` is returned; note that
        this ``0`` does NOT mean the inputs match.
    """
    if array1.shape != array2.shape:
        print("Error: shapes are not the same: (" + str(array1.shape) + " vs " + str(array2.shape) + ")")
        # Kept for backward compatibility: later cells call this with
        # mismatched shapes and expect a printed message, not an exception.
        return 0
    differenceNorm = scipy.linalg.norm(array1 - array2)
    sumNorm = scipy.linalg.norm(array1 + array2)
    return min(differenceNorm, sumNorm)
In [6]:
# Read the CSV line by line, split each line on commas, convert every field
# to a Python float, and collect the rows on the driver.
textFile1 = (
    sc.textFile("hdfs:///user/emma/svd/spark/BloomGridmetLeafGridmetCali3/U.csv")
    .map(lambda line: line.split(','))
    .map(lambda fields: [float(field) for field in fields])
    .collect()
)
In [7]:
# Turn the collected rows into a float64 matrix; its first three columns are
# the vectors compared further down.
array1 = np.array(textFile1, dtype=np.float64)
vector11 = array1[:, 0]
vector12 = array1[:, 1]
vector13 = array1[:, 2]
In [8]:
# Read the second CSV the same way, converting every field to np.float64,
# and collect the rows on the driver.
textFile2 = (
    sc.textFile("hdfs:///user/emma/svd/BloomGridmetLeafGridmetCali/U.csv")
    .map(lambda line: line.split(','))
    .map(lambda fields: [np.float64(field) for field in fields])
    .collect()
)
In [9]:
# Arrange the collected values as 37 rows of 23926 entries each; rows 0-2 are
# the vectors compared further down.
# NOTE(review): reshape is not a transpose - confirm the CSV layout matches.
array2 = np.array(textFile2, dtype=np.float64).reshape((37, 23926))
vector21, vector22, vector23 = array2[0], array2[1], array2[2]
In [10]:
# Display the shape of array2 as a sanity check.
array2.shape
Out[10]:
In [11]:
# Print the norm for each of the first three vector pairs, one per line.
for firstVector, secondVector in (
    (vector11, vector21),
    (vector12, vector22),
    (vector13, vector23),
):
    print(detemineNorm(firstVector, secondVector))
In [12]:
# Load one mode from each of the two output directories and show their norm.
# array1/array2 are reassigned here and reused by the following cells.
firstModePath = "hdfs:///user/emma/svd/spark/BloomGridmetLeafGridmetCali3/u_tiffs/svd_u_0_26.tif"
secondModePath = "hdfs:///user/emma/svd/BloomGridmetLeafGridmetCali/ModeU01.tif"
array1 = getModeAsArray(firstModePath)
array2 = getModeAsArray(secondModePath)
detemineNorm(array1, array2)
Out[12]:
In [13]:
# Compare each GeoTiff-derived array with the first vector of each CSV;
# the order of the four printed lines matches the nested iteration.
for modeArray in (array1, array2):
    for csvVector in (vector11, vector21):
        print(detemineNorm(modeArray, csvVector))
In [14]:
# Boolean mask: True where an element of array1 does not occur in vector21.
np.in1d(array1, vector21, invert=True)
Out[14]:
In [15]:
# Print the first ten entries of both arrays and their sum at full precision.
for idx in range(10):
    firstValue = array1[idx]
    secondValue = array2[idx]
    print("%.19f %0.19f %0.19f" % (firstValue, secondValue, (firstValue + secondValue)))
In [16]:
# First mode of the BloomFinalLowPRLeafFinalLowPR run: load it from both
# output directories and show the norm.
firstModePath = "hdfs:///user/emma/svd/BloomFinalLowPRLeafFinalLowPR/ModeU01.tif"
secondModePath = "hdfs:///user/emma/svd/spark/BloomFinalLowPRLeafFinalLowPR3/u_tiffs/svd_u_0_3.tif"
array1 = getModeAsArray(firstModePath)
array2 = getModeAsArray(secondModePath)
detemineNorm(array1, array2)
Out[16]:
In [17]:
# Second mode of the BloomFinalLowPRLeafFinalLowPR run: load it from both
# output directories and show the norm.
firstModePath = "hdfs:///user/emma/svd/BloomFinalLowPRLeafFinalLowPR/ModeU02.tif"
secondModePath = "hdfs:///user/emma/svd/spark/BloomFinalLowPRLeafFinalLowPR3/u_tiffs/svd_u_1_3.tif"
array1 = getModeAsArray(firstModePath)
array2 = getModeAsArray(secondModePath)
detemineNorm(array1, array2)
Out[17]:
In [18]:
# Same two files as the In [16] cell (ModeU01.tif vs svd_u_0_3.tif): the
# first-mode comparison is executed again here.
firstModePath = "hdfs:///user/emma/svd/BloomFinalLowPRLeafFinalLowPR/ModeU01.tif"
secondModePath = "hdfs:///user/emma/svd/spark/BloomFinalLowPRLeafFinalLowPR3/u_tiffs/svd_u_0_3.tif"
array1 = getModeAsArray(firstModePath)
array2 = getModeAsArray(secondModePath)
detemineNorm(array1, array2)
Out[18]:
In [36]:
# Compare all 37 modes: ModeU01.tif ... ModeU37.tif in the first directory
# against svd_u_0_26.tif ... svd_u_36_26.tif in the Spark output directory,
# printing one norm per mode.
modeDir = "hdfs:///user/emma/svd/BloomGridmetLeafGridmetCali/"
sparkModeDir = "hdfs:///user/emma/svd/spark/BloomGridmetLeafGridmetCali3/u_tiffs/"
for i in range(37):
    # ModeU files are numbered from 1 with two digits; zfill pads 1-9 with a
    # leading zero, replacing the separate i < 9 branch.
    path1 = modeDir + "ModeU" + str(i + 1).zfill(2) + ".tif"
    array1 = getModeAsArray(path1)
    # The Spark files are numbered from 0 without padding.
    array2 = getModeAsArray(sparkModeDir + "svd_u_" + str(i) + "_26.tif")
    print(detemineNorm(array1, array2))
End of Notebook
In [ ]: