In [1]:
import sys
# Make the cluster's Spark and py4j installations importable from this kernel.
sys.path.append("/usr/lib/spark/python")
sys.path.append("/usr/lib/spark/python/lib/py4j-0.10.4-src.zip")
sys.path.append("/usr/lib/python3/dist-packages")

import os
# Point Spark at the Hadoop configuration and pin the Python interpreters it should use.
os.environ["HADOOP_CONF_DIR"] = "/etc/hadoop/conf"
os.environ["PYSPARK_PYTHON"] = "python3"
os.environ["PYSPARK_DRIVER_PYTHON"] = "ipython"
from pyspark.mllib.clustering import KMeans, KMeansModel
from pyspark import SparkConf, SparkContext
from osgeo import gdal
from io import BytesIO
import matplotlib.pyplot as plt
import rasterio
from rasterio import plot
from rasterio.io import MemoryFile
In [2]:
appName = "kmeans_mlib_hdfs"
masterURL = "spark://pheno0.phenovari-utwente.surf-hosted.nl:7077"

# Stop any Spark context left over from a previous run before creating a new one.
try:
    sc.stop()
except NameError:
    print("A new Spark Context will be created.")

sc = SparkContext(conf=SparkConf().setAppName(appName).setMaster(masterURL))
In [15]:
file_path = "hdfs:///user/hadoop/spring-index/LastFreeze/1980.tif"
clusters_path = "hdfs:///user/pheno/spring-index/BloomFinal/clusters_5_35.tif"
# binaryFiles returns (path, bytes) pairs; take(1) pulls the single matching file to the driver.
file_data = sc.binaryFiles(file_path).take(1)
file_dataByteArray = bytearray(file_data[0][1])

clusters_data = sc.binaryFiles(clusters_path).take(1)
clusters_dataByteArray = bytearray(clusters_data[0][1])
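As a hedged sketch (not part of the original notebook): the same (path, bytes) pattern could be applied to a whole directory without collecting everything to the driver. The *.tif glob and the assumption that rasterio is installed on every executor are illustrative, not confirmed by the notebook.

def read_profile(payload):
    # payload is a (path, bytes) pair produced by sc.binaryFiles.
    path, data = payload
    with MemoryFile(bytearray(data)).open() as ds:
        return (path, ds.profile)

# Hypothetical: extract the metadata profile of every GeoTIFF in the LastFreeze directory on the executors.
profiles = sc.binaryFiles("hdfs:///user/hadoop/spring-index/LastFreeze/*.tif").map(read_profile).collect()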
In [16]:
# Open the in-memory GeoTIFFs with rasterio and inspect their metadata profiles.
file_dataset = MemoryFile(file_dataByteArray).open()
print(file_dataset.profile)

clusters_dataset = MemoryFile(clusters_dataByteArray).open()
print(clusters_dataset.profile)
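A hedged sketch (not executed in this notebook) of how the opened raster could feed the MLlib KMeans imported above: band 1 is pulled into a NumPy array, nodata pixels are dropped, and the remaining values are clustered. The nodata handling, k=5, and parallelizing every pixel from the driver are illustrative assumptions.

import numpy as np

# Read band 1 of the clusters raster into a NumPy array.
band = clusters_dataset.read(1)

# Drop nodata pixels (assumption: the profile's nodata value marks missing cells)
# and reshape to one single-feature vector per pixel, as MLlib KMeans expects.
nodata = clusters_dataset.profile.get("nodata")
valid = band[band != nodata] if nodata is not None else band.ravel()
pixels = valid.reshape(-1, 1).astype(float)

# Train an MLlib KMeans model on the pixel values (k=5 is an illustrative choice).
rdd = sc.parallelize(pixels.tolist())
model = KMeans.train(rdd, k=5, maxIterations=10, initializationMode="k-means||")
print(model.clusterCenters)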
In [17]:
# Render band 1 of the LastFreeze raster.
plot.show((file_dataset, 1))
Out[17]: [raster plot of band 1 of 1980.tif]
In [18]:
# Render band 1 of the clusters raster for comparison.
plot.show((clusters_dataset, 1))
Out[18]: [raster plot of band 1 of clusters_5_35.tif]