In [1]:
# Add Spark's Python bindings and system site-packages to the module search path
import sys
sys.path.append("/usr/lib/spark/python")
sys.path.append("/usr/lib/spark/python/lib/py4j-0.10.4-src.zip")
sys.path.append("/usr/lib/python3/dist-packages")
# Define the environment variables used by Hadoop and PySpark
import os
os.environ["HADOOP_CONF_DIR"] = "/etc/hadoop/conf"
os.environ["PYSPARK_PYTHON"] = "python3"
os.environ["PYSPARK_DRIVER_PYTHON"] = "ipython"
# Import PySpark (to connect to the Spark cluster) and matplotlib for plotting
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql import DataFrameReader
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
In [2]:
appName = "kmeans_wssse"
masterURL = "spark://pheno0.phenovari-utwente.surf-hosted.nl:7077"

# Stop any running Spark context so a fresh one can be created for this notebook
try:
    sc.stop()
except NameError:
    print("A new Spark Context will be created.")

sc = SparkContext(conf=SparkConf().setAppName(appName).setMaster(masterURL))
sqlContext = SQLContext(sc)
spark = SparkSession.builder \
    .master(masterURL) \
    .appName(appName) \
    .getOrCreate()
# Alternatively, build the session from an existing SparkConf:
# spark = SparkSession.builder.config(conf=SparkConf().setAppName(appName).setMaster(masterURL)).getOrCreate()
In [3]:
#offline_dir_path = "hdfs:///user/pheno/avhrr/"
offline_dir_path = "hdfs:///user/pheno/spring-index/"
#geoTiff_dir = "SOST"
geoTiff_dir = "BloomFinal"
wssse_csv_path = offline_dir_path + geoTiff_dir + "/wssse.csv"

# Load the WSSSE results into a Spark DataFrame, then bring them to pandas for plotting
csvDf = sqlContext.read.format("csv") \
    .option("header", "false") \
    .option("inferSchema", "true") \
    .option("mode", "DROPMALFORMED") \
    .load(wssse_csv_path)
df = csvDf.toPandas()
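# Optional sanity check (sketch): with no header row, Spark auto-names the
# columns _c0, _c1, ... so the inferred schema and a few rows can be inspected.
csvDf.printSchema()
csvDf.show(5)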
In [4]:
# Keep the cluster count and WSSSE columns as a NumPy array
# (assumption: _c0 holds the number of clusters k and _c2 holds the WSSSE;
#  .values replaces the deprecated DataFrame.as_matrix())
res = df[['_c0', '_c2']].values
In [5]:
%matplotlib notebook
# Plot WSSSE against the number of clusters
plt.plot(res[:, 0], res[:, 1])
plt.show()
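# Label the axes (sketch; assumes _c0 is the number of clusters k and _c2 the WSSSE)
plt.xlabel("Number of clusters (k)")
plt.ylabel("WSSSE")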
In [6]:
# Release the cluster resources held by this notebook
sc.stop()