In [1]:
import os
In [2]:
from pyspark import SparkContext
In [3]:
# Point Spark at the executor distribution named by the SPARK_EXECUTOR_URI
# environment variable. Indexing os.environ raises KeyError if it is unset,
# so a missing setting fails here instead of later on the cluster.
SparkContext.setSystemProperty(
    key="spark.executor.uri",
    value=os.environ["SPARK_EXECUTOR_URI"],
)
In [4]:
# Create the SparkContext for this notebook. The master URL comes from the
# CLUSTER_URL environment variable; os.environ.get yields None when it is
# unset (presumably Spark then falls back to its own configuration -- verify).
sc = SparkContext(
    master=os.environ.get("CLUSTER_URL"),
    appName='pyspark-demo',
)
In [5]:
from pyspark.sql import SQLContext
In [6]:
# Wrap the SparkContext in a SQLContext, the entry point used below for
# reading data into DataFrames.
sqlContext = SQLContext(sparkContext=sc)
In [7]:
!wget http://files.figshare.com/1315364/iris.json
In [8]:
!hadoop fs -put iris.json /tmp
In [9]:
!hadoop fs -lsr /tmp
In [10]:
# Load the staged JSON file from HDFS into a DataFrame.
# The NameNode address defaults to the cluster this notebook was written
# against, but can be overridden with the HDFS_NAMENODE_URL environment
# variable so the notebook is not tied to a single hard-coded IP.
hdfs_namenode_url = os.environ.get("HDFS_NAMENODE_URL", "hdfs://54.159.244.205:8020")
iris = sqlContext.read.load(hdfs_namenode_url + '/tmp/iris.json', format='json')
In [11]:
# Print the leading rows of the DataFrame (show() with its default arguments).
iris.show()
In [12]:
# Print the DataFrame's schema (column names and types).
iris.printSchema()
In [14]:
# Show only the rows whose petalLength is below 1.2.
(
    iris
    .filter(iris["petalLength"] < 1.2)
    .show()
)
In [15]:
# Count the rows for each distinct value of the "species" column.
(
    iris
    .groupBy("species")
    .count()
    .show()
)
In [ ]: