In [2]:
import datetime
from pytz import timezone
print "Last run @%s" % (datetime.datetime.now(timezone('US/Pacific')))
#
import sys
print "Python Version : %s" % (sys.version)
#
from pyspark.sql import SparkSession
print "Spark Version : %s" % (spark.version)  # `spark` is the SparkSession pre-created by the notebook kernel
from pyspark.context import SparkContext
print "Spark Version : %s" % (sc.version)     # `sc` is the matching, pre-created SparkContext
#
from pyspark.conf import SparkConf
conf = SparkConf()
print conf.toDebugString()
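The `spark` (SparkSession) and `sc` (SparkContext) objects used above are pre-created by the PySpark notebook kernel. If you run this outside such an environment, a minimal sketch for creating them yourself looks like the following (the master URL and app name are assumptions, not part of the original run):
In [ ]:
# Only needed when `spark`/`sc` are not already provided by the kernel
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("rdd-vs-dataframe-walkthrough") \
    .getOrCreate()
sc = spark.sparkContext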
In [3]:
data = xrange(1,101)
In [4]:
data_rdd = sc.parallelize(data)
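parallelize() distributes the data over the default number of partitions; the optional numSlices argument controls this explicitly. A quick sketch (the partition count of 4 is an arbitrary choice):
In [ ]:
print(data_rdd.getNumPartitions())                  # default parallelism for this environment
print(sc.parallelize(data, 4).getNumPartitions())   # explicitly request 4 partitions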
In [5]:
data_rdd.take(3)
Out[5]:
[1, 2, 3]
In [6]:
# Make sure the RDD works
data_rdd.filter(lambda x: x < 10).collect()
Out[6]:
[1, 2, 3, 4, 5, 6, 7, 8, 9]
In [6]:
data_rdd.top(5)
Out[6]:
[100, 99, 98, 97, 96]
In [7]:
# Move on to DataFrames
df = data_rdd.map(lambda x:[x]).toDF(['SeqNo']) # toDF() needs each row wrapped in a list (or tuple)
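The same DataFrame can be built directly from the Python sequence with spark.createDataFrame, skipping the intermediate RDD; a sketch, reusing the `data` sequence from above:
In [ ]:
# Each row as a one-element tuple, with the column name supplied as the schema
df2 = spark.createDataFrame([(x,) for x in data], ['SeqNo'])
df2.show(3)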
In [8]:
df.show(10)
In [9]:
df.filter(df.SeqNo <= 10).show()
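The same filter can also be written in SQL by registering the DataFrame as a temporary view; the view name `seq` is an arbitrary choice:
In [ ]:
df.createOrReplaceTempView('seq')
spark.sql('SELECT * FROM seq WHERE SeqNo <= 10').show()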
In [10]:
import pyspark.sql.functions as F
df.withColumn("Square",F.pow(df.SeqNo,2)).show(10) # Python's built-in pow() doesn't accept Column objects; use pyspark.sql.functions.pow
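For a simple square, plain Column arithmetic works as well and avoids the functions module; a small sketch:
In [ ]:
# Column objects support the usual arithmetic operators
df.withColumn("Square", df.SeqNo * df.SeqNo).show(10)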
In [11]:
# Reduce vs fold: both aggregate an RDD, but they differ on an empty one
rdd_1 = sc.parallelize([])  # an empty RDD
In [12]:
rdd_1.reduce(lambda a,b : a+b)  # reduce() on an empty RDD raises a ValueError
In [ ]:
rdd_1.take(10)  # confirm the RDD really is empty
In [ ]:
rdd_1.fold(0,lambda a,b : a+b)  # fold() returns its zero value (0) on an empty RDD instead of raising
In [ ]:
rdd_2 = sc.parallelize([1,2])
In [ ]:
from operator import add
rdd_2.fold(0,add)  # 0 + 1 + 2 = 3
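One caveat worth remembering: fold() applies the zero value once per partition and once more when merging the partition results, so a non-identity zero value changes the answer. A sketch (the exact figure depends on how many partitions rdd_2 happens to have):
In [ ]:
# With addition, the result works out to sum(elements) + zeroValue * (numPartitions + 1)
print(rdd_2.getNumPartitions())
print(rdd_2.fold(10, add))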
In [ ]:
rdd_x = sc.parallelize(['a','b','c'])
rdd_y = sc.parallelize([1,2,3])
In [ ]:
rdd_x.cartesian(rdd_y).take(20)  # all 3 x 3 = 9 (letter, number) pairs
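If element-wise pairing is wanted instead of every combination, zip() does that, provided the two RDDs have the same number of partitions and the same number of elements per partition (true here, since both came from parallelize with the default settings):
In [ ]:
# Pairs elements positionally: ('a', 1), ('b', 2), ('c', 3)
rdd_x.zip(rdd_y).collect()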