In [1]:
#Step 1 - Check spark version
#Type:
#sc.version
In [2]:
#Step 2 - Create RDD of Numbers 1-10
#Type:
#x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
#x_nbr_rdd = sc.parallelize(x)
In [3]:
#Step 2 - Extract the first element of the RDD
#Type:
#x_nbr_rdd.first()
In [4]:
#Step 2 - Extract the first 5 elements of the RDD
#Type:
#x_nbr_rdd.take(5)
In [5]:
#Step 2 - Create an RDD holding a single string, then extract its first element
#Type:
#y = ["Hello Spark!"]
#y_str_rdd = sc.parallelize(y)
#y_str_rdd.first()
In [6]:
#Step 3 - Create an RDD holding a single string, then extract its first element
#Type:
#z = ["Hello World!, Hello Universe!, I love Spark"]
#z_str_rdd = sc.parallelize(z)
#z_str_rdd.first()
In [7]:
#Step 3 - Create an RDD with one element per word, then extract the first 7 words
#Type:
#z_str2_rdd = z_str_rdd.flatMap(lambda line: line.split(" "))
#z_str2_rdd.take(7)
In [8]:
#Step 3 - Count the words that contain "Hello"
#Type:
#z_str3_rdd = z_str2_rdd.filter(lambda line: "Hello" in line)
#print("The count of words 'Hello' in: " + repr(z_str_rdd.first()))
#print("Is: " + repr(z_str3_rdd.count()))
In [9]:
#Step 3 - Count the words that contain "Spark"
#Type:
#z_str4_rdd = z_str2_rdd.filter(lambda line: "Spark" in line)
#print("The count of words 'Spark' in: " + repr(z_str_rdd.first()))
#print("Is: " + repr(z_str4_rdd.count()))