In [1]:
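// Create an RDD of lines from Spark's README (the file:// path points at a local install under /usr/local/spark)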
val textFile = sc.textFile("file:///usr/local/spark/README.md")

In [2]:
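// count() is an action: it returns the number of items (lines) in the RDD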
textFile.count()


Out[2]:
104

In [3]:
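// first() returns the first item in the RDD, i.e. the first line of the file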
textFile.first()


Out[3]:
# Apache Spark

In [4]:
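// filter() is a transformation: it returns a new RDD containing only the lines that mention "Spark"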
val linesWithSpark = textFile.filter(line => line.contains("Spark"))

In [5]:
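// Transformations and actions can be chained in a single expression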
textFile.filter(line => line.contains("Spark")).count()


Out[5]:
20

In [6]:
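// Map each line to its word count, then reduce to the largest value to find the line with the most words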
textFile.map(line => line.split(" ").size).reduce((a, b) => if (a > b) a else b)


Out[6]:
22

In [7]:
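// The same max-words computation, using Math.max in place of the if expression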
import java.lang.Math

textFile.map(line => line.split(" ").size).reduce((a, b) => Math.max(a, b))


Out[7]:
22

In [8]:
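// Classic word count: split lines into words, pair each word with 1, then sum the counts per word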
val wordCounts = textFile
  .flatMap(line => line.split(" "))
  .map(word => (word, 1))
  .reduceByKey((a, b) => a + b)

In [9]:
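// collect() brings the (word, count) pairs back to the driver as a local Array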
wordCounts.collect()


Out[9]:
Array((package,1), (this,1), (Version"](http://spark.apache.org/docs/latest/building-spark.html#specifying-the-hadoop-version),1), (Because,1), (Python,2), (page](http://spark.apache.org/documentation.html).,1), (cluster.,1), (its,1), ([run,1), (general,3), (have,1), (pre-built,1), (YARN,,1), ([http://spark.apache.org/developer-tools.html](the,1), (changed,1), (locally,2), (sc.parallelize(1,1), (only,1), (locally.,1), (several,1), (This,2), (basic,1), (Configuration,1), (learning,,1), (documentation,3), (first,1), (graph,1), (Hive,2), (info,1), (["Specifying,1), ("yarn",1), ([params]`.,1), ([project,1), (prefer,1), (SparkPi,2), (<http://spark.apache.org/>,1), (engine,1), (version,1), (file,1), (documentation,,1), (MASTER,1), (example,3), (["Parallel,1), (are...

In [10]:
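// cache() marks the RDD to be kept in memory; the data is only materialized on the next action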
linesWithSpark.cache()


Out[10]:
MapPartitionsRDD[2] at filter at <console>:21

In [11]:
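// This action computes and caches linesWithSpark; later actions on it reuse the in-memory copy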
linesWithSpark.count()


Out[11]:
20