In [1]:
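// Create an RDD of lines from Spark's README (the file:// path points at a local install under /usr/local/spark)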
val textFile = sc.textFile("file:///usr/local/spark/README.md")

In [2]:
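// count() is an action: it returns the number of items (lines) in the RDD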
textFile.count()


Out[2]:
104

In [3]:
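// first() returns the first item in the RDD, i.e. the first line of the file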
textFile.first()


Out[3]:
# Apache Spark

In [4]:
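// filter() is a transformation: it returns a new RDD containing only the lines that mention "Spark"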
val linesWithSpark = textFile.filter(line => line.contains("Spark"))

In [5]:
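// Transformations and actions can be chained in a single expression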
textFile.filter(line => line.contains("Spark")).count()


Out[5]:
20

In [6]:
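// Map each line to its word count, then reduce to the largest value to find the line with the most words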
textFile.map(line => line.split(" ").size).reduce((a, b) => if (a > b) a else b)


Out[6]:
22

In [7]:
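// The same max-words computation, using Math.max in place of the if expression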
import java.lang.Math

textFile.map(line => line.split(" ").size).reduce((a, b) => Math.max(a, b))


Out[7]:
22

In [8]:
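// Classic word count: split lines into words, pair each word with 1, then sum the counts per word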
val wordCounts = textFile
  .flatMap(line => line.split(" "))
  .map(word => (word, 1))
  .reduceByKey((a, b) => a + b)

In [9]:
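// collect() brings the (word, count) pairs back to the driver as a local Array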
wordCounts.collect()


Out[9]:
Array((package,1), (this,1), (Version"](http://spark.apache.org/docs/latest/building-spark.html#specifying-the-hadoop-version),1), (Because,1), (Python,2), (page](http://spark.apache.org/documentation.html).,1), (cluster.,1), (its,1), ([run,1), (general,3), (have,1), (pre-built,1), (YARN,,1), ([http://spark.apache.org/developer-tools.html](the,1), (changed,1), (locally,2), (sc.parallelize(1,1), (only,1), (locally.,1), (several,1), (This,2), (basic,1), (Configuration,1), (learning,,1), (documentation,3), (first,1), (graph,1), (Hive,2), (info,1), (["Specifying,1), ("yarn",1), ([params]`.,1), ([project,1), (prefer,1), (SparkPi,2), (<http://spark.apache.org/>,1), (engine,1), (version,1), (file,1), (documentation,,1), (MASTER,1), (example,3), (["Parallel,1), (are...

In [10]:
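// cache() marks the RDD to be kept in memory; the data is only materialized on the next action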
linesWithSpark.cache()


Out[10]:
MapPartitionsRDD[2] at filter at <console>:21

In [11]:
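// This action computes and caches linesWithSpark; later actions on it reuse the in-memory copy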
linesWithSpark.count()


Out[11]:
20