In [1]:
# Initialize Spark in Python
from pyspark import SparkContext
In [2]:
# Set up the SparkContext: local mode, application name "Hello World APP"
sc = SparkContext("local", "Hello World APP")
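The master URL and application name can also be supplied through a SparkConf object; this is an equivalent sketch (only one SparkContext may be active at a time, so it would replace, not supplement, the call above):

from pyspark import SparkConf, SparkContext

# Equivalent configuration via SparkConf
conf = SparkConf().setMaster("local").setAppName("Hello World APP")
sc = SparkContext(conf=conf)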
In [4]:
# Path to a local text file (Spark's bundled README)
logFile = "/usr/apache/spark-2.0.2-bin-hadoop2.7/README.md"
In [5]:
# Create an RDD of strings, one element per line of the file
lines = sc.textFile(logFile)
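textFile() also accepts a minimum partition count, and small local collections can be turned into RDDs with parallelize(); a sketch with illustrative variable names:

# Ask for at least 4 partitions when reading the file
linesPartitioned = sc.textFile(logFile, minPartitions=4)
# Build an RDD from an in-memory Python collection
numbers = sc.parallelize([1, 2, 3, 4])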
In [6]:
# Transformation: keep only the lines that contain "Python"
pythonLines = lines.filter(lambda line: "Python" in line)
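Transformations such as filter() are lazy: Spark only records the operation, and nothing is read from the file until an action runs. As a sketch on the same RDDs, transformations can be chained freely without triggering any work:

# Still lazy: no job is launched by these lines
sparkLines = lines.filter(lambda line: "Spark" in line)
shortPythonLines = pythonLines.filter(lambda line: len(line) < 80)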
In [10]:
# Action: return the first matching line (this triggers the computation)
pythonLines.first()
Out[10]:
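first() is one of several actions that force evaluation of the lazy transformation chain; a brief sketch of two other common ones on the same RDD:

pythonLines.count()   # number of lines containing "Python"
pythonLines.take(3)   # up to three matching lines, as a Python list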
In [12]:
# Persist (cache) the RDD so it is not recomputed for each action below
pythonLines.persist()
pythonLines.count()
pythonLines.first()
Out[12]:
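With no argument, persist() caches the RDD at the default MEMORY_ONLY level. An explicit storage level can be passed instead; a sketch, assuming memory may be tight (a persisted RDD must be unpersisted before its level can change):

from pyspark import StorageLevel

# Drop the earlier in-memory copy, then allow spilling partitions to disk
pythonLines.unpersist()
pythonLines.persist(StorageLevel.MEMORY_AND_DISK)
pythonLines.count()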
In [13]:
# Shut down the SparkContext and release its resources
sc.stop()