In [1]:
# Initialize Spark in Python
from pyspark import SparkContext

In [2]:
# Set up the SparkContext: run locally with the application name "Hello World APP".
# NOTE(review): only one SparkContext may be active per JVM — re-running this cell
# on a live kernel will raise an error until sc.stop() is called.
sc = SparkContext("local", "Hello World APP")

In [4]:
logFile = "/usr/apache/spark-2.0.2-bin-hadoop2.7/README.md"

In [5]:
# Create an RDD of strings, one element per line of the file.
# textFile is lazy: nothing is read until an action is executed.
lines = sc.textFile(logFile)

In [6]:
# Transformation (lazy): keep only the lines that mention "Python".
pythonLines = lines.filter(lambda row: "Python" in row)

In [10]:
# Action: triggers evaluation of the pipeline and returns the first matching line.
pythonLines.first()


Out[10]:
u'high-level APIs in Scala, Java, Python, and R, and an optimized engine that'

In [12]:
# Persist (cache) the filtered RDD so the two actions below reuse it instead of
# re-reading and re-filtering the file each time.
# BUG FIX: the original line read `pythonLines.persist` — a bare method reference
# with no call, so the RDD was never actually marked for persistence.
pythonLines.persist()
pythonLines.count()
pythonLines.first()


Out[12]:
u'high-level APIs in Scala, Java, Python, and R, and an optimized engine that'

In [13]:
sc.stop()

In [ ]: