In [1]:
# Initialize Spark in Python
from pyspark import SparkContext
In [2]:
# Set up the SparkContext: local mode, application name "Hello World APP"
sc = SparkContext("local", "Hello World APP")
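The master URL and application name can also be supplied through a SparkConf object; this is an equivalent sketch (only one SparkContext may be active at a time, so it would replace, not supplement, the call above):

from pyspark import SparkConf, SparkContext

# Equivalent configuration via SparkConf
conf = SparkConf().setMaster("local").setAppName("Hello World APP")
sc = SparkContext(conf=conf)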
In [4]:
# Path to a local text file (Spark's bundled README)
logFile = "/usr/apache/spark-2.0.2-bin-hadoop2.7/README.md"
In [5]:
# Create an RDD of strings, one element per line of the file
lines = sc.textFile(logFile)
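textFile() also accepts a minimum partition count, and small local collections can be turned into RDDs with parallelize(); a sketch with illustrative variable names:

# Ask for at least 4 partitions when reading the file
linesPartitioned = sc.textFile(logFile, minPartitions=4)
# Build an RDD from an in-memory Python collection
numbers = sc.parallelize([1, 2, 3, 4])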
In [6]:
# Transformation: keep only the lines that contain "Python"
pythonLines = lines.filter(lambda line: "Python" in line)
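Transformations such as filter() are lazy: Spark only records the operation, and nothing is read from the file until an action runs. As a sketch on the same RDDs, transformations can be chained freely without triggering any work:

# Still lazy: no job is launched by these lines
sparkLines = lines.filter(lambda line: "Spark" in line)
shortPythonLines = pythonLines.filter(lambda line: len(line) < 80)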
In [10]:
# Action: return the first matching line (this triggers the computation)
pythonLines.first()
Out[10]:
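first() is one of several actions that force evaluation of the lazy transformation chain; a brief sketch of two other common ones on the same RDD:

pythonLines.count()   # number of lines containing "Python"
pythonLines.take(3)   # up to three matching lines, as a Python list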
In [12]:
# Persist (cache) the RDD so it is not recomputed for each action below
pythonLines.persist()
pythonLines.count()
pythonLines.first()
Out[12]:
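With no argument, persist() caches the RDD at the default MEMORY_ONLY level. An explicit storage level can be passed instead; a sketch, assuming memory may be tight (a persisted RDD must be unpersisted before its level can change):

from pyspark import StorageLevel

# Drop the earlier in-memory copy, then allow spilling partitions to disk
pythonLines.unpersist()
pythonLines.persist(StorageLevel.MEMORY_AND_DISK)
pythonLines.count()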
In [13]:
# Shut down the SparkContext and release its resources
sc.stop()