In [1]:
# Initialize Spark in Python: import the SparkContext entry point
from pyspark import SparkContext

In [2]:
# Create the SparkContext (local master, application name)
sc = SparkContext("local", "Hello World APP")

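If you need more control over the configuration, the SparkContext can also be built from a SparkConf object. The following is a minimal sketch of an equivalent setup (do not run it alongside the cell above, since only one SparkContext may be active at a time):

from pyspark import SparkConf, SparkContext

# Equivalent setup via SparkConf: master URL and application name
conf = SparkConf().setMaster("local").setAppName("Hello World APP")
sc = SparkContext(conf=conf)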
In [3]:
# Should be some text file on your system; adjust the path to your installation
logFile = "/usr/apache/spark-2.0.2-bin-hadoop2.7/README.md"
logData = sc.textFile(logFile).cache()

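Note that cache() is lazy: it only marks the RDD for in-memory persistence, and the data is actually materialized by the first action that touches it (the first count() below). You can inspect the caching state like this:

# cache() marks the RDD; nothing is computed or stored yet
logData.is_cached           # True once cache() has been called
logData.getStorageLevel()   # storage level used when the RDD is persisted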
In [4]:
# Count the lines containing the letter 'a' and the letter 'b';
# count() is an action, so it triggers evaluation (and populates the cache)
numAs = logData.filter(lambda s: 'a' in s).count()
numBs = logData.filter(lambda s: 'b' in s).count()

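Each count() above is a separate action, so the file is scanned twice (against the cached data). As a sketch, the same two counts can also be computed in a single pass with map and reduce:

# Single pass: emit a (has_a, has_b) indicator pair per line, then sum pairwise
numAs, numBs = logData \
    .map(lambda s: (1 if 'a' in s else 0, 1 if 'b' in s else 0)) \
    .reduce(lambda x, y: (x[0] + y[0], x[1] + y[1]))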
In [5]:
print("Lines with a: %i, lines with b: %i" % (numAs, numBs))


Lines with a: 61, lines with b: 27

In [6]:
# Shut down the SparkContext and release its resources
sc.stop()

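In a standalone script (as opposed to an interactive notebook), a common pattern is to guard the stop() call with try/finally so the context is released even if the job fails. A minimal sketch:

from pyspark import SparkContext

sc = SparkContext("local", "Hello World APP")
try:
    # ... transformations and actions go here ...
    pass
finally:
    sc.stop()  # always release the context, even on error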