In [1]:
import sys

from pyspark import SparkContext, SparkConf

if __name__ == "__main__":

  # create Spark context with Spark configuration
  conf = SparkConf().setAppName("Spark Count")
  sc = SparkContext(conf=conf)

  # get threshold
  threshold = int(sys.argv[2])

  # read in text file and split each document into words
  tokenized = sc.textFile(sys.argv[1]).flatMap(lambda line: line.split(" "))

  # count the occurrence of each word
  wordCounts = tokenized.map(lambda word: (word, 1)).reduceByKey(lambda v1, v2: v1 + v2)

  # filter out words with fewer than threshold occurrences
  filtered = wordCounts.filter(lambda pair: pair[1] >= threshold)

  # count characters
  charCounts = filtered.flatMap(lambda pair: pair[0]).map(lambda c: (c, 1)).reduceByKey(lambda v1, v2: v1 + v2)

  results = charCounts.collect()
  print(repr(results)[1:-1])


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-1-8cb1e1c08f01> in <module>()
      7   # create Spark context with Spark configuration
      8   conf = SparkConf().setAppName("Spark Count")
----> 9   sc = SparkContext(conf=conf)
     10 
     11   # get threshold

/usr/local/spark/python/pyspark/context.py in __init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls)
    110         """
    111         self._callsite = first_spark_call() or CallSite(None, None, None)
--> 112         SparkContext._ensure_initialized(self, gateway=gateway)
    113         try:
    114             self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,

/usr/local/spark/python/pyspark/context.py in _ensure_initialized(cls, instance, gateway)
    257                         " created by %s at %s:%s "
    258                         % (currentAppName, currentMaster,
--> 259                             callsite.function, callsite.file, callsite.linenum))
    260                 else:
    261                     SparkContext._active_spark_context = instance

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=PySparkShell, master=local[*]) created by <module> at /Applications/anaconda/lib/python3.5/site-packages/IPython/utils/py3compat.py:186 
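
The ValueError is raised because this notebook's PySpark kernel already created a SparkContext (app=PySparkShell, master=local[*]) when it started, and only one SparkContext can be active per driver process. A minimal sketch of a fix, assuming the shell-provided context, is to ask Spark for the existing context instead of constructing a new one:

# getOrCreate() returns the already-active SparkContext if one exists,
# and only builds a fresh one (using this conf) otherwise
conf = SparkConf().setAppName("Spark Count")
sc = SparkContext.getOrCreate(conf=conf)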

In [2]:
sc


Out[2]:
<pyspark.context.SparkContext at 0x1073bdfd0>
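
The sc shown above is the live context created by the shell, so the rest of the word-count pipeline can use it directly. If the script really needs its own configuration, the existing context can be stopped first; a brief sketch under that assumption:

sc.stop()                                     # release the notebook's existing context
conf = SparkConf().setAppName("Spark Count")
sc = SparkContext(conf=conf)                  # a new context can now be created without the ValueError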

In [4]:
sc.binaryRecords


Out[4]:
<bound method SparkContext.binaryRecords of <pyspark.context.SparkContext object at 0x1073bdfd0>>
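
Referencing sc.binaryRecords without calling it only returns the bound method, as the output above shows. Actually reading fixed-length binary records requires a path and a record length; a small sketch with a hypothetical file of 16-byte records:

# hypothetical input file of 16-byte records; each record is returned as a byte string
records = sc.binaryRecords("data/records.bin", recordLength=16)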
