Sample PySpark Notebook

See the README for more details and a link to the home repo.


In [1]:
import pyspark
sc = pyspark.SparkContext('local[*]')  # 'local[*]' runs Spark locally, using all available cores

In [76]:
# do something to prove it works

# For the DISCRETE uniform distribution on the integers a..b,
# Mean = (a+b)/2, Variance = ((b - a + 1)**2 - 1) / 12
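
# Quick worked check of those formulas: for a = 0, b = 9, the mean is
# (0 + 9) / 2 = 4.5 and the variance is ((9 - 0 + 1)**2 - 1) / 12 = 99 / 12 = 8.25,
# so the stdev is sqrt(8.25) ~= 2.872.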

def discrete_uniform_experiment():
    from math import sqrt
    NUM = 1000000
    SAMPLE = 0.1525
    a, b = 0, NUM-1
    mean = (a+b)/2
    var = ((b - a + 1)**2 - 1) / 12
    stdev = sqrt(var)
    print("Discrete Uniform across %d-%d" % (a,b))
    print("  Sampling at %.3f%% (with replacement)" % (SAMPLE * 100,))
    print("  Analytic     mean(stdev): %.3f(%.3f)" % (mean,stdev))

    # We find an experimental mean and stdev by taking a sample from the
    # entire distribution WITH REPLACEMENT. Note that we cache() the
    # sampled RDD because we run several actions (count, two sums) over it.
    rdd = sc.parallelize(range(NUM)).sample(True, SAMPLE).cache()
    exp_count = rdd.count()
    exp_mean = rdd.sum() / exp_count
    exp_devs = rdd.map(lambda x: (x - exp_mean) ** 2)
    exp_stdev = sqrt(exp_devs.sum() / (exp_count - 1))
    print("  Experimental mean(stdev): %.3f(%.3f)" % (exp_mean, exp_stdev))

discrete_uniform_experiment()


Discrete Uniform across 0-999999
  Sampling at 15.250% (with replacement)
  Analytic     mean(stdev): 499999.500(288675.135)
  Experimental mean(stdev): 500801.263(288604.931)
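
If the same SparkContext `sc` is still live, a minimal cross-check of the numbers above is PySpark's built-in RDD.stats(): it returns a StatCounter carrying the count, mean, and sample standard deviation, computed in a single pass over the sampled RDD. A sketch (left unexecuted):

In [ ]:
# Cross-check sketch: let StatCounter do the count/mean/stdev bookkeeping
check = sc.parallelize(range(1000000)).sample(True, 0.1525).stats()
print("count=%d mean=%.3f stdev=%.3f" % (check.count(), check.mean(), check.sampleStdev()))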

In [4]:
# Quick example of word counts by line

# lines = sc.textFile("file:///tmp/sampleFile.txt")
lines = sc.parallelize([
    "This is line one",
    "This is line two",
    "Last line."
])

def words(line_index):
    # zipWithIndex() yields (line, index) pairs; key the word list by the line index
    line, index = line_index
    return (index, line.split(" "))

lines.zipWithIndex().map(words).map(
    lambda kv: (kv[0], len(kv[1]))   # (line index, word count)
).reduceByKey(
    lambda x, y: x + y
).collect()


Out[4]:
[(0, 4), (1, 4), (2, 2)]
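
Since each line index appears only once, the reduceByKey above is effectively a pass-through; the classic whole-corpus word count is a small variation on the same pipeline, assuming `lines` from the cell above is still defined. A sketch (left unexecuted):

In [ ]:
# Word-count sketch: split every line into words, pair each word with 1,
# and sum the counts per word across all lines
lines.flatMap(lambda line: line.split(" ")) \
     .map(lambda word: (word, 1)) \
     .reduceByKey(lambda x, y: x + y) \
     .collect()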