In [1]:
import pyspark
sc = pyspark.SparkContext('local[*]')
In [76]:
# do something to prove it works
# For the DISCRETE uniform distribution on the integer range a..b,
# Mean = (a+b)/2, Variance = ((b - a + 1)**2 - 1) / 12
def discrete_uniform_experiment():
    from math import sqrt
    NUM = 1000000
    SAMPLE = 0.1525
    a, b = 0, NUM - 1
    mean = (a + b) / 2
    var = ((b - a + 1)**2 - 1) / 12
    stdev = sqrt(var)
    print("Discrete Uniform across %d-%d" % (a, b))
    print(" Sampling at %.3f%% (with replacement)" % (SAMPLE * 100,))
    print(" Analytic mean(stdev): %.3f(%.3f)" % (mean, stdev))
    # We find an experimental mean and stdev by taking a sample from the
    # entire distribution WITH REPLACEMENT. We cache() the sampled RDD
    # because we run several actions over it (count, sum, and the map/sum
    # below) and don't want it recomputed each time.
    rdd = sc.parallelize(range(NUM)).sample(True, SAMPLE).cache()
    exp_count = rdd.count()
    exp_mean = rdd.sum() / exp_count
    exp_devs = rdd.map(lambda x: (x - exp_mean) ** 2)
    exp_stdev = sqrt(exp_devs.sum() / (exp_count - 1))  # sample (n-1) stdev
    print(" Experimental mean(stdev): %.3f(%.3f)" % (exp_mean, exp_stdev))
discrete_uniform_experiment()
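For comparison, the experimental moments can also be read off Spark's built-in StatCounter in a single pass over the sample, rather than computing them by hand. The sketch below is not part of the experiment above; the names sample_rdd and st are illustrative, and it reuses the same NUM and SAMPLE values.
In [ ]:
# Sketch (illustrative, not from the original cell): rdd.stats() returns a
# StatCounter with count, mean, stdev (population) and sampleStdev computed
# in one pass over the RDD.
sample_rdd = sc.parallelize(range(1000000)).sample(True, 0.1525)
st = sample_rdd.stats()
print("count=%d mean=%.3f sampleStdev=%.3f"
      % (st.count(), st.mean(), st.sampleStdev()))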
In [4]:
# Quick example: count the words in each line, keyed by line index
# lines = sc.textFile("file:///tmp/sampleFile.txt")
lines = sc.parallelize([
"This is line one",
"This is line two",
"Last line."
])
def words(line_index):
    # zipWithIndex() yields (line, index) pairs; re-key by the line index
    # and split the line into its words.
    line, index = line_index
    return (index, line.split(" "))
lines.zipWithIndex().map(words).map(
    lambda kv: (kv[0], len(kv[1]))  # (line index, word count)
).reduceByKey(
    lambda x, y: x + y
).collect()
Out[4]:
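The cell above counts how many words each line contains, keyed by the line's index. A closely related pattern is the classic whole-corpus word count: flatMap the lines into individual words, pair each word with 1, and reduceByKey to sum the counts per distinct word. The sketch below is illustrative, not part of the original cells, and assumes the lines RDD from the previous cell.
In [ ]:
# Sketch: occurrences of each distinct word across all lines.
(lines.flatMap(lambda line: line.split(" "))  # split every line into words
      .map(lambda word: (word, 1))            # pair each word with a count of 1
      .reduceByKey(lambda x, y: x + y)        # sum the counts per word
      .collect())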