In [2]:
from pyspark.mllib.linalg import Vectors
from pyspark.mllib import stat
In [3]:
# Load the CSV as an RDD of text lines. `sc` is not defined in this section;
# presumably it is the SparkContext provided by the PySpark session -- confirm.
numbers = sc.textFile("../data/random_points.csv")

# Parse every comma-separated line into an MLlib dense vector in a single pass.
# A list comprehension replaces map(float, ...): on Python 3 map() returns a lazy
# iterator rather than a list, which Vectors.dense cannot convert to a float
# array, while a list behaves the same on Python 2 and 3.
num_vectors = numbers.map(
    lambda line: Vectors.dense([float(field) for field in line.split(",")])
)

# Peek at the first two parsed vectors as a quick sanity check.
num_vectors.take(2)
Out[3]:
In [4]:
# Compute the column summary statistics for the RDD of vectors.
stats = stat.Statistics.colStats(num_vectors)

# print(...) with one parenthesised argument produces the same output on
# Python 2 and Python 3; the bare print statement is a syntax error on Python 3.
print("Mean: {}".format(stats.mean()))
print("Variance: {}".format(stats.variance()))
In [5]:
# Display the minimum reported by the colStats summary (one value per column).
stats.min()
Out[5]:
In [6]:
# Display the count reported by the colStats summary (number of vectors summarised).
stats.count()
Out[6]: