In [2]:
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.stat import Statistics

In [3]:
# Parse each CSV line into a DenseVector of floats
numbers = sc.textFile("../data/random_points.csv")
num_vectors = numbers.map(lambda row: row.split(","))\
    .map(lambda row: Vectors.dense([float(x) for x in row]))
num_vectors.take(2)


Out[3]:
[DenseVector([67.3736, 173.5864]), DenseVector([80.7324, 83.4922])]

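If the source file could contain blank or malformed lines, it is worth filtering those out before the numeric conversion. A minimal defensive sketch, assuming every valid row has exactly two comma-separated fields:

# Hypothetical defensive variant: keep only rows that split into
# exactly two fields, then convert as before
clean_vectors = numbers.filter(lambda row: len(row.split(",")) == 2)\
    .map(lambda row: Vectors.dense([float(x) for x in row.split(",")]))
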
In [4]:
# colStats returns a MultivariateStatisticalSummary computed in one pass
stats = Statistics.colStats(num_vectors)
print("Mean: {}".format(stats.mean()))
print("Variance: {}".format(stats.variance()))


Mean: [ 103.34258003   98.03848728]
Variance: [ 3276.02491363  3323.28180672]

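Since the RDD holds only 1,000 points (see the count below), the summary can be cross-checked locally with NumPy on the collected data. colStats reports the unbiased sample variance, so ddof=1 matches:

import numpy as np

local = np.array(num_vectors.collect())  # small enough to collect locally
print("NumPy mean: {}".format(local.mean(axis=0)))
print("NumPy variance: {}".format(local.var(axis=0, ddof=1)))
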
In [5]:
stats.min()


Out[5]:
array([ 0.10170301,  0.6833904 ])

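min() is only one of the column-wise summaries exposed by the same object; max(), numNonzeros(), and (on Spark 1.2+) the L1/L2 column norms are available too:

stats.max()          # column-wise maxima
stats.numNonzeros()  # non-zero entries per column
stats.normL1()       # column-wise L1 norm
stats.normL2()       # column-wise L2 norm
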
In [6]:
stats.count()


Out[6]:
1000L
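
Beyond per-column summaries, Statistics can also compute the correlation matrix between the columns of the same RDD; "pearson" is the default method and "spearman" is also supported:

# Correlation matrix between the two coordinate columns
Statistics.corr(num_vectors, method="pearson")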