In [20]:
from pyspark.mllib.stat import Statistics
import math

In [12]:
# HDFS URI of the log sample to analyze (presumably the "tbird" 500k-line
# Thunderbird supercomputer log subset — confirm against the data catalog).
# NOTE(review): hardcoded cluster path; consider parameterizing for portability.
logs = 'hdfs://namenode/magichour/tbird500k'

In [13]:
def rdd_LogLine(line):
    """Map one raw log line to a two-element feature vector for colStats.

    Args:
        line: a single log line as a string.

    Returns:
        [character_count, word_count] where word_count is the number of
        whitespace-separated tokens in the line.
    """
    # len() already returns an int — the original int(...) casts were redundant.
    return [len(line), len(line.split())]

# Load the raw log file from HDFS as an RDD of lines. `sc` is the
# SparkContext supplied by the notebook kernel (e.g. pyspark shell) —
# it is not defined anywhere in this file.
sparkLogFile = sc.textFile(logs)
# Transform each line into its [char_count, word_count] feature vector.
out = sparkLogFile.map(rdd_LogLine)
# Column-wise summary statistics (mean/variance/max/min/count) over all lines.
stats = Statistics.colStats(out)

In [16]:
# Report the column-wise summary statistics computed above.
# Fixed: the original used Python 2 print statements, which are syntax
# errors on Python 3 (required by all modern PySpark releases).
print('mean    ', stats.mean())
print('variance', stats.variance())
print('max     ', stats.max())
print('min     ', stats.min())
print('count   ', stats.count())