In [20]:
from pyspark.mllib.stat import Statistics
import math
In [12]:
logs = 'hdfs://namenode/magichour/tbird500k'
In [13]:
def rdd_LogLine(line):
    """Map one raw log line to a two-element feature vector.

    Parameters
    ----------
    line : str
        A single line of the log file.

    Returns
    -------
    list
        ``[char_count, word_count]`` — the line's length in characters
        and its number of whitespace-separated tokens.
    """
    # len() already returns an int; the original int() wrappers were redundant.
    return [len(line), len(line.split())]
# Build the analysis pipeline:
#  1. read the log file as an RDD of lines (one element per line),
#  2. map every line to its [char_count, word_count] pair,
#  3. compute column-wise summary statistics over all lines.
# NOTE(review): relies on `sc` (SparkContext) being provided by the
# notebook/Spark runtime — it is not defined anywhere in this file.
sparkLogFile = sc.textFile(logs)
out = sparkLogFile.map(rdd_LogLine)
stats = Statistics.colStats(out)
In [16]:
print 'mean ',stats.mean()
print 'variance',stats.variance()
print 'max ',stats.max()
print 'min ',stats.min()
print 'count ',stats.count()