In [20]:
from pyspark.mllib.stat import Statistics
import math
In [12]:
logs = 'hdfs://namenode/magichour/tbird500k'
In [13]:
def rdd_LogLine(line):
    """Map one raw log line to a two-element feature vector.

    Parameters
    ----------
    line : str
        A single line of the log file.

    Returns
    -------
    list
        ``[char_count, word_count]`` — the line's length in characters
        and its number of whitespace-separated tokens.
    """
    # len() already returns an int; the original int() wrappers were redundant.
    return [len(line), len(line.split())]
# Build the analysis pipeline:
#  1. read the log file as an RDD of lines (one element per line),
#  2. map every line to its [char_count, word_count] pair,
#  3. compute column-wise summary statistics over all lines.
# NOTE(review): relies on `sc` (SparkContext) being provided by the
# notebook/Spark runtime — it is not defined anywhere in this file.
sparkLogFile = sc.textFile(logs)
out = sparkLogFile.map(rdd_LogLine)
stats = Statistics.colStats(out)
In [16]:
print 'mean ',stats.mean()
print 'variance',stats.variance()
print 'max ',stats.max()
print 'min ',stats.min()
print 'count ',stats.count()