In [ ]:
from pyspark import SparkContext
# sc = SparkContext()  # not needed here: the notebook kernel already provides sc
# Load the text file as an RDD of lines, using a single partition.
lines = sc.textFile("/home/ubuntu/sparkipynb/sparknotebook/dataFile/sherlock.txt", 1)

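If this notebook is run outside a preconfigured PySpark kernel, sc must be created explicitly before the cell above. A minimal sketch, assuming a local master; the app name "WordCount" is illustrative:

In [ ]:
from pyspark import SparkContext
# Explicit context for standalone use; "local[*]" uses all local cores.
sc = SparkContext("local[*]", "WordCount")
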
In [ ]:
# Classic word count: emit a (word, 1) pair per space-separated token, then sum the counts per word.
frequencies = lines.flatMap(lambda x: x.split(' ')).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)

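Splitting on a single space counts punctuation-attached tokens such as "word." and "word" as different keys, and runs of spaces produce empty-string tokens. A sketch of a cleaned-up variant, assuming lowercasing and a simple regex tokenizer are acceptable for this text:

In [ ]:
import re

# Hypothetical refinement: lowercase each line and keep only letter runs,
# so "Holmes," and "holmes" collapse into one key and empty tokens vanish.
clean_frequencies = (lines.flatMap(lambda x: re.findall(r"[a-z']+", x.lower()))
                          .map(lambda w: (w, 1))
                          .reduceByKey(lambda x, y: x + y))
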
In [ ]:
# Materialize every (word, count) pair on the driver.
frequencies.collect()

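collect() ships the entire RDD back to the driver, which can exhaust driver memory on large inputs. For a quick look at the data, take() returns only a bounded prefix; a sketch:

In [ ]:
# Safer inspection: fetch only the first 10 pairs instead of the whole RDD.
frequencies.take(10)
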
In [ ]:
# Number of distinct words (reduceByKey leaves one pair per key).
frequencies.count()

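The RDD can also be filtered before counting. A sketch counting words that occur more than once; the threshold is illustrative:

In [ ]:
# How many distinct words appear at least twice?
frequencies.filter(lambda kv: kv[1] > 1).count()
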
In [ ]:
# Swap to (count, word), sort by count descending, take the top 20.
# (The original tuple-unpacking lambda, lambda (k,v): ..., is a syntax error in Python 3.)
output = frequencies.map(lambda kv: (kv[1], kv[0])).sortByKey(False).take(20)

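sortByKey(False) sorts the full RDD even though take(20) keeps only twenty pairs. takeOrdered can select the top twenty directly; a sketch, noting it returns (word, count) pairs rather than the swapped (count, word) form used above:

In [ ]:
# Top 20 without a global sort: order by descending count.
top20 = frequencies.takeOrdered(20, key=lambda kv: -kv[1])
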
In [ ]:
for (count, word) in output:
    print("%i: %s" % (count, word))  # Python 3: print is a function, not a statement
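
For reference, the whole pipeline as a standalone Python 3 sketch; the input path is the one used above, and the app name is illustrative:

In [ ]:
from pyspark import SparkContext

sc = SparkContext("local[*]", "WordCount")  # explicit context outside the notebook
lines = sc.textFile("/home/ubuntu/sparkipynb/sparknotebook/dataFile/sherlock.txt", 1)
counts = (lines.flatMap(lambda x: x.split(' '))
               .map(lambda w: (w, 1))
               .reduceByKey(lambda x, y: x + y))
for count, word in counts.map(lambda kv: (kv[1], kv[0])).sortByKey(False).take(20):
    print("%i: %s" % (count, word))
sc.stop()  # release the context when done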