In [38]:
import findspark
findspark.init()
import sys
from pyspark.sql import SparkSession, functions, types
spark = SparkSession.builder.appName('reddit relative scores').getOrCreate()
assert sys.version_info >= (3, 4) # make sure we have Python 3.4+
assert spark.version >= '2.1' # make sure we have Spark 2.1+
import string, re
wordbreak = r'[%s\s]+' % (re.escape(string.punctuation),) # regex that matches spaces and/or punctuation
def main():
    # Paths would normally come from sys.argv when run with spark-submit;
    # here they are hard-coded for the notebook.
    # in_directory = sys.argv[1]
    # out_directory = sys.argv[2]
    in_directory = "wordcount-1"
    out_directory = "output"

    # Read each input line as a single string column named "value".
    words = spark.read.text(in_directory)
    words.show()

    # Lowercase, split on punctuation/whitespace, and explode into one word per row.
    separated = words.select(
        functions.explode(functions.split(functions.lower(words["value"]), wordbreak)).alias("word"))
    # separated.show()

    # Count occurrences of each word.
    word_count = separated.groupby("word").agg(functions.count("word").alias("count"))
    # word_count.show()

    # Sort by descending count (ties broken alphabetically) and drop empty strings.
    sorted_word_count = word_count.orderBy(functions.desc("count"), "word").filter(word_count["word"] != "")
    sorted_word_count.write.csv(out_directory, mode="overwrite")


if __name__ == '__main__':
    main()
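
As a quick sanity check (not part of the original cell), the CSV written by main() can be read back with the same spark session. The schema below (a string "word" and a long "count") is an assumption matching the columns written above, and the "output" path matches the hard-coded out_directory.

In [ ]:
# Hypothetical verification cell: read the word counts back and preview the top rows.
# Assumes main() has already written its CSV files under "output".
schema = types.StructType([
    types.StructField("word", types.StringType()),
    types.StructField("count", types.LongType()),
])
counts = spark.read.csv("output", schema=schema)
counts.orderBy(functions.desc("count")).show(10)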