In [28]:
import findspark
findspark.init()

import sys
from pyspark.sql import SparkSession, functions, types

# Shared session used by main(); getOrCreate() reuses an already-running one.
spark = SparkSession.builder.appName('reddit averages').getOrCreate()

assert sys.version_info >= (3, 4)  # make sure we have Python 3.4+
# Compare (major, minor) numerically: as plain strings, '10.0' would sort
# before '2.1' and wrongly fail the check.
assert tuple(int(part) for part in spark.version.split('.')[:2]) >= (2, 1)  # make sure we have Spark 2.1+


# Explicit schema for the comment JSON. Only the fields declared here are
# part of the DataFrame; the fields listed underneath are left undeclared so
# they won't be read.
schema = types.StructType([
    types.StructField(name='score', dataType=types.LongType(), nullable=False),
    types.StructField(name='subreddit', dataType=types.StringType(), nullable=False),
])
# Undeclared fields (declare a non-nullable StructField above to load one,
# unless noted otherwise):
#   BooleanType: archived, score_hidden
#   LongType:    controversiality, downs, gilded, retrieved_on, ups
#   StringType:  author, author_flair_css_class, author_flair_text, body,
#                created_utc, distinguished, edited, id, link_id, name,
#                parent_id (nullable), subreddit_id


def main(in_directory=None, out_directory=None):
    """Average the comment score per subreddit and write the result twice.

    The averages are written as CSV to ``<out_directory>-subreddit`` (sorted
    by subreddit name) and ``<out_directory>-score`` (sorted by average
    score, highest first). Existing output is overwritten. The input sample
    and both result tables are also printed with ``show()``.

    Args:
        in_directory: Path of the JSON comment data. When omitted, taken from
            the first command-line argument, falling back to ``"reddit-2"``.
        out_directory: Prefix for the two output paths. When omitted, taken
            from the second command-line argument, falling back to
            ``"output"``.
    """
    cli_args = sys.argv[1:]
    # Use argv only when it holds exactly two positional paths. A leading
    # option flag means the arguments are not ours -- presumably a notebook
    # kernel's own launch options (TODO confirm) -- so the defaults apply.
    paths_on_cli = len(cli_args) == 2 and not cli_args[0].startswith('-')
    if in_directory is None:
        in_directory = cli_args[0] if paths_on_cli else "reddit-2"
    if out_directory is None:
        out_directory = cli_args[1] if paths_on_cli else "output"

    comments = spark.read.json(in_directory, schema=schema)
    comments.show()

    # One row per subreddit with its mean score. Cached because four actions
    # below (two shows, two writes) would otherwise each recompute it from
    # the input files.
    grouped = comments.groupby("subreddit").agg(functions.avg("score").alias("avg(score)"))
    grouped = grouped.cache()

    averages_by_subreddit = grouped.sort("subreddit")
    averages_by_score = grouped.sort(functions.desc("avg(score)"))
    averages_by_subreddit.show()
    averages_by_score.show()

    averages_by_subreddit.write.csv(out_directory + '-subreddit', mode='overwrite')
    averages_by_score.write.csv(out_directory + '-score', mode='overwrite')

# Run the job only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


+-----+-------------------+
|score|          subreddit|
+-----+-------------------+
|    3|MechanicalKeyboards|
|    1|MechanicalKeyboards|
|    1|MechanicalKeyboards|
|    1|MechanicalKeyboards|
|    1|MechanicalKeyboards|
|    2|MechanicalKeyboards|
|    1|            Cameras|
|    6|MechanicalKeyboards|
|    1|            surfing|
|    1|MechanicalKeyboards|
|    1|MechanicalKeyboards|
|    0|MechanicalKeyboards|
|    6|MechanicalKeyboards|
|    0|MechanicalKeyboards|
|    1|            surfing|
|    2|MechanicalKeyboards|
|    1|MechanicalKeyboards|
|    1|            surfing|
|   34|MechanicalKeyboards|
|    6|MechanicalKeyboards|
+-----+-------------------+
only showing top 20 rows

+-------------------+------------------+
|          subreddit|        avg(score)|
+-------------------+------------------+
|            Cameras|1.4290993071593534|
|          Genealogy| 1.827992898207643|
|MechanicalKeyboards|2.2637818544562363|
|          optometry| 1.850418827283606|
|              scala|2.1949233716475094|
|            surfing|2.4909639027147543|
|               xkcd|6.4502510693695365|
|              zelda|3.6281154427531903|
+-------------------+------------------+

+-------------------+------------------+
|          subreddit|        avg(score)|
+-------------------+------------------+
|               xkcd|6.4502510693695365|
|              zelda|3.6281154427531903|
|            surfing|2.4909639027147543|
|MechanicalKeyboards|2.2637818544562363|
|              scala|2.1949233716475094|
|          optometry| 1.850418827283606|
|          Genealogy| 1.827992898207643|
|            Cameras|1.4290993071593534|
+-------------------+------------------+