In [2]:
import findspark
findspark.init()

import sys
from pyspark.sql import SparkSession, functions, types

# Create (or reuse) the Spark session shared by the rest of this file.
spark = SparkSession.builder.appName('first Spark app').getOrCreate()

assert sys.version_info >= (3, 4) # make sure we have Python 3.4+
# Compare (major, minor) numerically: a plain string comparison is
# lexicographic, so e.g. '10.0.0' >= '2.1' would be False.
assert tuple(int(part) for part in spark.version.split('.')[:2]) >= (2, 1) # make sure we have Spark 2.1+


# Explicit schema for the input records: an integer 'id' followed by the
# float columns 'x', 'y' and 'z', every field declared with nullable=False.
_id_field = types.StructField('id', types.IntegerType(), False)
_coordinate_fields = [
    types.StructField(axis, types.FloatType(), False)
    for axis in ('x', 'y', 'z')
]
schema = types.StructType([_id_field] + _coordinate_fields)


def main(in_directory="xyz-1", out_directory="output"):
    """Aggregate the xyz data by ``id % 10`` and write the result as CSV.

    Reads JSON records from ``in_directory`` using the module-level
    ``schema``, groups them into bins by ``id % 10`` and, for every bin,
    computes the sum of ``x``, the average of ``y`` and the row count.
    The result is sorted by bin and written to ``out_directory``.

    Args:
        in_directory: Path of the directory holding the JSON input files.
        out_directory: Path of the directory the CSV output is written to;
            existing output there is overwritten (``mode='overwrite'``).
    """
    # Read the data from the JSON files
    xyz = spark.read.json(in_directory, schema=schema)
    # Print a preview of the input rows.
    xyz.show()

    # Create a DF with what we need: x, y, and id%10 which we'll aggregate by.
    with_bins = xyz.select(
        xyz['x'],
        xyz['y'],
        (xyz['id'] % 10).alias('bin'),
    )

    # Aggregate by the bin number. Both aggregated columns are referenced
    # through with_bins, the DataFrame that is actually being grouped.
    grouped = with_bins.groupBy(with_bins['bin'])
    groups = grouped.agg(
        functions.sum(with_bins['x']),
        functions.avg(with_bins['y']),
        functions.count('*'),
    )
    # We know groups has <=10 rows, so it can safely be moved into two partitions.
    groups = groups.sort(groups['bin']).coalesce(2)

    groups.write.csv(out_directory, compression=None, mode='overwrite')


# Run the job only when this file is the entry point, not when it is imported.
if __name__ == "__main__":
    main()


+---+-----+-----+------+
| id|    x|    y|     z|
+---+-----+-----+------+
|  2|84.46| 6.69|254.14|
|  6|40.48|29.49|122.14|
| 10|58.72| 0.77|176.91|
| 14|88.65|11.28| 266.8|
| 18|74.21|46.04| 223.5|
| 22| 91.8|31.11|276.46|
| 26|52.55| 5.15|159.05|
| 30|74.76|33.39|224.85|
| 34|41.58| 3.37|125.34|
| 38|34.43|26.08|104.57|
| 42|33.58|44.48|101.43|
| 46|56.65|16.08|171.43|
| 50|53.59|14.41|161.49|
| 54|19.82| 23.0| 60.24|
| 58|55.53| 8.66|167.34|
| 62|24.45| 0.01| 74.59|
| 66|16.43| 37.5| 50.29|
| 70|39.49| 27.3|119.71|
| 74|74.77|42.21|225.21|
| 78|46.01|15.77|138.61|
+---+-----+-----+------+
only showing top 20 rows

200