Spark SQL Operations on the SF Crime Dataset


In [ ]:
import csv
import pyspark
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from io import StringIO  # Python 3; `from StringIO import StringIO` is Python 2 only
from datetime import *
from dateutil.parser import parse

Initialize the Spark and SQL contexts and load the input file:


In [ ]:
sc = pyspark.SparkContext('local[*]')
sqlContext = SQLContext(sc)
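
On Spark 2.0+, a SparkSession is the usual single entry point; a minimal alternative to the cell above (not to be run alongside it, since only one SparkContext may be active at a time):


In [ ]:
from pyspark.sql import SparkSession

# One object replaces the SparkContext/SQLContext pair; spark.read and
# spark.sql cover everything this notebook needs
spark = SparkSession.builder.master('local[*]').appName('sf-crime').getOrCreate()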

In [ ]:
crime_df = sqlContext.read.parquet("../../data/sf-crime/train.parquet")

crime_df.printSchema()
# Expose the DataFrame to SQL queries; createOrReplaceTempView is the
# current spelling of the deprecated Spark 1.x registerTempTable
crime_df.createOrReplaceTempView("crime")
crime_df.count()
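
A quick look at a few rows confirms the load; Category and PdDistrict are the two columns used in the queries below:


In [ ]:
# Peek at the first rows without truncating long strings
crime_df.select('Category', 'PdDistrict').show(5, truncate=False)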

In [ ]:
# DataFrame fashion
crime_df.groupBy('PdDistrict').count().orderBy('count', ascending=False).show()
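
The DataFrame API composes naturally, e.g. filtering before aggregating. A sketch; 'LARCENY/THEFT' is assumed to be one of the values in Category:


In [ ]:
# Count per district for a single (assumed) category value
(crime_df
 .filter(crime_df.Category == 'LARCENY/THEFT')
 .groupBy('PdDistrict')
 .count()
 .orderBy('count', ascending=False)
 .show())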

In [ ]:
# SQL fashion: the same kind of aggregation, run against the "crime" temp view
sqlContext.sql("SELECT Category, count(*) as count FROM crime GROUP BY Category ORDER BY count DESC").show()
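
For comparison, the same aggregation in DataFrame style produces an identical result:


In [ ]:
# DataFrame equivalent of the SQL query above
crime_df.groupBy('Category').count().orderBy('count', ascending=False).show()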