Ryft: Python examples for DataFrames

To run these examples you should have SPARK_CLASSPATH environment variable point to spark-ryft-connector jar.

Prepare SparkContext, SQLContext and Schema



In [ ]:

    
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, BooleanType, DoubleType, ByteType

sparkConf = SparkConf().setAppName("DataFrameExample").setMaster("local[2]")

sc = SparkContext(conf=sparkConf)
sqlContext = SQLContext(sc)

schema = StructType([
    StructField("Arrest", BooleanType()),
    StructField("Beat", StringType()),
    StructField("Block", StringType()),
    StructField("CaseNumber", IntegerType()),
    StructField("CommunityArea", IntegerType()),
    StructField("Date", StringType()),
    StructField("Description", StringType()),
    StructField("District", IntegerType()),
    StructField("Domestic", BooleanType()),
    StructField("FBICode", IntegerType()),
    StructField("ID", StringType()),
    StructField("IUCR", IntegerType()),
    StructField("Latitude", DoubleType()),
    StructField("Location", StringType()),
    StructField("LocationDescription", StringType()),
    StructField("Longitude", DoubleType()),
    StructField("PrimaryType", StringType()),
    StructField("UpdatedOn", StringType()),
    StructField("Ward", IntegerType()),
    StructField("XCoordinate", IntegerType()),
    StructField("YCoordinate", IntegerType()),
    StructField("Year", IntegerType()),
    StructField("_index", StructType([
        StructField("file", StringType()),
        StructField("offset", StringType()),
        StructField("length", IntegerType()),
        StructField("fuzziness", ByteType())
    ]))
])

df = sqlContext.read.format("com.ryft.spark.connector.sql")\
               .schema(schema)\
               .option("files", "*.pcrime")\
               .load()
df.registerTempTable("temp_table")

print "SparkContext and SQLContext are created"

Example #1



In [ ]:

    
result = sqlContext.sql("select Date, ID, Description, Arrest from temp_table\
       where Description LIKE '%VEHICLE%'\
          AND (Date LIKE '%04/15/2015%' OR Date LIKE '%04/14/2015%' OR Date LIKE '%04/13/2015%')\
          AND Arrest = true\
       ORDER BY Date")

result.show(truncate=False)

Example #2



In [ ]:

    
result = sqlContext.sql("select Date, ID, Description from temp_table\
       where Description not LIKE '%VEHICLE%'\
          AND (Date LIKE '%04/13/2015 01%')\
          AND Arrest = true\
       ORDER BY Date")

result.show(truncate=False)

Stop active SparkContext



In [ ]:

    
sc.stop()
print "SparkContext is stopped. Now you can create a new one"



In [ ]: