In [1]:
import urllib

# AWS credentials and target bucket for the exported CSV data. The
# secret key is URL-encoded because it may contain characters (such
# as '/') that are not valid inside an s3n:// URL. (urllib.quote_plus
# is the Python 2 name; Python 3 moved it to urllib.parse.quote_plus.)
access_key = 'YOUR_AWS_ACCESS_KEY'
secret_key = urllib.quote_plus('YOUR_AWS_SECRET_KEY')
bucket_name = 'YOUR_BUCKET_NAME'

awsBucket = 's3n://' + access_key + ':' + secret_key + '@' + bucket_name + '/output'

# Load the three CSV exports from S3 and register each as a temp table
# so it can be queried with SQL as well as the DataFrame API.
awsFilepath = awsBucket + '/function_usage_data'
functionUsageDF = sqlContext.read \
    .format('com.databricks.spark.csv') \
    .options(header='true', inferSchema='true') \
    .load(awsFilepath)
functionUsageDF.registerTempTable('function_usage')

awsFilepath = awsBucket + '/function_time_data'
functionTimeDF = sqlContext.read \
    .format('com.databricks.spark.csv') \
    .options(header='true', inferSchema='true') \
    .load(awsFilepath)
functionTimeDF.registerTempTable('function_time')

awsFilepath = awsBucket + '/function_option_data'
funOptDF = sqlContext.read \
    .format('com.databricks.spark.csv') \
    .options(header='true', inferSchema='true') \
    .load(awsFilepath)
funOptDF.registerTempTable('function_opt')
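
Since the three loads differ only in the path suffix and table name, the same pattern could be factored into a small helper. A minimal sketch, assuming every dataset uses identical reader settings (the helper name is hypothetical):

def load_and_register(path_suffix, table_name):
    # Hypothetical helper: read a CSV directory from the bucket and
    # register it as a temp table so it is queryable from SQL too.
    df = sqlContext.read \
        .format('com.databricks.spark.csv') \
        .options(header='true', inferSchema='true') \
        .load(awsBucket + '/' + path_suffix)
    df.registerTempTable(table_name)
    return df

functionUsageDF = load_and_register('function_usage_data', 'function_usage')
# ... and likewise for function_time_data and function_option_data.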

In [2]:
# Show available intervals (use the interval_id to select intervals on queries)
display(sqlContext.sql("SELECT DISTINCT interval_id, `interval` FROM function_time ORDER BY interval_id"))

In [3]:
display(functionUsageDF)

In [4]:
display(functionTimeDF.orderBy(["timeframe_id","function", "interval"]))

In [5]:
# Aggregate counts and CPU time per group and timeframe, then report
# CPU time in hours. (Sorting before a groupBy has no effect on the
# result, so the ordering is applied once, after the aggregation.)
df2 = functionTimeDF \
        .groupBy(["group", "timeframe", "timeframe_id"]) \
        .agg({"count": "sum", "cpu_time": "sum"}) \
        .withColumnRenamed('sum(cpu_time)', 'cpu_time') \
        .withColumnRenamed('sum(count)', 'count')
display(df2
        .withColumn("cpu_time_hours", df2.cpu_time / 3600)
        .orderBy(["group", "timeframe_id"])
        .drop("timeframe_id")
       )
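
The same aggregation can be expressed in SQL against the registered temp table; a minimal sketch (the backticks are needed because `group` is a SQL keyword and `count` collides with the function name):

display(sqlContext.sql("""
    SELECT `group`, timeframe,
           SUM(`count`) AS `count`,
           SUM(cpu_time) / 3600 AS cpu_time_hours
    FROM function_time
    GROUP BY `group`, timeframe, timeframe_id
    ORDER BY `group`, timeframe_id
"""))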

In [6]:
display(funOptDF)

In [7]:
# Show up to 20 option records for a single function within interval 1.
display(funOptDF
        .where(funOptDF.function_name == '--list-var-geno')
        .where(funOptDF.interval_id == 1)
        .limit(20))

In [8]: