In [1]:
import urllib

# The AWS credentials are embedded directly in the s3n URL, so the secret key
# must be URL-encoded: secret keys can contain characters such as '/' that
# would otherwise break the URL. (This is Python 2's urllib.quote_plus; on
# Python 3 it would be urllib.parse.quote_plus.)
access_key = 'YOUR_AWS_ACCESS_KEY'
secret_key = urllib.quote_plus('YOUR_AWS_SECRET_KEY')
bucket_name = 'YOUR_BUCKET_NAME'
awsBucket = 's3n://' + access_key + ':' + secret_key + '@' + bucket_name + '/output'
awsFilepath = awsBucket + '/function_usage_data'
# Load each CSV dataset from S3 and register it as a temp table so it can be
# queried with SQL below. inferSchema is a reader option, so it is grouped
# with the other options rather than passed to load().
functionUsageDF = sqlContext.read \
    .format('com.databricks.spark.csv') \
    .options(header='true', inferSchema='true') \
    .load(awsFilepath)
functionUsageDF.registerTempTable('function_usage')

awsFilepath = awsBucket + '/function_time_data'
functionTimeDF = sqlContext.read \
    .format('com.databricks.spark.csv') \
    .options(header='true', inferSchema='true') \
    .load(awsFilepath)
functionTimeDF.registerTempTable('function_time')

awsFilepath = awsBucket + '/function_option_data'
funOptDF = sqlContext.read \
    .format('com.databricks.spark.csv') \
    .options(header='true', inferSchema='true') \
    .load(awsFilepath)
funOptDF.registerTempTable('function_opt')
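
# The three loads above repeat one pattern; a small helper (just a sketch,
# assuming the same sqlContext and bucket layout) would keep them in sync:
def load_and_register(dataset, table):
    # Read a headered CSV from the bucket's output folder and expose it to SQL.
    df = sqlContext.read \
        .format('com.databricks.spark.csv') \
        .options(header='true', inferSchema='true') \
        .load(awsBucket + '/' + dataset)
    df.registerTempTable(table)
    return df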
In [2]:
# Show available intervals (use the interval_id to select intervals on queries)
display(sqlContext.sql("SELECT DISTINCT interval_id, `interval` FROM function_time ORDER BY interval_id"))
In [3]:
display(functionUsageDF)
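# The schema was inferred from the CSV, so it is worth confirming the column
# types before relying on them in aggregations:
functionUsageDF.printSchema()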
In [4]:
display(functionTimeDF.orderBy(["timeframe_id", "function", "interval"]))
In [5]:
# Aggregate call counts and CPU time per group and timeframe. Ordering is
# applied after the aggregation, since groupBy does not preserve row order
# (the original pre-aggregation orderBy had no effect).
df2 = functionTimeDF \
    .groupBy(["group", "timeframe", "timeframe_id"]) \
    .agg({"count": "sum", "cpu_time": "sum"}) \
    .withColumnRenamed('sum(cpu_time)', 'cpu_time') \
    .withColumnRenamed('sum(count)', 'count')
display(df2
    .withColumn("cpu_time_hours", df2.cpu_time / 3600)  # seconds -> hours
    .orderBy(["group", "timeframe_id"])
    .drop("timeframe_id"))
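
# An equivalent aggregation with explicit functions and aliases avoids the
# withColumnRenamed step (a sketch; df2_alt is just an illustrative name):
from pyspark.sql import functions as F
df2_alt = functionTimeDF \
    .groupBy("group", "timeframe", "timeframe_id") \
    .agg(F.sum("cpu_time").alias("cpu_time"), F.sum("count").alias("count"))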
In [6]:
display(funOptDF)
In [7]:
display(funOptDF.where(funOptDF.function_name == '--list-var-geno').where(funOptDF.interval_id == 1).limit(20))
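# The same filter expressed in SQL against the registered temp table
# (equivalent to the DataFrame version above):
display(sqlContext.sql(
    "SELECT * FROM function_opt "
    "WHERE function_name = '--list-var-geno' AND interval_id = 1 "
    "LIMIT 20"))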
In [8]: