In [ ]:
from pyspark.sql import SQLContext, Row
sqlContext = SQLContext(sc)
In [ ]:
sqlContext
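As an aside, the Row class imported above can also be used to build a DataFrame straight from an RDD; a minimal sketch with made-up sample data:
In [ ]:
# Hypothetical rows, purely to illustrate createDataFrame with Row objects
rowRDD = sc.parallelize([Row(name="Alice", age=30), Row(name="Bob", age=25)])
rowDF = sqlContext.createDataFrame(rowRDD)
rowDF.show()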
In [ ]:
from pyspark.sql import HiveContext, Row
sqlContext = HiveContext(sc)
In [ ]:
sqlContext
In [ ]:
jsonfile = "file:///opt/spark-1.4.1-bin-hadoop2.6/examples/src/main/resources/people.json"
df = sqlContext.read.load(jsonfile, format="json")
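In Spark 1.4 the same file can also be loaded with the JSON-specific reader; a small sketch that should produce an equivalent DataFrame:
In [ ]:
# Shorthand reader for JSON; same result as read.load(..., format="json")
df_alt = sqlContext.read.json(jsonfile)
df_alt.printSchema()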
In [ ]:
# TODO: Replace <FILL IN> with appropriate code
df.<FILL IN>
In [ ]:
# Print df's schema
df.printSchema()
In [ ]:
sqlContext.sql("SHOW TABLES").show()
In [ ]:
sqlContext.sql("SELECT * FROM pixnet_user_log_1000").printSchema()
In [ ]:
from datetime import datetime
# Time the query end to end; count() is the action that forces evaluation,
# so it must run before end_time is taken
start_time = datetime.now()
df2 = sqlContext.sql("SELECT * FROM pixnet_user_log_1000")
print(df2.count())
end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))
In [ ]:
df2.select('time').show(2)
In [ ]:
# Register this DataFrame as a temporary table using the given name
df2.registerTempTable("people")
# Create a UDF that returns the length of a string
# (length-function example from the Spark SQL programming guide)
sqlContext.registerFunction("strLenPython", lambda x: len(x))
# Create a UDF that keeps only the date part of an ISO timestamp (split on "T")
sqlContext.registerFunction("strDate", lambda x: x.split("T")[0])
# Use the UDFs in a query to produce the expected columns
results = sqlContext.sql("""
    SELECT author,
           strDate(time) AS dt,
           strLenPython(action) AS lenAct
    FROM people""")
In [ ]:
# Show the first 5 result rows
results.show(5)
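The same columns can also be computed through the DataFrame API instead of SQL; a rough sketch using pyspark.sql.functions.udf, assuming df2 has the author, time and action columns used above:
In [ ]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, IntegerType

# DataFrame-API counterparts of the two SQL UDFs registered earlier
strDateUdf = udf(lambda x: x.split("T")[0], StringType())
strLenUdf = udf(lambda x: len(x), IntegerType())

df2.select(df2.author,
           strDateUdf(df2.time).alias("dt"),
           strLenUdf(df2.action).alias("lenAct")).show(5)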
In [19]:
sqlContext.cacheTable("people")
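If you are holding the DataFrame itself rather than only the table name, an alternative sketch is to cache the DataFrame directly; normally you would pick one of the two approaches, not both:
In [ ]:
# Equivalent idea at the DataFrame level; like cacheTable, this is lazy until an action runs
df2.cache()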
In [ ]:
start_time = datetime.now()
sqlContext.sql("SELECT * FROM people").count()
end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))
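Because the first action after cacheTable also pays the cost of materializing the cache, re-running the same count (sketch below) should report a noticeably shorter duration:
In [ ]:
# Re-run the same timed count now that the table is cached in memory
start_time = datetime.now()
sqlContext.sql("SELECT * FROM people").count()
end_time = datetime.now()
print('Duration (cached): {}'.format(end_time - start_time))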
In [ ]:
sqlContext.sql("SELECT strDate(time) AS dt,\
count(distinct author) AS cnt \
FROM people \
GROUP BY strDate(time)").show(5)
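For comparison, a sketch of the same per-day distinct-author count through the DataFrame API, with the date-splitting logic re-expressed as a column UDF:
In [ ]:
from pyspark.sql.functions import udf, countDistinct
from pyspark.sql.types import StringType

# Same date-extraction logic as the strDate SQL UDF, as a DataFrame column function
dateCol = udf(lambda x: x.split("T")[0], StringType())

df2.select(dateCol(df2.time).alias("dt"), df2.author) \
   .groupBy("dt") \
   .agg(countDistinct("author").alias("cnt")) \
   .show(5)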
In [ ]:
sqlContext.uncacheTable("people")
In [ ]:
# TODO: Replace <FILL IN> with appropriate code
result = <FILL IN>
In [23]:
sc.stop()