In [ ]:
import os
# Must be set BEFORE SparkContext is created: pulls in the Cassandra connector
# package and points it at the `cassandra` host (Docker-style service name).
os.environ['PYSPARK_SUBMIT_ARGS'] = \
'--conf spark.cassandra.connection.host=cassandra --packages com.datastax.spark:spark-cassandra-connector_2.11:2.0.2 pyspark-shell'
In [ ]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql.types import *
In [ ]:
# Driver-side Spark setup; `sc` and `sql` are used by the functions below.
sc = SparkContext(appName="BigDataRiver")
sc.setLogLevel("WARN")
# Checkpoint dir is relative to the driver's working directory.
sc.setCheckpointDir('checkpoint/')
sql = SQLContext(sc)
In [ ]:
def usersWhoBoughtXAlsoBought(df):
    """Compute co-purchase counts via a self-join on user_id.

    For every ordered pair of distinct products bought by the same user,
    count how many users bought both.

    Args:
        df: DataFrame with at least (user_id, product) columns.

    Returns:
        DataFrame with columns (product, other_product, count).
    """
    purchases = df.select('user_id', 'product')
    # Same data under a different column name, so the self-join yields pairs.
    companions = purchases.toDF('user_id', 'other_product')
    paired = purchases.join(
        companions,
        companions['user_id'] == purchases['user_id'],
        'inner',
    )
    # Drop the trivial (p, p) pairs, then count occurrences of each pair.
    counted = (
        paired
        .filter("`product` != `other_product`")
        .select('product', 'other_product')
        .groupby('product', 'other_product')
        .count()
        .toDF("product", "other_product", "count")
    )
    return counted
In [ ]:
def selectTopProducts(df, top_n=5):
    """Keep, per product, the `top_n` most frequently co-purchased products.

    Args:
        df: DataFrame with columns (product, other_product, count), as
            produced by `usersWhoBoughtXAlsoBought`.
        top_n: how many co-purchased products to retain per product.
            Defaults to 5, preserving the original hard-coded behavior.

    Returns:
        DataFrame with columns (product, other_products), where
        other_products is a list of up to `top_n` product ids.
        NOTE(review): collect_list gives no ordering guarantee, so the
        list is the top-N as a set, not necessarily ranked.
    """
    # createOrReplaceTempView supersedes the deprecated registerTempTable
    # (Spark >= 2.0); same view name so any external SQL still resolves.
    df.createOrReplaceTempView("products")
    ranked = sql.sql("""
    SELECT
      *,
      ROW_NUMBER() OVER(PARTITION BY product ORDER BY count DESC) rn
    FROM products
    """)
    # Filter on the generalized cutoff instead of the hard-coded "rn <= 5".
    topProductsDf = (
        ranked
        .where(F.col("rn") <= top_n)
        .groupBy("product")
        .agg(F.collect_list("other_product").alias("other_products"))
    )
    return topProductsDf
In [ ]:
def processBatch():
    """Run one batch pass: read all user purchases from Cassandra, compute
    the top co-purchased products per product, and append results back.

    Side effects: reads table bdr.all_user_products, prints a sample via
    show(), and appends to bdr.top_other_products_batch.
    """
    cassandra_format = "org.apache.spark.sql.cassandra"
    source = sql.read.format(cassandra_format)
    # Cached because the self-join in usersWhoBoughtXAlsoBought scans it twice.
    allUserProductsDf = (
        source
        .options(table="all_user_products", keyspace="bdr")
        .load()
        .cache()
    )
    coBought = usersWhoBoughtXAlsoBought(allUserProductsDf)
    topDf = selectTopProducts(coBought)
    topDf.show()
    sink = topDf.write.format(cassandra_format).mode('append')
    sink.options(table="top_other_products_batch", keyspace="bdr").save()
In [ ]:
# Single batch run (no scheduling/streaming loop in this notebook).
processBatch()
In [ ]: