In [1]:
import pandas as pd
def rows_to_df(rows):
    return pd.DataFrame(map(lambda e: e.asDict(), rows))
from pyspark.sql import SQLContext
sqlc = SQLContext(sc)
rdd = sqlc.parquetFile("./data/movies")
rdd.registerTempTable("events")

In [2]:
summary = sqlc.sql("SELECT "
                   "entityType, event, targetEntityType, COUNT(*) AS c "
                   "FROM events "
                   "GROUP BY entityType, event, targetEntityType").collect()
rows_to_df(summary)


Out[2]:
c entityType event targetEntityType
0 768 user buy item
1 733 user rate item

In [3]:
import matplotlib.pyplot as plt
count = map(lambda e: e.c, summary)
event = map(lambda e: "%s (%d)" % (e.event, e.c), summary)
colors = ['gold', 'lightskyblue']
plt.pie(count, labels=event, colors=colors, startangle=90, autopct="%1.1f%%")
plt.axis('equal')
plt.show()

In [4]:
ratings = sqlc.sql("SELECT properties.rating AS r, COUNT(*) AS c "
                   "FROM events "
                   "WHERE properties.rating IS NOT NULL "
                   "GROUP BY properties.rating "
                   "ORDER BY r").collect()
count = map(lambda e: e.c, ratings)
rating = map(lambda e: "%s (%d)" % (e.r, e.c), ratings)
colors = ['yellowgreen', 'plum', 'gold', 'lightskyblue', 'lightcoral']
plt.pie(count, labels=rating, colors=colors, startangle=90,
        autopct="%1.1f%%")
plt.axis('equal')
plt.show()

In [ ]: