In [1]:
import pandas as pd
def rows_to_df(rows):
    """Convert an iterable of row objects into a pandas DataFrame.

    Parameters
    ----------
    rows : iterable
        Objects exposing an ``asDict()`` method that returns a mapping of
        column name to value (e.g. the result of ``.collect()`` on a Spark
        SQL query).

    Returns
    -------
    pandas.DataFrame
        One row per input element, one column per dictionary key. An empty
        input yields an empty DataFrame.
    """
    # Build a concrete list instead of handing `map(...)` to pandas: on
    # Python 3 `map` is a lazy one-shot iterator, and DataFrame construction
    # from it depends on the pandas version. A list of dicts works everywhere.
    records = [row.asDict() for row in rows]
    return pd.DataFrame(records)
from pyspark.sql import SQLContext
# Wrap the Spark context in a SQLContext so the data can be queried with SQL.
# NOTE(review): `sc` is not defined in this notebook -- presumably it is the
# SparkContext injected by the pyspark shell/kernel; confirm before running
# on a plain Python kernel.
sqlc = SQLContext(sc)
# Load the Parquet data stored under ./data/movies (path is relative to the
# kernel's working directory).
rdd = sqlc.parquetFile("./data/movies")
# Register the loaded data under the name "events" so the SQL queries in the
# following cells can refer to it with `FROM events`.
rdd.registerTempTable("events")
In [2]:
# Count events per (entityType, event, targetEntityType) combination and
# pull the aggregated rows back to the driver.
summary = sqlc.sql(
    "SELECT entityType, event, targetEntityType, COUNT(*) AS c "
    "FROM events "
    "GROUP BY entityType, event, targetEntityType"
).collect()

# Last expression of the cell: render the aggregate as a pandas table.
rows_to_df(summary)
Out[2]:
In [3]:
import matplotlib.pyplot as plt
# Pie chart of how many events of each kind were recorded, using the
# aggregated `summary` rows from the previous cell.
#
# The wedge sizes and labels are built as lists rather than with `map`: on
# Python 3 `map` returns a lazy one-shot iterator, which plt.pie cannot
# consume as wedge sizes/labels. Lists behave the same on Python 2 and 3.
count = [row.c for row in summary]
# Label each wedge as "<event name> (<count>)".
event = ["%s (%d)" % (row.event, row.c) for row in summary]
colors = ['gold', 'lightskyblue']
plt.pie(count, labels=event, colors=colors, startangle=90, autopct="%1.1f%%")
# Equal axis scaling keeps the pie circular instead of elliptical.
plt.axis('equal')
plt.show()
In [4]:
# Distribution of rating values: count the events that carry a rating,
# grouped by the rating value and ordered by it.
ratings = sqlc.sql(
    "SELECT properties.rating AS r, COUNT(*) AS c "
    "FROM events "
    "WHERE properties.rating IS NOT NULL "
    "GROUP BY properties.rating "
    "ORDER BY r"
).collect()

# Wedge sizes and labels are built as lists rather than with `map`: on
# Python 3 `map` returns a lazy one-shot iterator, which plt.pie cannot
# consume as wedge sizes/labels. Lists behave the same on Python 2 and 3.
count = [row.c for row in ratings]
# Label each wedge as "<rating> (<count>)".
rating = ["%s (%d)" % (row.r, row.c) for row in ratings]
colors = ['yellowgreen', 'plum', 'gold', 'lightskyblue', 'lightcoral']
plt.pie(count, labels=rating, colors=colors, startangle=90,
        autopct="%1.1f%%")
# Equal axis scaling keeps the pie circular instead of elliptical.
plt.axis('equal')
plt.show()
In [ ]: