The Season Overview module provides insights into, and a brief overview of, the matches of a given season through an interactive visualization.
Each block in the visualization represents a match held in the season. Its color is determined by the winning team's color. Hover the cursor over a block to see the details of that match.
In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import * # for defining schema with various datatypes
import pyspark.sql.functions as func # for ETL, data processing on Dataframes
import pandas as pd # converting PysparkDF to PandasDF when passing it as a parameter to Bokeh invokes
from datetime import * # for datetime datatype for schema
from dateutil.parser import parse # for string parse to date
from bokeh.charts import Bar, output_file, show # creating bar charts, and displaying it
from bokeh.charts.attributes import cat # extracting column for 'label' category in bar charts
from bokeh.core.properties import field
from bokeh.io import push_notebook, show, output_notebook # various output methods for jupyter notebook
from bokeh.models import Legend, LegendItem, HoverTool, ColumnDataSource # for hover feature, and columnDS
from bokeh.models.glyphs import Rect
from bokeh.palettes import * # brewer color palette
from bokeh.plotting import figure # creating a figure variable
# Render Bokeh output inline in the notebook.
output_notebook()
# Reuse the running SparkContext when this cell is executed again: a second
# SparkContext() in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()  # creating (or reusing) the sparkcontext
sql = SQLContext(sc)  # creating SQLcontext
In [2]:
# Extracting and transforming csv data
import csv  # stdlib parser: honours quoted fields, unlike a bare str.split(',')

data_path = "../input/csv/"  # path directory to input csv files
match_rdd = sc.textFile(data_path + "matches.csv")  # reading csv file into an RDD of lines
match_header = match_rdd.filter(lambda l: "id,season" in l)  # the header line(s)
# Drop the header with a plain filter: it keeps the same rows as
# match_rdd.subtract(match_header) without triggering a shuffle.
match_no_header = match_rdd.filter(lambda l: "id,season" not in l)


def _parse_match_line(line):
    """Convert one CSV data line of matches.csv into a typed 18-field tuple.

    The line is parsed with ``csv.reader`` so that a quoted field containing a
    comma stays a single column; splitting on ',' would break such a field in
    two and shift every following column by one.

    Conversions applied: field 0 -> int, field 3 -> ``datetime.date``,
    field 9 -> bool (True when the text is '1'), fields 11 and 12 -> int.
    All other fields are kept as strings.
    """
    p = next(csv.reader([line]))
    return (int(p[0]), p[1], p[2], parse(p[3]).date(), p[4],
            p[5], p[6], p[7], p[8], p[9] == '1', p[10], int(p[11]),
            int(p[12]), p[13], p[14], p[15], p[16], p[17])


match_temp_rdd = match_no_header.map(_parse_match_line)  # Transforming csv file data
# Column names are taken from the header line of the file.
match_df = sql.createDataFrame(match_temp_rdd, match_rdd.first().split(','))  # converting to PysparkDF
match_df = match_df.orderBy(match_df.id.asc())  # asc sort by id
In [5]:
def getCleanRange(tmp_list, sort_req):  # for sanitizing fields
    """Return a new list with one leading double quote removed from each item.

    Args:
        tmp_list: iterable of strings to clean. It is not modified.
        sort_req: when truthy, the cleaned list is sorted in ascending order.

    Returns:
        A new list holding the cleaned items, in input order unless
        ``sort_req`` is truthy.
    """
    # Compare a slice rather than item[0] so that an empty string is passed
    # through unchanged instead of raising IndexError.
    item_range = [item[1:] if item[:1] == '"' else item for item in tmp_list]
    if sort_req:
        item_range.sort()
    return item_range
def getRange(season, attr, distinct_req, sort_req):  # getting a list of range values
    """Collect the values of one ``match_df`` column for a single season.

    Args:
        season: value compared against ``match_df.season`` to select rows.
        attr: name of the column to extract.
        distinct_req: when truthy, duplicate values are removed.
        sort_req: when truthy, the values are ordered by ``attr``.

    Returns:
        A list with ``str()`` of each selected value, collected to the driver.
    """
    selected = match_df.filter(match_df.season == season).select(attr)
    if distinct_req:
        selected = selected.distinct()
    if sort_req:
        selected = selected.orderBy(attr)
    # Every row holds a single column; stringify it before collecting.
    return selected.rdd.map(lambda row: str(row[0])).collect()
def getAxisRange(season_num, attr):  # get range values for x & y axes
    """Return the distinct, sorted values of ``attr`` for a season as strings.

    Delegates to ``getRange`` with both the distinct and sort flags set.
    """
    distinct_sorted = getRange(season_num, attr, 1, 1)
    return list(map(str, distinct_sorted))
def displaySeasonOverview(src, season_num, yrange, xrange):  # creating and displaying visualizations
    """Build the season overview figure from ``src`` and show it in the notebook.

    Args:
        src: Bokeh ColumnDataSource; the glyph, legend and hover tool read its
            "dates", "cities", "type_color", "label", "team1", "team2",
            "venues", "winners" and "player_of_match" columns.
        season_num: season identifier as a string; appended to the plot title.
        yrange: categorical values for the y axis.
        xrange: categorical values for the x axis.
    """
    plot = figure(
        title="Season Overview : " + season_num,
        tools="hover, save",
        y_range=yrange,
        x_range=list(xrange),
        plot_width=1000,
        plot_height=500,
    )

    # Axis settings.
    plot.xaxis.major_label_orientation = 45
    plot.yaxis.axis_label = 'Stadium Cities'
    plot.xaxis.axis_label = 'Dates'

    # Rectangles positioned by the date/city columns and filled with the
    # per-row colour column.
    match_glyph = Rect(x="dates", y="cities", width=0.9, height=0.9,
                       fill_alpha=0.8, fill_color="type_color")
    match_renderer = plot.add_glyph(src, match_glyph)

    # Legend entries come from the "label" column; the legend sits to the left.
    legend_items = [LegendItem(label=field("label"), renderers=[match_renderer])]
    plot.add_layout(Legend(items=legend_items), 'left')
    plot.legend.background_fill_color = "grey"
    plot.legend.background_fill_alpha = 0.1
    plot.legend.border_line_color = "black"

    # Hover tool configuration: tooltip label -> data source column.
    hover_fields = [
        ("Date", "@dates"),
        ("Team1", "@team1"),
        ("Team2", "@team2"),
        ("Venue", "@venues"),
        ("City", "@cities"),
        ("Winner", "@winners"),
        ("Man of the match", "@player_of_match"),
    ]
    plot.select_one(HoverTool).tooltips = hover_fields

    # Displaying the generated visualization.
    show(plot, notebook_handle=True)
def getSeasonOverview(season_num):
    """Build the data source for one season and display the season overview.

    Primary module function: it gathers the axis ranges, the team colour map
    and the per-match columns (used by the glyph, the legend and the hover
    tool), then hands them to ``displaySeasonOverview``.

    Args:
        season_num: season identifier as a string (it is concatenated into
            the plot title and compared against ``match_df.season``).
    """
    # x & y axes ranges: distinct, sorted dates and cities of the season.
    date_axis = getAxisRange(season_num, "date")
    city_axis = getAxisRange(season_num, "city")

    colorMap = {  # Colormap mapped to colors based on team jerseys
        '': '#000000',  # match without a winner
        'Chennai Super Kings': '#EED200',
        'Deccan Chargers': '#EA290B',
        'Delhi Daredevils': '#0043A8',
        'Gujarat Lions': '#9467BD',
        'Kings XI Punjab': '#DB0033',
        'Kochi Tuskers Kerala': '#E377C2',
        'Kolkata Knight Riders': '#6600DE',
        'Mumbai Indians': '#0092CD',
        'Pune Warriors': '#BCBD22',
        'Rajasthan Royals': '#B19237',
        'Rising Pune Supergiants': '#BCBD22',
        'Royal Challengers Bangalore': '#4FC730',
        'Sunrisers Hyderabad': '#EA290B',
    }
    # Winners missing from colorMap get a neutral grey instead of aborting
    # the whole visualization with a KeyError.
    fallback_color = '#808080'

    def column(attr):
        """Return every value of ``attr`` for the season, unsorted, with duplicates."""
        return getRange(season_num, attr, 0, 0)

    # Fetched once and reused for the winners, colour and legend columns
    # (previously three separate Spark collections of the same data).
    winners = column("winner")

    src = ColumnDataSource(  # column data source utilized by the Bokeh figure
        data=dict(
            dates=column("date"),
            venues=getCleanRange(column("venue"), 0),
            cities=column("city"),
            team1=column("team1"),
            team2=column("team2"),
            toss_winner=column("toss_winner"),
            toss_decision=column("toss_decision"),
            result=column("result"),
            winners=winners,
            win_by_runs=column("win_by_runs"),
            win_by_wickets=column("win_by_wickets"),
            player_of_match=column("player_of_match"),
            umpire1=column("umpire1"),
            umpire2=column("umpire2"),
            umpire3=column("umpire3"),
            type_color=[colorMap.get(w, fallback_color) for w in winners],
            label=[w if w != "" else "Tie" for w in winners],
        )
    )
    displaySeasonOverview(src, season_num, city_axis, date_axis)
In [6]:
getSeasonOverview("2013") # entry point: build and display the season overview for season "2013"
In [ ]: