Season Overview Module

The Season Overview module provides insights into, and a brief overview of, the matches of a given season through an interactive visualization.

Each block in the visualization represents a match held in the season. Its color is determined by the winning team's color. Hover the cursor over a block to see the details of that match.


In [1]:
import pyspark
from pyspark import SparkContext 
from pyspark.sql import SQLContext
from pyspark.sql.types import *         # for defining schema with various datatypes
import pyspark.sql.functions as func    # for ETL, data processing on Dataframes

import pandas as pd                     # converting PysparkDF to PandasDF when passing it as a parameter to Bokeh invokes 

from datetime import *                  # for datetime datatype for schema
from dateutil.parser import parse       # for string parse to date

from bokeh.charts import Bar, output_file, show                    # creating bar charts, and displaying it
from bokeh.charts.attributes import cat                            # extracting column for 'label' category in bar charts
from bokeh.core.properties import field
from bokeh.io import push_notebook, show, output_notebook          # various output methods for jupyter notebook
from bokeh.models import Legend, LegendItem, HoverTool, ColumnDataSource       # for hover feature, and columnDS
from bokeh.models.glyphs import Rect
from bokeh.palettes import *                                       # brewer color palette
from bokeh.plotting import figure                                  # creating a figure variable
output_notebook()          # route Bokeh output to the notebook

# getOrCreate() reuses an already-running context, so re-executing this cell
# does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()   # creating (or reusing) the sparkcontext
sql = SQLContext(sc)              # creating SQLcontext


Loading BokehJS ...

In [2]:
# Extracting and transforming csv data
import csv                                                 # quote-aware field splitting (imported here so the cell is self-contained)

data_path = "../input/csv/"                                # path directory to input csv files
match_rdd = sc.textFile(data_path + "matches.csv")         # reading csv file into an RDD of lines


def _parse_match_line(line):
    """Convert one data line of matches.csv into a typed 18-field tuple.

    The line is split with ``csv.reader`` rather than ``str.split(',')`` so a
    quoted field that itself contains a comma stays a single field instead of
    shifting every following column by one.

    Conversions: field 0, 11 and 12 -> int, field 3 -> date,
    field 9 -> bool (True when the text is '1'); all other fields stay strings.
    """
    p = next(csv.reader([line]))
    return (int(p[0]), p[1], p[2], parse(p[3]).date(), p[4],
            p[5], p[6], p[7], p[8], p[9] == '1', p[10], int(p[11]),
            int(p[12]), p[13], p[14], p[15], p[16], p[17])


match_header = match_rdd.filter(lambda l: "id,season" in l)           # storing the header line
# Keeping every non-header line; selects the same rows as
# match_rdd.subtract(match_header) but as a plain filter.
match_no_header = match_rdd.filter(lambda l: "id,season" not in l)
match_temp_rdd = match_no_header.map(_parse_match_line)               # transforming csv file data

match_df = sql.createDataFrame(match_temp_rdd, match_rdd.first().split(','))  # converting to PysparkDF, columns named after the header
match_df = match_df.orderBy(match_df.id.asc())                                # asc sort by id

In [5]:
def getCleanRange(tmp_list, sort_req):              # for sanitizing fields
    """Return a new list with one leading double quote removed from each item.

    tmp_list -- iterable of strings; it is not modified
    sort_req -- truthy to sort the cleaned list in ascending order

    Empty strings are passed through unchanged (indexing ``item[0]`` would
    raise IndexError on them, so a slice comparison is used instead).
    """
    item_range = [item[1:] if item[:1] == '"' else item for item in tmp_list]
    if sort_req:
        item_range.sort()
    return item_range


def getRange(season, attr, distinct_req, sort_req):  # getting a list of range values
    """Collect one column of match_df for a season as a list of strings.

    season       -- value compared for equality against match_df.season
    attr         -- name of the column to select
    distinct_req -- truthy to drop duplicate values
    sort_req     -- truthy to order the values by the column
    """
    season_column = match_df.filter(match_df.season == season).select(attr)

    if distinct_req:
        season_column = season_column.distinct()
    if sort_req:
        season_column = season_column.orderBy(attr)

    # every value is stringified, whatever the column's type
    return season_column.rdd.map(lambda row: str(row[0])).collect()


def getAxisRange(season_num, attr):                  # get range values for x & y axes
    """Return the distinct, sorted values of column ``attr`` for the season as strings."""
    distinct_sorted_values = getRange(season_num, attr, 1, 1)
    return list(map(str, distinct_sorted_values))


def displaySeasonOverview(src, season_num, yrange, xrange):    # creating and displaying visualizations
    """Draw one rectangle per match on a dates x cities grid and show it in the notebook.

    src        -- column data source; the glyph, legend and tooltips read its
                  "dates", "cities", "type_color", "label", "team1", "team2",
                  "venues", "winners" and "player_of_match" columns
    season_num -- season label, concatenated onto the plot title
    yrange     -- values for the y axis range
    xrange     -- iterable of values for the x axis range
    """
    plot = figure(
        title="Season Overview : " + season_num,
        tools="hover, save",
        y_range=yrange,
        x_range=list(xrange),
        plot_width=1000,
        plot_height=500,
    )

    # axis settings
    plot.xaxis.major_label_orientation = 45
    plot.yaxis.axis_label = 'Stadium Cities'
    plot.xaxis.axis_label = 'Dates'

    # one rectangle per row of src, filled with that row's "type_color"
    match_glyph = Rect(x="dates", y="cities", width=0.9, height=0.9,
                       fill_alpha=0.8, fill_color="type_color")
    match_renderer = plot.add_glyph(src, match_glyph)

    # legend entries come from the "label" column of src
    legend_item = LegendItem(label=field("label"), renderers=[match_renderer])
    plot.add_layout(Legend(items=[legend_item]), 'left')

    plot.legend.background_fill_color = "grey"
    plot.legend.background_fill_alpha = 0.1
    plot.legend.border_line_color = "black"

    # hover tool: each "@name" refers to a column of src
    hover_tooltips = [
        ("Date", "@dates"),
        ("Team1", "@team1"),
        ("Team2", "@team2"),
        ("Venue", "@venues"),
        ("City", "@cities"),
        ("Winner", "@winners"),
        ("Man of the match","@player_of_match"),
    ]
    plot.select_one(HoverTool).tooltips = hover_tooltips

    # displaying generated visualization
    show(plot, notebook_handle=True)
    
def getSeasonOverview(season_num):
    """Build the data source for one season and render its overview plot.

    season_num -- season to show; compared for equality against
                  match_df.season and used in the plot title

    All per-match columns are read from match_df with a single collect, so
    every list in the data source comes from the same rows in the same order.
    Each value is converted with str(), and one leading double quote is
    stripped from the venue values via getCleanRange.
    """
    x_range = getAxisRange(season_num, "date")          # getting x & y axes ranges
    y_range = getAxisRange(season_num, "city")

    colorMap = {                                        # Colormap mapped to colors based on team jerseys
        ''                              : '#000000',
        'Chennai Super Kings'           : '#EED200',
        'Deccan Chargers'               : '#EA290B',
        'Delhi Daredevils'              : '#0043A8',
        'Gujarat Lions'                 : '#9467BD',
        'Kings XI Punjab'               : '#DB0033',
        'Kochi Tuskers Kerala'          : '#E377C2',
        'Kolkata Knight Riders'         : '#6600DE',
        'Mumbai Indians'                : '#0092CD',
        'Pune Warriors'                 : '#BCBD22',
        'Rajasthan Royals'              : '#B19237',
        'Rising Pune Supergiants'       : '#BCBD22',
        'Royal Challengers Bangalore'   : '#4FC730',
        'Sunrisers Hyderabad'           : '#EA290B'
    }
    # Color for a winner missing from colorMap (e.g. a new or differently
    # spelled team name); avoids a KeyError that would abort the whole plot.
    fallback_color = '#808080'

    # (data-source key, match_df column) pairs
    source_columns = [
        ("dates", "date"),
        ("venues", "venue"),
        ("cities", "city"),
        ("team1", "team1"),
        ("team2", "team2"),
        ("toss_winner", "toss_winner"),
        ("toss_decision", "toss_decision"),
        ("result", "result"),
        ("winners", "winner"),
        ("win_by_runs", "win_by_runs"),
        ("win_by_wickets", "win_by_wickets"),
        ("player_of_match", "player_of_match"),
        ("umpire1", "umpire1"),
        ("umpire2", "umpire2"),
        ("umpire3", "umpire3"),
    ]
    column_names = [column for _, column in source_columns]

    # One job for the whole season instead of one job per column.
    season_rows = match_df.filter(match_df.season == season_num)\
                          .select(*column_names).collect()

    data = {}
    for position, (key, _) in enumerate(source_columns):
        data[key] = [str(row[position]) for row in season_rows]

    data["venues"] = getCleanRange(data["venues"], 0)

    winners = data["winners"]
    data["type_color"] = [colorMap.get(team, fallback_color) for team in winners]
    data["label"] = [team if team != "" else "Tie" for team in winners]

    src = ColumnDataSource(data=data)                   # column data source used by the Bokeh glyph, legend and hover tool
    displaySeasonOverview(src, season_num, y_range, x_range)

In [6]:
# Render the overview for the 2013 season. The season is passed as a string
# because displaySeasonOverview concatenates it onto the plot title.
getSeasonOverview("2013")        # function to call season overview module



In [ ]: