Overall Standings & Performance Consistency Module

Overall Standings module produces chart results for a particular season on the basis of #wins. Season is required as an input.

Performance Consistency module tracks a team's performance improvement consistency across all the seasons. Teams that have played for less than four season are not considered.


In [1]:
import pyspark                      
from pyspark import SparkContext 
from pyspark.sql import SQLContext
from pyspark.sql.types import *         # for defining schema with various datatypes
import pyspark.sql.functions as func    # for ETL, data processing on Dataframes

import pandas as pd                     # converting PysparkDF to PandasDF when passing it as a parameter to Bokeh invokes 

from datetime import *                  # for datetime datatype for schema
from dateutil.parser import parse       # for string parse to date

from bokeh.io import push_notebook, show, output_notebook  # various output methods for jupyter notebook
from bokeh.plotting import figure                          # creating a figure variable
from bokeh.charts import Bar, output_file, show            # creating bar charts, and displaying it
from bokeh.charts.attributes import cat                    # extracting column for 'label' category in bar charts
from bokeh.palettes import *                               # brewer color palette
from bokeh.models import Range1d                           # calibrating x/y-ranges of graph
output_notebook()

sc = SparkContext()
sql = SQLContext(sc)


Loading BokehJS ...

In [2]:
#Extracting and Transforming csv data

data_path = "../input/csv/"                                # path directory to input csv files
data_opath = "../output/csv/"                               # path directory to output csv files
match_rdd = sc.textFile(data_path + "matches.csv")         # reading csv files into RDD

# Issue: Applying this custom schema generates error while calling various PysparkDF functions. e.g. show()
# header = match_rdd.first()
# fields = [StructField(field_name, StringType(), False) for field_name in header.split(',')]
# fields[0].dataType = LongType()
# fields[3].dataType = DateType()
# fields[9].dataType = BooleanType()
# fields[11].dataType = LongType()
# fields[12].dataType = LongType()
# fields[17].nullable = True
# schema = StructType(fields)

match_header = match_rdd.filter(lambda l: "id,season" in l)     # storing the header tuple
match_no_header = match_rdd.subtract(match_header)              # subtracting it from RDD
match_temp_rdd = match_no_header.map(lambda k: k.split(','))\
.map(lambda p: (int(p[0]), int(p[1]),p[2],parse(p[3]).date(),p[4]\
                ,p[5],p[6],p[7],p[8],p[9]=='1',p[10],int(p[11])\
                ,int(p[12]),p[13],p[14],p[15],p[16],p[17]))     # Transforming csv file data

match_df = sql.createDataFrame(match_temp_rdd, match_rdd.first().split(','))  # converting to PysparkDF
match_df = match_df.orderBy(match_df.id.asc())                                # asc sort by id
match_df.show()


+---+------+----------+----------+--------------------+--------------------+--------------------+-------------+------+----------+--------------------+-----------+--------------+---------------+--------------------+-----------+--------------+--------------+
| id|season|      city|      date|               team1|               team2|         toss_winner|toss_decision|result|dl_applied|              winner|win_by_runs|win_by_wickets|player_of_match|               venue|    umpire1|       umpire2|       umpire3|
+---+------+----------+----------+--------------------+--------------------+--------------------+-------------+------+----------+--------------------+-----------+--------------+---------------+--------------------+-----------+--------------+--------------+
|  1|  2008| Bangalore|2008-04-18|Kolkata Knight Ri...|Royal Challengers...|Royal Challengers...|        field|normal|     false|Kolkata Knight Ri...|        140|             0|    BB McCullum|M Chinnaswamy Sta...|  Asad Rauf|   RE Koertzen|              |
|  2|  2008|Chandigarh|2008-04-19| Chennai Super Kings|     Kings XI Punjab| Chennai Super Kings|          bat|normal|     false| Chennai Super Kings|         33|             0|     MEK Hussey|"Punjab Cricket A...|    Mohali"|     MR Benson|    SL Shastri|
|  3|  2008|     Delhi|2008-04-19|    Rajasthan Royals|    Delhi Daredevils|    Rajasthan Royals|          bat|normal|     false|    Delhi Daredevils|          0|             9|    MF Maharoof|    Feroz Shah Kotla|  Aleem Dar|GA Pratapkumar|              |
|  4|  2008|    Mumbai|2008-04-20|      Mumbai Indians|Royal Challengers...|      Mumbai Indians|          bat|normal|     false|Royal Challengers...|          0|             5|     MV Boucher|    Wankhede Stadium|   SJ Davis|     DJ Harper|              |
|  5|  2008|   Kolkata|2008-04-20|     Deccan Chargers|Kolkata Knight Ri...|     Deccan Chargers|          bat|normal|     false|Kolkata Knight Ri...|          0|             5|      DJ Hussey|        Eden Gardens|  BF Bowden|   K Hariharan|              |
|  6|  2008|    Jaipur|2008-04-21|     Kings XI Punjab|    Rajasthan Royals|     Kings XI Punjab|          bat|normal|     false|    Rajasthan Royals|          0|             6|      SR Watson|Sawai Mansingh St...|  Aleem Dar|     RB Tiffin|              |
|  7|  2008| Hyderabad|2008-04-22|     Deccan Chargers|    Delhi Daredevils|     Deccan Chargers|          bat|normal|     false|    Delhi Daredevils|          0|             9|       V Sehwag|"Rajiv Gandhi Int...|     Uppal"|     IL Howell|     AM Saheba|
|  8|  2008|   Chennai|2008-04-23| Chennai Super Kings|      Mumbai Indians|      Mumbai Indians|        field|normal|     false| Chennai Super Kings|          6|             0|      ML Hayden|"MA Chidambaram S...|   Chepauk"|     DJ Harper|GA Pratapkumar|
|  9|  2008| Hyderabad|2008-04-24|     Deccan Chargers|    Rajasthan Royals|    Rajasthan Royals|        field|normal|     false|    Rajasthan Royals|          0|             3|      YK Pathan|"Rajiv Gandhi Int...|     Uppal"|     Asad Rauf|     MR Benson|
| 10|  2008|Chandigarh|2008-04-25|     Kings XI Punjab|      Mumbai Indians|      Mumbai Indians|        field|normal|     false|     Kings XI Punjab|         66|             0|  KC Sangakkara|"Punjab Cricket A...|    Mohali"|     Aleem Dar|     AM Saheba|
| 11|  2008| Bangalore|2008-04-26|Royal Challengers...|    Rajasthan Royals|    Rajasthan Royals|        field|normal|     false|    Rajasthan Royals|          0|             7|      SR Watson|M Chinnaswamy Sta...|  MR Benson|     IL Howell|              |
| 12|  2008|   Chennai|2008-04-26|Kolkata Knight Ri...| Chennai Super Kings|Kolkata Knight Ri...|          bat|normal|     false| Chennai Super Kings|          0|             9|       JDP Oram|"MA Chidambaram S...|   Chepauk"|     BF Bowden|AV Jayaprakash|
| 13|  2008|    Mumbai|2008-04-27|      Mumbai Indians|     Deccan Chargers|     Deccan Chargers|        field|normal|     false|     Deccan Chargers|          0|            10|   AC Gilchrist|Dr DY Patil Sport...|  Asad Rauf|    SL Shastri|              |
| 14|  2008|Chandigarh|2008-04-27|    Delhi Daredevils|     Kings XI Punjab|    Delhi Daredevils|          bat|normal|     false|     Kings XI Punjab|          0|             4|      SM Katich|"Punjab Cricket A...|    Mohali"|   RE Koertzen|     I Shivram|
| 15|  2008| Bangalore|2008-04-28| Chennai Super Kings|Royal Challengers...| Chennai Super Kings|          bat|normal|     false| Chennai Super Kings|         13|             0|       MS Dhoni|M Chinnaswamy Sta...|BR Doctrove|     RB Tiffin|              |
| 16|  2008|   Kolkata|2008-04-29|Kolkata Knight Ri...|      Mumbai Indians|Kolkata Knight Ri...|          bat|normal|     false|      Mumbai Indians|          0|             7|  ST Jayasuriya|        Eden Gardens|  BF Bowden|AV Jayaprakash|              |
| 17|  2008|     Delhi|2008-04-30|    Delhi Daredevils|Royal Challengers...|Royal Challengers...|        field|normal|     false|    Delhi Daredevils|         10|             0|     GD McGrath|    Feroz Shah Kotla|  Aleem Dar|     I Shivram|              |
| 18|  2008| Hyderabad|2008-05-01|     Deccan Chargers|     Kings XI Punjab|     Kings XI Punjab|        field|normal|     false|     Kings XI Punjab|          0|             7|       SE Marsh|"Rajiv Gandhi Int...|     Uppal"|   BR Doctrove|     RB Tiffin|
| 19|  2008|    Jaipur|2008-05-01|    Rajasthan Royals|Kolkata Knight Ri...|    Rajasthan Royals|          bat|normal|     false|    Rajasthan Royals|         45|             0|    SA Asnodkar|Sawai Mansingh St...|RE Koertzen|GA Pratapkumar|              |
| 20|  2008|   Chennai|2008-05-02| Chennai Super Kings|    Delhi Daredevils| Chennai Super Kings|          bat|normal|     false|    Delhi Daredevils|          0|             8|       V Sehwag|"MA Chidambaram S...|   Chepauk"|     BF Bowden|   K Hariharan|
+---+------+----------+----------+--------------------+--------------------+--------------------+-------------+------+----------+--------------------+-----------+--------------+---------------+--------------------+-----------+--------------+--------------+
only showing top 20 rows


In [3]:
def get_color_list(paletteName,numRows):
    return all_palettes[paletteName][numRows]

In [4]:
# Overall ranking module

def get_overall_ranks_df(season_num):
    overall_ranking = match_df.filter(match_df.season == season_num)\
                    .groupBy("winner").count().orderBy("count",ascending=0) # extracting required columns into another DF
        
    overall_ranking = overall_ranking.filter("winner != '' ")                          # Deleting records of tied matches
    overall_ranking = overall_ranking.selectExpr("winner as Teams", "count as Wins")   # Renaming columns
    return overall_ranking
    
    
def overall_rank_func(season_num):
    overall_ranking = get_overall_ranks_df(season_num)
    overall_ranking.show(truncate=False)

    overall_pdf = overall_ranking.toPandas()                 # Converting to PandaDF
    clr = get_color_list('Viridis',overall_ranking.count())  # Brewing color hex values for each tuple('team')

    figure_overall_ranking = Bar(overall_pdf, values="Wins", color="Teams",palette=clr,\
            label=cat(columns="Teams", sort=False), xgrid=True,\
            xlabel="Teams", ylabel="Wins", title="Overall Standings " + str(season_num),\
            legend='top_right', plot_width=950, bar_width=0.6)   # generating bar chart
    
    handle_overall_ranking = show(figure_overall_ranking, notebook_handle=True)
                                                                 # displaying the chart

In [5]:
#Performance consistency module

def get_consistency_DF(season_lbound, season_ubound):    
    consistency_df = match_df.select("season","winner")\
    .groupBy("season","winner").count().orderBy("winner")                  # extracting required columns 
    consistency_df = consistency_df.filter("winner!='' ")                  # filtering out tied matches records
    cond1 = func.col("season") >= season_lbound 
    cond2 = func.col("season") <= season_ubound
    consistency_df = consistency_df.filter(cond1 & cond2) 
    return consistency_df


def get_constraints():
    # constraint : teams that haven't played more than three season aren't considered
    constraint_df = match_df.groupBy("winner","season")\
                .count().orderBy("winner")                                 # extracting list of season-wise winner teams
 
    constraint_df = constraint_df.groupBy("winner").count()\
                .filter("count>3 and winner!='' ")                       # filtering out teams that don't satisfy constraint
    return constraint_df


def filter_using_constraints(consistency_df, constraint_list):
    consistency_df = consistency_df.where(func.col("winner")\
                    .isin(constraint_list))                                # applying the constraint list
    
    consistency_df = consistency_df.groupBy("winner")\
                    .agg(func.stddev_pop("count").alias("stddev"),\
                    func.sum("count").alias("total_wins"))\
                    .orderBy("stddev","total_wins")                        # calculating the performance consistency
    return consistency_df


def calc_consistency(consistency_df):
    consistency_df = consistency_df.withColumn("final_deviations",\
                    ((10-consistency_df.stddev)/10)*100)\
                    .orderBy("final_deviations", ascending=False)          # scaling to appropriate scale
        
    consistency_df = consistency_df.selectExpr("winner as Teams", "final_deviations as Consistency")
    
    return consistency_df


def consistency_func(season_lbound = 2008, season_ubound = 2016):
    consistency_df = get_consistency_DF(season_lbound, season_ubound)      # extracting required columns 
    constraints_df = get_constraints()
    constraints_list = [i.winner for i in constraints_df.collect()]          # storing a list of filtered teams
    consistency_df = filter_using_constraints(consistency_df, constraints_list)
    consistency_df = calc_consistency(consistency_df)
    consistency_df.show(truncate=False)
    
    consistency_pdf = consistency_df.toPandas()                    # converting to PandasDF
    clr= get_color_list("RdYlGn", consistency_df.count())          # brewing colors hex values for each team
    
    figure_consistency = Bar(consistency_pdf, values="Consistency",\
             color="Teams", palette=clr,\
             label=cat(columns="Teams", sort=False),\
             xlabel="Teams", ylabel="Win Consistency %age",\
             title="IPL Performance Consistencies",\
             legend='top_right', plot_width=950, bar_width=0.6)          # generating bar chart
    
    figure_consistency.y_range = Range1d(60,100)                         # setting appropriate ranges
    handle_consistency = show(figure_consistency, notebook_handle=True)  # displaying chart

In [6]:
overall_rank_func(2013)           # call this function by providing season year as arg
consistency_func(2009, 2014)      # function to call consistency module


+---------------------------+----+
|Teams                      |Wins|
+---------------------------+----+
|Mumbai Indians             |13  |
|Chennai Super Kings        |12  |
|Rajasthan Royals           |11  |
|Sunrisers Hyderabad        |10  |
|Royal Challengers Bangalore|9   |
|Kings XI Punjab            |8   |
|Kolkata Knight Riders      |6   |
|Pune Warriors              |4   |
|Delhi Daredevils           |3   |
+---------------------------+----+

+---------------------------+-----------------+
|Teams                      |Consistency      |
+---------------------------+-----------------+
|Chennai Super Kings        |87.09005551264195|
|Royal Challengers Bangalore|84.276698113239  |
|Rajasthan Royals           |82.28309031210891|
|Deccan Chargers            |80.79713563032848|
|Sunrisers Hyderabad        |80.0             |
|Kings XI Punjab            |76.42977396044841|
|Mumbai Indians             |73.7533070866273 |
|Kolkata Knight Riders      |69.76940475463824|
|Delhi Daredevils           |65.64078645318617|
+---------------------------+-----------------+