Overall Standings module produces chart results for a particular season on the basis of #wins. Season is required as an input.
Performance Consistency module tracks a team's performance improvement consistency across all the seasons. Teams that have played for less than four season are not considered.
In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import * # for defining schema with various datatypes
import pyspark.sql.functions as func # for ETL, data processing on Dataframes
import pandas as pd # converting PysparkDF to PandasDF when passing it as a parameter to Bokeh invokes
from datetime import * # for datetime datatype for schema
from dateutil.parser import parse # for string parse to date
from bokeh.io import push_notebook, show, output_notebook # various output methods for jupyter notebook
from bokeh.plotting import figure # creating a figure variable
from bokeh.charts import Bar, output_file, show # creating bar charts, and displaying it
from bokeh.charts.attributes import cat # extracting column for 'label' category in bar charts
from bokeh.palettes import * # brewer color palette
from bokeh.models import Range1d # calibrating x/y-ranges of graph
output_notebook()
sc = SparkContext()
sql = SQLContext(sc)
In [2]:
#Extracting and Transforming csv data
data_path = "../input/csv/" # path directory to input csv files
data_opath = "../output/csv/" # path directory to output csv files
match_rdd = sc.textFile(data_path + "matches.csv") # reading csv files into RDD
# Issue: Applying this custom schema generates error while calling various PysparkDF functions. e.g. show()
# header = match_rdd.first()
# fields = [StructField(field_name, StringType(), False) for field_name in header.split(',')]
# fields[0].dataType = LongType()
# fields[3].dataType = DateType()
# fields[9].dataType = BooleanType()
# fields[11].dataType = LongType()
# fields[12].dataType = LongType()
# fields[17].nullable = True
# schema = StructType(fields)
match_header = match_rdd.filter(lambda l: "id,season" in l) # storing the header tuple
match_no_header = match_rdd.subtract(match_header) # subtracting it from RDD
match_temp_rdd = match_no_header.map(lambda k: k.split(','))\
.map(lambda p: (int(p[0]), int(p[1]),p[2],parse(p[3]).date(),p[4]\
,p[5],p[6],p[7],p[8],p[9]=='1',p[10],int(p[11])\
,int(p[12]),p[13],p[14],p[15],p[16],p[17])) # Transforming csv file data
match_df = sql.createDataFrame(match_temp_rdd, match_rdd.first().split(',')) # converting to PysparkDF
match_df = match_df.orderBy(match_df.id.asc()) # asc sort by id
match_df.show()
In [3]:
def get_color_list(paletteName,numRows):
return all_palettes[paletteName][numRows]
In [4]:
# Overall ranking module
def get_overall_ranks_df(season_num):
overall_ranking = match_df.filter(match_df.season == season_num)\
.groupBy("winner").count().orderBy("count",ascending=0) # extracting required columns into another DF
overall_ranking = overall_ranking.filter("winner != '' ") # Deleting records of tied matches
overall_ranking = overall_ranking.selectExpr("winner as Teams", "count as Wins") # Renaming columns
return overall_ranking
def overall_rank_func(season_num):
overall_ranking = get_overall_ranks_df(season_num)
overall_ranking.show(truncate=False)
overall_pdf = overall_ranking.toPandas() # Converting to PandaDF
clr = get_color_list('Viridis',overall_ranking.count()) # Brewing color hex values for each tuple('team')
figure_overall_ranking = Bar(overall_pdf, values="Wins", color="Teams",palette=clr,\
label=cat(columns="Teams", sort=False), xgrid=True,\
xlabel="Teams", ylabel="Wins", title="Overall Standings " + str(season_num),\
legend='top_right', plot_width=950, bar_width=0.6) # generating bar chart
handle_overall_ranking = show(figure_overall_ranking, notebook_handle=True)
# displaying the chart
In [5]:
#Performance consistency module
def get_consistency_DF(season_lbound, season_ubound):
consistency_df = match_df.select("season","winner")\
.groupBy("season","winner").count().orderBy("winner") # extracting required columns
consistency_df = consistency_df.filter("winner!='' ") # filtering out tied matches records
cond1 = func.col("season") >= season_lbound
cond2 = func.col("season") <= season_ubound
consistency_df = consistency_df.filter(cond1 & cond2)
return consistency_df
def get_constraints():
# constraint : teams that haven't played more than three season aren't considered
constraint_df = match_df.groupBy("winner","season")\
.count().orderBy("winner") # extracting list of season-wise winner teams
constraint_df = constraint_df.groupBy("winner").count()\
.filter("count>3 and winner!='' ") # filtering out teams that don't satisfy constraint
return constraint_df
def filter_using_constraints(consistency_df, constraint_list):
consistency_df = consistency_df.where(func.col("winner")\
.isin(constraint_list)) # applying the constraint list
consistency_df = consistency_df.groupBy("winner")\
.agg(func.stddev_pop("count").alias("stddev"),\
func.sum("count").alias("total_wins"))\
.orderBy("stddev","total_wins") # calculating the performance consistency
return consistency_df
def calc_consistency(consistency_df):
consistency_df = consistency_df.withColumn("final_deviations",\
((10-consistency_df.stddev)/10)*100)\
.orderBy("final_deviations", ascending=False) # scaling to appropriate scale
consistency_df = consistency_df.selectExpr("winner as Teams", "final_deviations as Consistency")
return consistency_df
def consistency_func(season_lbound = 2008, season_ubound = 2016):
consistency_df = get_consistency_DF(season_lbound, season_ubound) # extracting required columns
constraints_df = get_constraints()
constraints_list = [i.winner for i in constraints_df.collect()] # storing a list of filtered teams
consistency_df = filter_using_constraints(consistency_df, constraints_list)
consistency_df = calc_consistency(consistency_df)
consistency_df.show(truncate=False)
consistency_pdf = consistency_df.toPandas() # converting to PandasDF
clr= get_color_list("RdYlGn", consistency_df.count()) # brewing colors hex values for each team
figure_consistency = Bar(consistency_pdf, values="Consistency",\
color="Teams", palette=clr,\
label=cat(columns="Teams", sort=False),\
xlabel="Teams", ylabel="Win Consistency %age",\
title="IPL Performance Consistencies",\
legend='top_right', plot_width=950, bar_width=0.6) # generating bar chart
figure_consistency.y_range = Range1d(60,100) # setting appropriate ranges
handle_consistency = show(figure_consistency, notebook_handle=True) # displaying chart
In [6]:
overall_rank_func(2013) # call this function by providing season year as arg
consistency_func(2009, 2014) # function to call consistency module