In [3]:
## import statements
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
In [5]:
## set up plotting aesthetics
# Dark theme: charcoal figure/axes backgrounds with light-grey labels,
# ticks, text and axis edges; grid lines are switched off.
sns.set(rc={
    'axes.facecolor': '#202020',
    'axes.labelcolor': '#e0e0e0',
    'axes.edgecolor': '#e0e0e0',
    'axes.grid': False,
    'text.color': '#e0e0e0',
    'figure.facecolor': '#202020',
    'figure.edgecolor': '#202020',
    'xtick.color': '#e0e0e0',
    'ytick.color': '#e0e0e0',
    'legend.fancybox': 'true',
})

# One 12-colour HLS palette (h=0, l=.5, s=.5); every palette used in the
# notebook is a hand-picked selection of entries from it.
hue_wheel = sns.hls_palette(12, 0, .5, .5)
octal_seq = [hue_wheel[i] for i in (0, 1, 2, 4, 7, 9)]    # used for ordered bars
octal_cat = [hue_wheel[i] for i in (7, 1, 4, 0, 9, 2)]    # used for categorical hues
octal_unity = [hue_wheel[4]]                              # single-colour palette
# Preview the palettes with sns.palplot(octal_seq) / sns.palplot(octal_cat).
In [6]:
# returns a colormap for an rgb color fading to full transparency
def fade_map(r=255, g=255, b=255, max_a=1, N=16):
    """Build a single-colour colormap whose alpha ramps from 0 up to `max_a`.

    Parameters
    ----------
    r, g, b : number
        Colour channels on the 0-255 scale; they are divided by 255 here.
    max_a : float
        Alpha of the last colormap entry (the first entry has alpha 0).
    N : int
        Number of entries in the colormap. The previous version reset this
        to 16 inside the body, so the argument was silently ignored.

    Returns
    -------
    matplotlib.colors.ListedColormap
        N RGBA rows sharing one RGB value, with linearly spaced alpha.
    """
    rgba = np.empty((N, 4))
    # Constant colour in the first three columns (255.0 forces true division).
    rgba[:, 0] = r / 255.0
    rgba[:, 1] = g / 255.0
    rgba[:, 2] = b / 255.0
    # Alpha grows linearly from fully transparent to max_a.
    rgba[:, 3] = np.linspace(0, max_a, N)
    return mpl.colors.ListedColormap(rgba)
# simulate continuity in integer data by adding a random offset of at most 1/2
def jitter_int(x):
    """Return `x` as a float, shifted by one uniform draw from [-0.5, 0.5).

    Uses NumPy's global random state, so results are reproducible only if
    the caller seeds it.
    """
    offset = np.random.random_sample() - 0.5
    return float(x) + offset
# perform kde plot for different factor values
def factorkde(stat, factor, data, smooth=False, **kwargs):
    """Overlay one KDE curve of `stat` for each distinct value of `factor`.

    Parameters
    ----------
    stat : str
        Column whose distribution is plotted.
    factor : str
        Column whose distinct values split the data into separate curves.
    data : pandas.DataFrame
        Source frame; it is never modified.
    smooth : bool
        If True, every `stat` value is passed through `jitter_int` first,
        which spreads integer-valued data out before the density estimate.
    **kwargs
        Forwarded unchanged to `sns.kdeplot`.
    """
    # Work on a real copy: plain assignment only aliases `data`, so the
    # jitter below used to overwrite the caller's column.
    temp_df = data.copy()
    # smooth kde by adding jitter
    if smooth:
        temp_df[stat] = temp_df[stat].apply(jitter_int)
    # unique() keeps first-appearance order, so the curve (and colour) order
    # follows the row order of `data` instead of arbitrary set ordering.
    for level in temp_df[factor].unique():
        sns.kdeplot(temp_df.loc[temp_df[factor] == level, stat],
                    label=level,
                    **kwargs)
def plot_counts(factor, data, num_to_plot=-1, **kwargs):
    """Bar-plot how often each value of `factor` occurs, as a percentage.

    Parameters
    ----------
    factor : str
        Column of `data` to tabulate.
    data : pandas.DataFrame
        Source frame.
    num_to_plot : int
        Number of most frequent values to show; -1 shows all of them.
        Percentages are always relative to every row counted, not only
        the values shown.
    **kwargs
        Forwarded to `sns.factorplot`.

    Returns
    -------
    The object returned by `sns.factorplot`.
    """
    # value_counts() is already ordered from most to least frequent.
    counts = data[factor].value_counts()
    df = pd.DataFrame({factor: counts.index.values,
                       'count': counts.values})
    df['pct'] = 100 * df['count'] / df['count'].sum()

    limit = len(counts) if num_to_plot == -1 else num_to_plot
    return sns.factorplot(x=factor,
                          y='pct',
                          data=df[:limit],
                          kind='bar',
                          ci=0,
                          estimator=sum,
                          **kwargs)
def get_record(df, small=0):
    """Summarise the win/loss record contained in `df['result']`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a 'result' column; only the values 'win' and 'loss'
        are counted.
    small : number
        Added to the game count in the denominator of the error term only.

    Returns
    -------
    tuple
        (wins, losses, prob, err) where prob = wins / (wins + losses) and
        err = sqrt(prob * (1 - prob) / (wins + losses + small)).
        When there are no counted games, prob and err are NaN.
    """
    tally = df['result'].value_counts()
    # .get() with a default: a subset with no wins (or no losses) has no
    # such label in value_counts(), and plain indexing raised KeyError.
    wins = int(tally.get('win', 0))
    losses = int(tally.get('loss', 0))
    total = wins + losses
    if total == 0:
        # Nothing to estimate from; avoid dividing by zero.
        return (wins, losses, float('nan'), float('nan'))
    # float() keeps this a true division on every Python version.
    prob = wins / float(total)
    err = np.sqrt(prob * (1 - prob) / (total + small))
    return (wins, losses, prob, err)
def record_str(df, small=0):
    """Format the record in `df` as 'W-L (P ± E%)'.

    W and L are the win and loss counts; P and E are the win probability
    and its error from `get_record(df, small)`, scaled to percent and
    rounded to whole numbers.
    """
    wins, losses, win_prob, win_err = get_record(df, small)
    template = '{0}-{1} ({2:.0f} ± {3:.0f}%)'
    return template.format(wins, losses, 100 * win_prob, 100 * win_err)
In [9]:
# read data
# The game log path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='./data/nhl15_gamelog.csv')
# df.info() gives a quick look at the columns and dtypes.
In [10]:
## transform data
# Every derived column below is added to `df` itself and is computed only
# from the raw columns, so re-running this cell gives the same frame.
df['count'] = 1

# get stat differentials ("for" minus "against"; own level minus opponent level)
df['g_dif'] = df['gf'] - df['ga']
df['s_dif'] = df['sf'] - df['sa']
df['toa_dif'] = df['toaf'] - df['toaa']
df['hit_dif'] = df['hitf'] - df['hita']
df['lvl_dif'] = df['lvl'] - df['opplvl']
# five equal-sized bins of opponent level, weakest to strongest
df['opp_skill_quantile'] = pd.qcut(
    df['opplvl'],
    q=5,
    labels=['Lowest', 'Lower', 'Average', 'Higher', 'Highest'],
)

# get stat percentages (own share of the combined total)
df['pct_g'] = 100 * df['gf'] / (df['gf'] + df['ga'])
df['pct_s'] = 100 * df['sf'] / (df['sf'] + df['sa'])
df['pct_toa'] = 100 * df['toaf'] / (df['toaf'] + df['toaa'])
df['pct_offense'] = (df['pct_s'] + df['pct_toa']) / 2

# categorize wins vs losses
# A goal differential of exactly zero is labelled 'loss'.
df['result'] = df['g_dif'].gt(0).map({True: 'win', False: 'loss'})
# 0/1 indicator columns for a strictly positive differential
df['win'] = df['g_dif'].gt(0).astype('int64')
df['more_shots'] = df['s_dif'].gt(0).astype('int64')
df['more_toa'] = df['toa_dif'].gt(0).astype('int64')

# Encode the two advantage flags as a two-digit code (tens = attack time,
# ones = shots) and translate the code into a label.
df['quadrant'] = (10 * df['more_toa'] + df['more_shots']).map({
    11: 'both',
    10: 'poss',
    1: 'shots',
    0: 'neither',
})

# Result of the game 1, 2 and 3 rows earlier.
# NOTE(review): assumes rows are in chronological order — confirm against the CSV.
df['previous_result'] = df['result'].shift(1)
df['previous_result2'] = df['result'].shift(2)
df['previous_result3'] = df['result'].shift(3)
In [12]:
# Scatter of shot differential vs. attack-time differential, coloured by result.
# DataFrame.sort was removed in pandas 0.20; sort_values is its replacement.
# Rows are ordered by goal differential, largest first, which presumably puts
# wins ahead of losses in the hue order.
sns.lmplot(x='s_dif',
           y='toa_dif',
           hue='result',
           data=df.sort_values('g_dif', ascending=False),
           x_jitter=0.5,      # spread out the integer shot differentials
           fit_reg=False,     # scatter only, no regression line
           palette=octal_cat,
           aspect=1)
Out[12]:
This gives a basic overview of the raw stat distributions in wins and losses. It is clear that the majority of wins (blue) occur in the first quadrant, where there is an advantage in both shots and time of possession.
We can take this a step further and use a color gradient for goal differential instead of just wins/losses. This should highlight big wins and losses compared to close decisions.
In [17]:
# Same scatter, but coloured by goal differential on a diverging palette.
# The column is renamed so the legend shows a readable title.
temp_df = df.rename(columns={'g_dif': 'Goal Differential'})
# DataFrame.sort was removed in pandas 0.20; sort_values is its replacement.
# NOTE(review): the palette has 12 colours — confirm there are no more than
# 12 distinct goal differentials in the data.
grid = sns.lmplot(x='s_dif',
                  y='toa_dif',
                  hue='Goal Differential',
                  data=temp_df.sort_values('Goal Differential', ascending=False),
                  x_jitter=0.5,
                  fit_reg=False,
                  palette=sns.diverging_palette(240, 30, l=50, s=90, n=12, sep=1, center='dark'),
                  scatter_kws={"s": 40})
grid.set_axis_labels('Shot Difference', 'Attack Time Difference (min)')
plt.title('Stat Advantage Heatmap')
Out[17]:
A similar plot can be produced using the percentage of shots and possession instead of the absolute differences.
In [21]:
# Percentage version of the goal-differential scatter: share of shots vs.
# share of attack time, both on a fixed 0-100 scale.
temp_df = df.rename(columns={'g_dif': 'Goal Differential'})
# DataFrame.sort was removed in pandas 0.20; sort_values is its replacement.
grid = sns.lmplot(x='pct_s',
                  y='pct_toa',
                  hue='Goal Differential',
                  data=temp_df.sort_values('Goal Differential', ascending=False),
                  x_jitter=0.5,
                  fit_reg=False,
                  palette=sns.diverging_palette(240, 30, l=50, s=90, n=12, sep=1, center='dark'),
                  scatter_kws={"s": 40})
grid.set_axis_labels('Percent of Shots', 'Percent of Attack Time')
plt.title('Stat Advantage Heatmap')
plt.xlim(0, 100)
plt.ylim(0, 100)
Out[21]:
Here, the effects of shot and possession advantages are quite a bit clearer. Another way of viewing these distributions would be a plot of their kernel density estimates. This gives a Gaussian-like distribution of results, and the custom 'factorkde' function allows two separate curves to be overlaid, representing wins and losses.
In [23]:
# One KDE curve of shot differential per game result.
sns.set_palette(octal_cat)
# DataFrame.sort was removed in pandas 0.20; sort_values is its replacement.
# It returns a new frame, so the jitter applied for smoothing never reaches `df`.
factorkde(stat='s_dif',
          factor='result',
          data=df.sort_values('result'),
          smooth=True,
          kernel='gau')
plt.title('Distribution of Shot Differentials by Result')
plt.xlabel('Shot Differential')
plt.ylabel('Fraction of Games')
Out[23]:
In [24]:
# One KDE curve of attack-time differential per game result.
sns.set_palette(octal_cat)
# DataFrame.sort was removed in pandas 0.20; sort_values is its replacement.
# It returns a new frame, so the jitter applied for smoothing never reaches `df`.
factorkde(stat='toa_dif',
          factor='result',
          data=df.sort_values('result'),
          smooth=True,
          kernel='gau')
plt.title('Distribution of Attack Time Differentials by Result')
plt.xlabel('Attack Time Differential')
plt.ylabel('Fraction of Games')
Out[24]:
2D KDE plots can be used to visualize the distribution of both shots and TOA simultaneously. This gives a heatmap of sorts that indicates the common stat differentials in wins and losses.
In [26]:
# Three 2-D density estimates of (shot differential, attack-time
# differential) drawn on one shared axes.
fig, ax = plt.subplots()

# All games: a faint, filled white layer (alpha tops out at .05).
sns.kdeplot(df['s_dif'],
            df['toa_dif'],
            ax=ax,
            shade=True,
            cmap=fade_map(max_a=.05))

# Wins only: unfilled contours in a blue fade.
sns.kdeplot(df.loc[df.result == 'win', 's_dif'],
            df.loc[df.result == 'win', 'toa_dif'],
            ax=ax,
            shade=False,
            cmap=fade_map(r=64, g=128, b=192, max_a=1),
            label='Wins')

# Losses only: unfilled contours in an orange fade.
sns.kdeplot(df.loc[df.result == 'loss', 's_dif'],
            df.loc[df.result == 'loss', 'toa_dif'],
            ax=ax,
            shade=False,
            cmap=fade_map(r=192, g=128, b=64, max_a=1),
            label='Losses')

plt.title('Distribution of Shot and TOA Differentials by Result')
plt.xlabel('Shot Differential')
plt.ylabel('Posession Time Differential (min)')
Out[26]:
Let's take a look at the raw numbers for different scenarios. Currently, my overall record in logged games is:
In [27]:
# Overall record across every logged game (no adjustment to the error term).
print(record_str(df, small=0))
And in games with shot and possession advantages:
In [28]:
# Records restricted to games with a strictly positive differential.
print('Shot Advantage: ', record_str(df.loc[df['s_dif'] > 0]))
print('Posession Advantage:', record_str(df.loc[df['toa_dif'] > 0]))
So it would seem that shots are a slightly better predictor of the result of the game, but the difference isn't statistically significant. The shot advantage does suggest an increase in the odds of winning over baseline.
What happens if we control both variables at the same time? (e.g. a shot advantage but not a possession advantage)
In [29]:
# Records for each combination of shot and attack-time advantage.
# Both comparisons are strict, so games with a differential of exactly zero
# in either stat fall into none of the four groups.
print('Shot and Posession Advantage:', record_str(df[(df.s_dif > 0) & (df.toa_dif > 0)], small=1))
print('Shot Advantage Only: ', record_str(df[(df.s_dif > 0) & (df.toa_dif < 0)], small=1))
print('Posession Advantage Only: ', record_str(df[(df.s_dif < 0) & (df.toa_dif > 0)], small=1))
print('Neither Advantage: ', record_str(df[(df.s_dif < 0) & (df.toa_dif < 0)], small=1))
From these results it definitely appears that a shot advantage is much more important than a possession time advantage. This would suggest that shooting from the outside and looking for rebounds fares better than playing keep-away and trying to set up high-percentage shots.
In [30]:
# Baseline record, then records split by the result of the previous row's game.
print('Overall: ', record_str(df, small=0))
print('Coming off win: ', record_str(df[df['previous_result'] == 'win'], small=1))
print('Coming off loss:', record_str(df[df['previous_result'] == 'loss'], small=1))
In [31]:
# Records after two identical results in a row (both previous rows match).
print('Coming off 2 wins: ',
      record_str(df[(df.previous_result == 'win') & (df.previous_result2 == 'win')], small=1))
print('Coming off 2 losses:',
      record_str(df[(df.previous_result == 'loss') & (df.previous_result2 == 'loss')], small=1))
In [32]:
# Records after three identical results in a row (all three previous rows match).
print('Coming off 3 wins: ',
      record_str(df[(df.previous_result == 'win')
                    & (df.previous_result2 == 'win')
                    & (df.previous_result3 == 'win')], small=1))
print('Coming off 3 losses:',
      record_str(df[(df.previous_result == 'loss')
                    & (df.previous_result2 == 'loss')
                    & (df.previous_result3 == 'loss')], small=1))
I find this very surprising. I definitely feel like my odds of winning are higher after a few consecutive wins, but that doesn't appear to be the case. This result agrees with the famous 'hot hand fallacy'.
Using the game's built in ranking system, we can look at the results as a function of the opponent's skill
In [58]:
# Mean of the 0/1 `win` column (i.e. win rate) per opponent-skill bin,
# with 80% confidence intervals.
# DataFrame.sort was removed in pandas 0.20; sort_values is its replacement.
grid = sns.barplot(x='opp_skill_quantile',
                   y='win',
                   data=df.sort_values('opplvl'),
                   estimator=np.mean,
                   ci=80,
                   palette=octal_seq)
There's definitely a large degree of variability, but against lesser opponents the probability of winning is significantly higher.
In [34]:
# Records split by whether the opponent's level is at least my own
# (lvl_dif <= 0) or below it (lvl_dif > 0).
print('Against Equal or Higher Rank:', record_str(df[df['lvl_dif'] <= 0], small=1))
print('Against Lower Rank: ', record_str(df[df['lvl_dif'] > 0], small=1))
So what do the stats look like as the opponent's skill varies?
In [55]:
# Mean shot differential per opponent-skill bin, with 80% confidence intervals.
# DataFrame.sort was removed in pandas 0.20; sort_values is its replacement.
sns.barplot(x='opp_skill_quantile',
            y='s_dif',
            data=df.sort_values('opplvl'),
            estimator=np.mean,
            ci=80,
            palette=octal_seq)
plt.xlabel('Opponent Skill')
plt.ylabel('Shot Differential')
Out[55]:
In [56]:
# Mean attack-time differential per opponent-skill bin, with 80% confidence intervals.
# DataFrame.sort was removed in pandas 0.20; sort_values is its replacement.
sns.barplot(x='opp_skill_quantile',
            y='toa_dif',
            data=df.sort_values('opplvl'),
            estimator=np.mean,
            ci=80,
            palette=octal_seq)
plt.xlabel('Opponent Skill')
plt.ylabel('Attack Time Differential')
Out[56]:
Other than the two extremes, the stats look pretty balanced in most games. Unsurprisingly, the stat advantages are in my favor against weaker opponents, and vice-versa for stronger ones.
In [57]:
# Share of games against each of the six most frequently faced teams.
grid = plot_counts(factor='oppteam',
                   data=df,
                   num_to_plot=6,
                   aspect=2.5,
                   palette=octal_seq)
grid.set_axis_labels('Team Faced', 'Percentage of Games Played')
Out[57]:
In [62]:
# Record against individual opponents, selected by the `oppteam` code.
print('Against Chicago: ', record_str(df.loc[df['oppteam'] == 'CHI']))
print('Against New York: ', record_str(df.loc[df['oppteam'] == 'NYR']))
print('Against Montreal: ', record_str(df.loc[df['oppteam'] == 'MTL']))
print('Against Pittsburgh: ', record_str(df.loc[df['oppteam'] == 'PIT']))
print('Against Boston: ', record_str(df.loc[df['oppteam'] == 'BOS']))
print('Against Washington: ', record_str(df.loc[df['oppteam'] == 'WSH']))
In [63]:
# Render the contents of the custom theme file as HTML in this cell's output.
# HTML is imported from its public location, IPython.display; importing it
# from IPython.core.display is deprecated.
from IPython.display import HTML

# The context manager closes the file; the bare open(...).read() left the
# handle open until garbage collection.
with open('./themes/custom.css', 'r') as css_file:
    styles = css_file.read()
HTML(styles)
Out[63]:
In [ ]: