Analysis of NHL 15 Online Versus Data

Notebook setup


In [3]:
## import statements
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline


/usr/lib/python3.4/site-packages/matplotlib/backends/backend_gtk3agg.py:18: UserWarning: The Gtk3Agg backend is known to not work on Python 3.x with pycairo. Try installing cairocffi.
  "The Gtk3Agg backend is known to not work on Python 3.x with pycairo. "

In [5]:
## set up plotting aesthetics
sns.set(rc={'axes.facecolor'   : '#202020', 
            'axes.labelcolor'  : '#e0e0e0',
            'axes.edgecolor'   : '#e0e0e0',
            'axes.grid'        :  False,
            'text.color'       : '#e0e0e0', 
            'figure.facecolor' : '#202020',
            'figure.edgecolor' : '#202020',
            'xtick.color'      : '#e0e0e0',
            'ytick.color'      : '#e0e0e0',
            'legend.fancybox'  : 'true'
           })

octal_seq = [sns.hls_palette(12, 0, .5, .5)[i] for i in [0,1,2,4,7,9]]
octal_cat = [sns.hls_palette(12, 0, .5, .5)[i] for i in [7,1,4,0,9,2]]
octal_unity = [sns.hls_palette(12, 0, .5, .5)[i] for i in [4]]

#sns.palplot(octal_seq)
#sns.palplot(octal_cat)

Custom function definitions


In [6]:
# returns a colormap for an rgb color fading to full transparency
def fade_map(r=255,g=255,b=255,max_a=1,N=16):
    
    N = 16
    r_list = r*np.ones(N)/255
    g_list = g*np.ones(N)/255
    b_list = b*np.ones(N)/255
    a_list = np.linspace(0, max_a, N)
    C = np.transpose([r_list,g_list,b_list,a_list])
    
    return mpl.colors.ListedColormap(C)


# simulate continuity in an integers by adding random value between +/- 1/2
def jitter_int(x):
    
    # rand decimal on (-0.5, +0.5)
    j = np.random.random_sample()-0.5
    
    return (float(x) + j)


# perform kde plot for different factor values
def factorkde(stat, factor, data, smooth=False, **kwargs):
    # create buffer not to change existing data
    temp_df = data
    
    # smooth kde by adding jitter
    if smooth:
        temp_df[stat] = temp_df[stat].apply(jitter_int)
    
    # get list of all possible factors
    factor_list = list(set(temp_df[factor].values))
    
    # separate distributions for each factor
    dists = [(temp_df.loc[data[factor]==f, stat], f) for f in factor_list]
    
    # plot kde's for each factor
    for dist in dists:
        sns.kdeplot(dist[0], label=dist[1], **kwargs)
        
        
def plot_counts(factor,
                data, 
                num_to_plot = -1, 
                **kwargs):
    
    factor_list = data[factor].value_counts().index.values
    
    df = pd.DataFrame({factor  : factor_list, 
                       'count' : data[factor].value_counts().values})

    total = df['count'].sum()
    df['pct'] = 100*df['count']/total

    if num_to_plot == -1:
        num_to_plot = len(factor_list)

    return sns.factorplot(x    = factor, 
                          y    = 'pct',
                          data =  df[:num_to_plot],
                          kind = 'bar',
                          ci   =  0,
                          estimator = sum,
                          **kwargs)

        
def get_record(df, small=0):
    
    wins = df['result'].value_counts()['win']
    losses = df['result'].value_counts()['loss']
    total = wins+losses
    prob = wins / total
    err = np.sqrt(prob*(1-prob) / (total+small))
    
    return(wins, losses, prob, err)


def record_str(df, small=0):
    w, l, p, e = get_record(df, small)
    p *= 100
    e *= 100
    return('{0}-{1} ({2:.0f} ± {3:.0f}%)'.format(w, l, p, e))

Data munging


In [9]:
# read data
df = pd.read_csv('./data/nhl15_gamelog.csv')
#df.info()

In [10]:
## transform data

df['count'] = 1

# get stat differentials
df['g_dif'] = df.gf - df.ga
df['s_dif'] = df.sf - df.sa
df['toa_dif'] = df.toaf - df.toaa
df['hit_dif'] = df.hitf - df.hita

df['lvl_dif'] = df.lvl - df.opplvl
df['opp_skill_quantile'] = pd.qcut(df.opplvl, 5, labels=['Lowest', 'Lower', 'Average', 'Higher', 'Highest']) 

# get stat percentages
df['pct_g'] = 100 * df.gf / (df.gf + df.ga)
df['pct_s'] = 100 * df.sf / (df.sf + df.sa)
df['pct_toa'] = 100 * df.toaf / (df.toaf + df.toaa)
df['pct_offense'] = (df.pct_s + df.pct_toa)/2

# categorize wins vs losses
df['result'] = df['g_dif'].apply(lambda l: 'win' if l > 0 else 'loss')
df['win'] = df['g_dif'].apply(lambda l: 1 if l > 0 else 0)
df['more_shots'] = df['s_dif'].apply(lambda l: 1 if l > 0 else 0)
df['more_toa'] = df['toa_dif'].apply(lambda l: 1 if l > 0 else 0)

df['quadrant'] = 10 * df.more_toa + df.more_shots

df['quadrant'] = df['quadrant'].apply(lambda l: 
                                      'both'  if l == 11 
                                      else
                                      'poss' if l == 10
                                      else
                                      'shots'  if l == 1
                                      else
                                      'neither'
                                     )
df['previous_result'] = df['result'].shift(1)
df['previous_result2'] = df['result'].shift(2)
df['previous_result3'] = df['result'].shift(3)

Visualization

Posession and Shot Advantages


In [12]:
sns.lmplot(x        = 's_dif', 
           x_jitter =  0.5,
           y        = 'toa_dif', 
           hue      = 'result', 
           fit_reg  =  False, 
           data     =  df.sort('g_dif', ascending=False),
           palette  =  octal_cat,
           aspect   =  1)


Out[12]:
<seaborn.axisgrid.FacetGrid at 0x7f774d3f6198>

This gives a basic overview of the raw stat distributions in wins and losses. It is clear the majority of wins (blue) occur in the first quadrant, where there is an advantage in both shots and time of posession.

We can take this a step further and use a color gradient for goal differential instead of just wins/losses. This should highlight big wins and losses compared to close decisions.


In [17]:
temp_df = df.rename(columns={'g_dif': 'Goal Differential'}) 
grid = sns.lmplot(x        = 's_dif', 
                 x_jitter =  0.5,
                 y        = 'toa_dif', 
                 hue      = 'Goal Differential', 
                 fit_reg  =  False,
                 data     =  temp_df.sort('Goal Differential', ascending=False),
                 palette  =  sns.diverging_palette(240, 30, l=50, s=90, n=12, sep=1, center='dark'),
                 scatter_kws={"s": 40}
                 )
grid.set_axis_labels('Shot Difference', 'Attack Time Difference (min)')
plt.title('Stat Advantage Heatmap')
#fig.legend('Goal Differential')


Out[17]:
<matplotlib.text.Text at 0x7f774d515160>

A similar plot can be produced using the percentage of shots and posession instead of the absolute differences


In [21]:
temp_df = df.rename(columns={'g_dif': 'Goal Differential'}) 
grid = sns.lmplot(x        = 'pct_s', 
                 x_jitter =  0.5,
                 y        = 'pct_toa', 
                 hue      = 'Goal Differential', 
                 fit_reg  =  False,
                 data     =  temp_df.sort('Goal Differential', ascending=False),
                 palette  =  sns.diverging_palette(240, 30, l=50, s=90, n=12, sep=1, center='dark'),
                 scatter_kws={"s": 40}
                 )
grid.set_axis_labels('Percent of Shots', 'Percent of Attack Time')
plt.title('Stat Advantage Heatmap')
plt.xlim(0,100)
plt.ylim(0,100)
#fig.legend('Goal Differential')


Out[21]:
(0, 100)

Here, the effect of shot and posession advantages are quite a bit clearer. Another way of viewing these distributions would be a plot of their kernel density estimates. This gives a gaussian-like distribution of results, and the custom 'factorkde' function allows two separate curves to be overlayed representing wins and losses.


In [23]:
sns.set_palette(octal_cat)
factorkde(stat='s_dif', factor='result', data=df.sort('result'), smooth=True, kernel='gau')
plt.title('Distribution of Shot Differentials by Result')
plt.xlabel('Shot Differential')
plt.ylabel('Fraction of Games')


Out[23]:
<matplotlib.text.Text at 0x7f774ce4e2b0>

In [24]:
sns.set_palette(octal_cat)
factorkde(stat='toa_dif', factor='result', data=df.sort('result'), smooth=True, kernel='gau')
plt.title('Distribution of Attack Time Differentials by Result')
plt.xlabel('Attack Time Differential')
plt.ylabel('Fraction of Games')


Out[24]:
<matplotlib.text.Text at 0x7f774cdf98d0>

2D KDE plots can be used to visualize the distribution of both shots and TOA simultaneously. This gives a heatmap of sorts which indicate the common stat differentials in wins and losses.


In [26]:
fig, ax = plt.subplots()
sns.kdeplot(df['s_dif'],
            df['toa_dif'],
            cmap=fade_map(max_a=.05),
            shade=True,
            ax=ax)

sns.kdeplot(df[df.result=='win']['s_dif'], 
            df[df.result=='win']['toa_dif'],
            cmap=fade_map(64,128,192,1),
            shade=False,
            ax=ax,
            label='Wins')

sns.kdeplot(df[df.result=='loss']['s_dif'],
            df[df.result=='loss']['toa_dif'],
            cmap=fade_map(192,128,64,1),
            shade=False,
            ax=ax,
            label='Losses')
plt.title('Distribution of Shot and TOA Differentials by Result')
plt.xlabel('Shot Differential')
plt.ylabel('Posession Time Differential (min)')


Out[26]:
<matplotlib.text.Text at 0x7f774ccd0cf8>

Lets take a look at the raw numbers for different scenarios. Currently, my overall record in logged games is:


In [27]:
print(record_str(df, 0))


115-92 (56 ± 3%)

And in games with shot and posession advantages:


In [28]:
print('Shot Advantage:     ', record_str(df[df.s_dif>0]))
print('Posession Advantage:', record_str(df[df.toa_dif>0]))


Shot Advantage:      71-41 (63 ± 5%)
Posession Advantage: 74-51 (59 ± 4%)

So it would seem that shots are a slightly better predictor of the result of the game, but it isn't statistically significant. The shot advantage does suggest in increase in the odds of winning over baseline.

What happens if we control both variables at the same time? (i.e. Shot but not Posession advantage)


In [29]:
print('Shot and Posession Advantage:', record_str(df.loc[df.s_dif>0].loc[df.toa_dif>0], 1))
print('Shot Advantage Only:         ', record_str(df.loc[df.s_dif>0].loc[df.toa_dif<0], 1))
print('Posession Advantage Only:    ', record_str(df.loc[df.s_dif<0].loc[df.toa_dif>0], 1))
print('Neither Advantage:           ', record_str(df.loc[df.s_dif<0].loc[df.toa_dif<0], 1))


Shot and Posession Advantage: 53-32 (62 ± 5%)
Shot Advantage Only:          18-9 (67 ± 9%)
Posession Advantage Only:     15-17 (47 ± 9%)
Neither Advantage:            21-30 (41 ± 7%)

From these results it definitely appears that a shot advantage is much more important than a posession time advantage. This would suggest that shooting from the outside and looking for rebounds fares better than playing keep-away and trying to set up high percentage shots.

Can you get on a hot streak?


In [30]:
print('Overall:        ', record_str(df, 0))
print('Coming off win: ', record_str(df.loc[df.previous_result=='win'], 1))
print('Coming off loss:', record_str(df.loc[df.previous_result=='loss'], 1))


Overall:         115-92 (56 ± 3%)
Coming off win:  65-50 (57 ± 5%)
Coming off loss: 49-42 (54 ± 5%)

In [31]:
print('Coming off 2 wins:  ', record_str(df.loc[df.previous_result=='win'].loc[df.previous_result2=='win'], 1))
print('Coming off 2 losses:', record_str(df.loc[df.previous_result=='loss'].loc[df.previous_result2=='loss'], 1))


Coming off 2 wins:   36-29 (55 ± 6%)
Coming off 2 losses: 26-15 (63 ± 7%)

In [32]:
print('Coming off 3 wins:  ', record_str(df.loc[df.previous_result=='win'].loc[df.previous_result2=='win'].loc[df.previous_result3=='win'], 1))
print('Coming off 3 losses:', record_str(df.loc[df.previous_result=='loss'].loc[df.previous_result2=='loss'].loc[df.previous_result3=='loss'], 1))


Coming off 3 wins:   20-16 (56 ± 8%)
Coming off 3 losses: 7-8 (47 ± 12%)

I find this very surprising. I definitely feel like my odds of winning are higher after a few consecutive wins, but that doesn't appear to be the case. This is result agrees with the famous 'hot hand fallacy'.

Opponent's Skill Level

Using the game's built in ranking system, we can look at the results as a function of the opponent's skill


In [58]:
grid = sns.barplot(x         = 'opp_skill_quantile', 
                   y         = 'win',
                   data      =  df.sort('opplvl'),
                   estimator =  np.mean,
                   ci        =  80,
                   palette   =  octal_seq)


There's definitely a large degree of variability, but against lesser opponents the probability of winning is significantly higher.


In [34]:
print('Against Equal or Higher Rank:', record_str(df.loc[df.lvl_dif<=0], 1))
print('Against Lower Rank:          ', record_str(df.loc[df.lvl_dif>0], 1))


Against Equal or Higher Rank: 32-33 (49 ± 6%)
Against Lower Rank:           42-19 (69 ± 6%)

So what do the stats look like as the oppent skill varies?


In [55]:
sns.barplot(x         = 'opp_skill_quantile', 
            y         = 's_dif',
            data      =  df.sort('opplvl'),
            estimator =  np.mean,
            ci        =  80,
            palette   =  octal_seq)
plt.xlabel('Opponent Skill')
plt.ylabel('Shot Differential')


Out[55]:
<matplotlib.text.Text at 0x7f7746a74470>

In [56]:
sns.barplot(x         = 'opp_skill_quantile', 
            y         = 'toa_dif',
            data      =  df.sort('opplvl'),
            estimator =  np.mean,
            ci        =  80,
            palette   =  octal_seq)

plt.xlabel('Opponent Skill')
plt.ylabel('Attack Time Differential')


Out[56]:
<matplotlib.text.Text at 0x7f77469ef9e8>

Other than the two extremes, the stats look pretty balanced in most games. Unsurprisingly, the stat advantages are in my favor against weaker opponents, and vice-versa for stronger ones.

Most Common Teams Faced


In [57]:
grid = plot_counts('oppteam', df, 6, aspect=2.5, palette=octal_seq)
grid.set_axis_labels('Team Faced', 'Percentage of Games Played')


Out[57]:
<seaborn.axisgrid.FacetGrid at 0x7f774695c8d0>

In [62]:
print('Against Chicago:    ', record_str(df[df.oppteam=='CHI']))
print('Against New York:   ', record_str(df[df.oppteam=='NYR']))
print('Against Montreal:   ', record_str(df[df.oppteam=='MTL']))
print('Against Pittsburgh: ', record_str(df[df.oppteam=='PIT']))
print('Against Boston:     ', record_str(df[df.oppteam=='BOS']))
print('Against Washington: ', record_str(df[df.oppteam=='WSH']))


Against Chicago:     14-14 (50 ± 9%)
Against New York:    9-8 (53 ± 12%)
Against Montreal:    9-4 (69 ± 13%)
Against Pittsburgh:  7-4 (64 ± 15%)
Against Boston:      5-3 (62 ± 17%)
Against Washington:  5-2 (71 ± 17%)

In [63]:
from IPython.core.display import HTML
styles = open('./themes/custom.css', 'r').read()
HTML(styles)


Out[63]:

In [ ]: