In [1]:
import pandas as pd
from pylab import *
%matplotlib inline
import matplotlib.colors as mplc

from scrapenhl2.scrape import team_info, teams, players, autoupdate
from scrapenhl2.manipulate import manipulate as manip, add_onice_players as onice
# autoupdate.autoupdate()  # uncomment this to update data

The purpose of this script is to generate shot counts for skaters after faceoffs.

For example, CA after 5 and 10 seconds for Nicklas Backstrom after defensive-zone faceoff wins.


In [2]:
team = team_info.team_as_id('WSH')
season = 2017
pbp = teams.get_team_pbp(season, team)
toi = teams.get_team_toi(season, team)

This is how we'll approach the problem:

  • Filter TOI to 5v5 and go wide to long on players
  • Isolate faceoffs from the PBP
    • Get faceoff zones
    • Filter for 5v5
    • Get faceoff zones
  • Join faceoffs to TOI and code how many seconds after a draw that sec is
    • Join faceoffs, have an indicator of last faceoff
    • Fill forward on last faceoff time
    • Calculate difference between time and last faceoff time; filter for time <= 15
  • Isolate shots from the PBP
    • Join to TOI
    • Filter for 5v5
    • Join to first dataframe on game, player, and time
    • Group by player, time since draw, draw zone and type, and which team took the shot, and sum
  • Repeat for each team (not done here)

5v5 TOI


In [3]:
# Filter to 5v5
toi = manip.filter_for_five_on_five(toi) \
    [['Game', 'Team1', 'Team2', 'Team3', 'Team4', 'Team5', 'Time']] \
    .drop_duplicates()  # sometimes these appear
toi = toi.melt(id_vars=['Time', 'Game'], value_name='PlayerID') \
    .drop('variable', axis=1)
toi.head()


Out[3]:
Time Game PlayerID
0 0 20008 8475200.0
1 1 20008 8475200.0
2 2 20008 8475200.0
3 3 20008 8475200.0
4 4 20008 8475200.0

In [4]:
# Get faceoffs
draws = manip.filter_for_event_types(pbp, 'Faceoff')

# Select only needed columns
draws = draws[['Game', 'Team', 'Period', 'MinSec', 'X', 'Y']]

# Convert period and mm:ss to time elapsed in game
draws = onice.add_times_to_file(draws, periodcol='Period', timecol='MinSec', time_format='elapsed')
draws = draws.drop({'Period', 'MinSec'}, axis=1).assign(Season=season)

# Get zones
directions = manip.get_directions_for_xy_for_season(season, team)
draws = manip.infer_zones_for_faceoffs(draws, directions, 'X', 'Y', '_Secs', focus_team='WSH')
draws = draws.drop({'X', 'Y'}, axis=1)

# Simplify zone notation to just N, O, D
draws.loc[:, 'Zone'] = draws.EventLoc.str.slice(0, 1)
draws = draws.drop('EventLoc', axis=1)

# Combine with team column to change N, D, O, to NW, NL, DW, DL, OW, OL
draws.loc[:, 'WL'] = draws.Team.apply(lambda x: 'W' if x == team else 'L')
draws.loc[:, 'ZS'] = draws.Zone + draws.WL
draws = draws.drop({'WL', 'Zone', 'Team'}, axis=1)

draws.head()


Out[4]:
Game _Secs Season ZS
0 20008 1 2017 NL
1 20008 330 2017 OL
2 20008 368 2017 DW
3 20008 386 2017 NW
4 20008 441 2017 DL

In [5]:
# Join to TOI
draws_joined = draws.rename(columns={'_Secs': 'Time'}) \
    .merge(toi, how='right', on=['Game', 'Time'])
    
# Add last faceoff indicator and fill forward
draws_joined = draws_joined.sort_values(['Game', 'PlayerID', 'Time'])
draws_joined.loc[pd.notnull(draws_joined.ZS), 'LastDraw'] = draws_joined.Time
draws_joined.loc[:, 'LastDraw'] = draws_joined[['Game', 'PlayerID', 'LastDraw']] \
    .groupby(['Game', 'PlayerID']).ffill()
draws_joined.loc[:, 'ZS'] = draws_joined[['Game', 'PlayerID', 'ZS']] \
    .groupby(['Game', 'PlayerID']).ffill()

draws_joined.loc[:, 'TimeSinceLastDraw'] = draws_joined.Time - draws_joined.LastDraw

draws_joined = draws_joined[pd.notnull(draws_joined.TimeSinceLastDraw)] \
    .query("TimeSinceLastDraw > 0")

draws_joined.head()


Out[5]:
Game Time Season ZS PlayerID LastDraw TimeSinceLastDraw
9410 20008 369 NaN DW 8468498.0 368.0 1.0
9415 20008 370 NaN DW 8468498.0 368.0 2.0
9420 20008 371 NaN DW 8468498.0 368.0 3.0
9425 20008 372 NaN DW 8468498.0 368.0 4.0
9430 20008 373 NaN DW 8468498.0 368.0 5.0

Shot counts


In [6]:
# Get shot attempts
cfca = manip.filter_for_corsi(pbp)

# Select only needed columns
cfca = cfca[['Game', 'Team', 'Period', 'MinSec']]

# Convert period and mm:ss to time elapsed in game
cfca = onice.add_times_to_file(cfca, periodcol='Period', timecol='MinSec', time_format='elapsed')
cfca = cfca.drop({'Period', 'MinSec'}, axis=1).rename(columns={'_Secs': 'Time'})

# Add on-ice players
cfca = cfca.merge(toi, how='left', on=['Game', 'Time'])

# Change Team to CF or CA
cfca.loc[:, 'Team'] = cfca.Team.apply(lambda x: 'CF' if x == team else 'CA')

cfca.head()


Out[6]:
Game Team Time PlayerID
0 20008 CF 46 8476344.0
1 20008 CF 46 8475758.0
2 20008 CF 46 8474291.0
3 20008 CF 46 8468498.0
4 20008 CF 46 8474590.0

In [7]:
# Join to faceoffs df
joined = draws_joined.merge(cfca, how='left', on=['Game', 'Time', 'PlayerID'])

# Get counts of time after each draw
time_counts = joined[['ZS', 'PlayerID', 'TimeSinceLastDraw']] \
    .assign(TOI=1) \
    .groupby(['ZS', 'PlayerID', 'TimeSinceLastDraw'], as_index=False) \
    .count()

# Get counts of shots
shot_counts = joined[['ZS', 'PlayerID', 'TimeSinceLastDraw', 'Team']] \
    .dropna() \
    .assign(Count=1) \
    .groupby(['ZS', 'PlayerID', 'TimeSinceLastDraw', 'Team'], as_index=False) \
    .count() \
    .pivot_table(index=['ZS', 'PlayerID', 'TimeSinceLastDraw'], columns='Team', values='Count') \
    .reset_index()
shot_counts.loc[:, 'CA'] = shot_counts.CA.fillna(0)
shot_counts.loc[:, 'CF'] = shot_counts.CF.fillna(0)

alljoined = time_counts \
    .merge(shot_counts, how='left', on=['ZS', 'PlayerID', 'TimeSinceLastDraw']) \
    .fillna(0)

alljoined.head()


Out[7]:
ZS PlayerID TimeSinceLastDraw TOI CA CF
0 DL 8468498.0 1.0 87 0.0 0.0
1 DL 8468498.0 2.0 87 1.0 0.0
2 DL 8468498.0 3.0 85 3.0 0.0
3 DL 8468498.0 4.0 83 1.0 0.0
4 DL 8468498.0 5.0 84 7.0 0.0

Finally, we can replace Player IDs with names, export, and graph.


In [8]:
alljoined.loc[:, 'Player'] = players.playerlst_as_str(alljoined.PlayerID)
alljoined = alljoined.drop('PlayerID', axis=1)
alljoined.loc[:, 'CF60'] = alljoined.CF * 3600 / alljoined.TOI
alljoined.loc[:, 'CA60'] = alljoined.CA * 3600 / alljoined.TOI
alljoined.to_csv('time_since_last_draw_data.csv', index=False)
alljoined.head()


Out[8]:
ZS TimeSinceLastDraw TOI CA CF Player CF60 CA60
0 DL 1.0 87 0.0 0.0 Brooks Orpik 0.0 0.000000
1 DL 2.0 87 1.0 0.0 Brooks Orpik 0.0 41.379310
2 DL 3.0 85 3.0 0.0 Brooks Orpik 0.0 127.058824
3 DL 4.0 83 1.0 0.0 Brooks Orpik 0.0 43.373494
4 DL 5.0 84 7.0 0.0 Brooks Orpik 0.0 300.000000

In [9]:
comp_players = ('Nicklas Backstrom', 'Evgeny Kuznetsov', 'Lars Eller', 'Jay Beagle')

def plot_cumulative_shot_lines(df, zone, metric, ax, *comp_players):
    for p in comp_players:
        df1 = df.query('ZS == "{0:s}" & Player == "{1:s}"'.format(zone, p)).sort_values('TimeSinceLastDraw')
        df1.loc[:, 'TOI'] = df1.TOI.cumsum()
        df1.loc[:, metric] = df1[metric].cumsum()
        df1.loc[:, '{0:s}60'.format(metric)] = df1[metric] * 3600 / df1.TOI
        ax.plot(df1.TimeSinceLastDraw, df1['{0:s}60'.format(metric)], label=p)
    ax.set_title('Cumulative {0:s}60 after {1:s}'.format(metric, zone))

def plot_comparison(df, metric, *comp_players):
    fig, axes = subplots(2, 3, sharex=True, sharey=True, figsize=[12, 8])
    axes = axes.flatten()
    df2 = df.query('TimeSinceLastDraw <= 15')
    for i, zone in enumerate(['OW', 'OL', 'NW', 'NL', 'DW', 'DL']):
        if metric == 'N':
            plot_cumulative_ns(df2, zone, axes[i], *comp_players)
        else:
            plot_cumulative_shot_lines(df2, zone, metric, axes[i], *comp_players)
        legend(loc=1)
        
plot_comparison(alljoined, 'CF', *comp_players)


/Users/muneebalam/anaconda/lib/python3.6/site-packages/matplotlib/axes/_axes.py:545: UserWarning: No labelled objects found. Use label='...' kwarg on individual plots.
  warnings.warn("No labelled objects found. "

In [10]:
plot_comparison(alljoined, 'CA', *comp_players)


/Users/muneebalam/anaconda/lib/python3.6/site-packages/matplotlib/axes/_axes.py:545: UserWarning: No labelled objects found. Use label='...' kwarg on individual plots.
  warnings.warn("No labelled objects found. "

In [11]:
def plot_cumulative_ns(df, zone, ax, *comp_players):
    for p in comp_players:
        df1 = df.query('ZS == "{0:s}" & Player == "{1:s}"'.format(zone, p)).sort_values('TimeSinceLastDraw')
        ax.plot(df1.TimeSinceLastDraw, df1.TOI, label=p)
    ax.set_title('Post-faceoff time in {0:s}'.format(zone))
    
plot_comparison(alljoined, 'N', *comp_players)


/Users/muneebalam/anaconda/lib/python3.6/site-packages/matplotlib/axes/_axes.py:545: UserWarning: No labelled objects found. Use label='...' kwarg on individual plots.
  warnings.warn("No labelled objects found. "

It also might be interesting to take a look at what fraction of a player's 5v5 TOI is accounted for by the time after faceoffs.


In [12]:
# Find shift starts
shifts = toi.sort_values(['Game', 'PlayerID', 'Time'])

# If shift yields diff time in same game and player, then it's a shift start
shifts.loc[:, 'PrevT'] = shifts.Time.shift(1)
shifts.loc[:, 'PrevP'] = shifts.PlayerID.shift(1)
shifts.loc[:, 'PrevG'] = shifts.Game.shift(1)
shifts.loc[(shifts.PlayerID == shifts.PrevP) & (shifts.Game == shifts.PrevG) & (shifts.Time != shifts.PrevT + 1), 
           'ShiftIndex'] = 1
shifts.loc[:, 'ShiftIndex'] = shifts.ShiftIndex.fillna(0)
shifts.loc[:, 'ShiftIndex'] = shifts.ShiftIndex.cumsum()  

shifts = shifts.drop({'PrevT', 'PrevP', 'PrevG'}, axis=1)

shifts.head()


Out[12]:
Time Game PlayerID ShiftIndex
281384 41 20008 8468498.0 0.0
281385 42 20008 8468498.0 0.0
281386 43 20008 8468498.0 0.0
281387 44 20008 8468498.0 0.0
281388 45 20008 8468498.0 0.0

In [13]:
# Calculate amount of time at each point in shift
starttimes = shifts[['Time', 'ShiftIndex', 'PlayerID']] \
    .groupby(['ShiftIndex', 'PlayerID'], as_index=False).min() \
    .rename(columns={'Time': 'StartTime'})

shifts2 = shifts.merge(starttimes, how='left', on=['ShiftIndex', 'PlayerID'])
shifts2.loc[:, 'TimeSinceLastDraw'] = shifts2.Time - shifts2.StartTime + 1
counts = shifts2[['TimeSinceLastDraw', 'PlayerID']] \
    .assign(TOI=1) \
    .groupby(['TimeSinceLastDraw', 'PlayerID'], as_index=False) \
    .count() \
    .assign(ZS='Other')
    
counts.loc[:, 'Player'] = players.playerlst_as_str(counts.PlayerID)
counts = counts.drop('PlayerID', axis=1)
counts.head()


Out[13]:
TimeSinceLastDraw TOI ZS Player
0 1 725 Other Brooks Orpik
1 1 589 Other Alex Ovechkin
2 1 501 Other T.J. Oshie
3 1 452 Other Matt Niskanen
4 1 323 Other Taylor Chorney

In [14]:
# Add to original
alljoined2 = pd.concat([alljoined[['Player', 'ZS', 'TimeSinceLastDraw', 'TOI']], counts])
alljoined2 = alljoined2.sort_values(['Player', 'ZS', 'TimeSinceLastDraw'])

# Convert to percentages
totals = alljoined2.drop('ZS', axis=1) \
    .groupby(['Player', 'TimeSinceLastDraw'], as_index=False) \
    .sum() \
    .rename(columns={'TOI': 'TotalTOI'})
alljoined3 = alljoined2.merge(totals, how='left', on=['Player', 'TimeSinceLastDraw'])
alljoined3.loc[:, 'TOI'] = alljoined3.TOI / alljoined3.TotalTOI
alljoined3.head()


Out[14]:
Player TOI TimeSinceLastDraw ZS TotalTOI
0 Aaron Ness 0.051948 1.0 DL 231
1 Aaron Ness 0.052174 2.0 DL 230
2 Aaron Ness 0.052174 3.0 DL 230
3 Aaron Ness 0.052632 4.0 DL 228
4 Aaron Ness 0.052863 5.0 DL 227

In [15]:
def plot_stacked_area(df, p, ax, limit=20):
    zones = ['OW', 'OL', 'NW', 'NL', 'DW', 'DL'][::-1]  # for DZ at bottom
    
    # Set colors
    color_cycle = plt.rcParams['axes.prop_cycle'].by_key()['color']
    colors = [mplc.to_rgba(color_cycle[0], alpha=0.5), color_cycle[0], 
             mplc.to_rgba(color_cycle[1], alpha=0.5), color_cycle[1], 
             mplc.to_rgba(color_cycle[2], alpha=0.5), color_cycle[2]]
    
    
    struct = pd.DataFrame({'TimeSinceLastDraw': range(1, limit+1, 1)}).assign(Player=p)
    struct = struct.merge(df[['Player', 'TimeSinceLastDraw', 'ZS', 'TOI']], 
                          how='left', on=['Player', 'TimeSinceLastDraw'])
    struct = struct.pivot_table(index=['Player', 'TimeSinceLastDraw'], columns='ZS', values='TOI').reset_index()
    struct = struct.fillna(method='ffill')
    ax.stackplot(struct.TimeSinceLastDraw, [struct[zone] for zone in zones], labels=zones, colors=colors)
    ax.set_title('TOI into shift by 5v5 shift\nstart for {0:s}'.format(p))
    ax.set_xlabel('Seconds into shift')
    legend(loc=2)

def plot_tois(df, *comp_players):
    fig, axes = subplots(1, 4, sharex=True, sharey=True, figsize=[12, 4])
    axes = axes.flatten()
    for i, p in enumerate(comp_players):
        plot_stacked_area(df, p, axes[i])
    legend(loc=2, bbox_to_anchor=(1, 1))
    axes[0].set_ylabel('% of 5v5 shifts')

plot_tois(alljoined3, *comp_players)


/Users/muneebalam/anaconda/lib/python3.6/site-packages/matplotlib/axes/_axes.py:545: UserWarning: No labelled objects found. Use label='...' kwarg on individual plots.
  warnings.warn("No labelled objects found. "