In [1]:
import pandas as pd
from pylab import *
%matplotlib inline
import matplotlib.colors as mplc
from scrapenhl2.scrape import team_info, teams, players, autoupdate
from scrapenhl2.manipulate import manipulate as manip, add_onice_players as onice
# autoupdate.autoupdate() # uncomment this to update data
The purpose of this notebook is to generate shot-attempt counts for skaters in the seconds after faceoffs.
For example: shot attempts against (CA) in the first 5 and 10 seconds after defensive-zone faceoff wins with Nicklas Backstrom on the ice.
In [2]:
# Analysis configuration: the season and the club (converted to its ID) to pull data for
season = 2017
team = team_info.team_as_id('WSH')
# Team-level frames returned by get_team_pbp / get_team_toi for that season
pbp, toi = teams.get_team_pbp(season, team), teams.get_team_toi(season, team)
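This is how we'll approach the problem:
1. Filter the time-on-ice data to 5v5 and reshape it to one row per game, second, and player.
2. Pull the faceoffs and label each with its zone and whether the team won or lost it.
3. Join the faceoffs to the time-on-ice rows and fill forward to get each player's time since the last draw.
4. Pull the shot attempts and attach the on-ice players.
5. Count on-ice seconds and shot attempts by faceoff type, player, and time since the draw.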
In [3]:
# Keep only 5v5 rows, then the game, the Team1-Team5 columns and the time
toi = manip.filter_for_five_on_five(toi)
toi = toi[['Game'] + ['Team{0:d}'.format(slot) for slot in range(1, 6)] + ['Time']]
toi = toi.drop_duplicates()  # repeated rows occasionally show up
# Wide -> long: one row per (Time, Game, PlayerID); the melted slot label is not needed
toi = toi.melt(id_vars=['Time', 'Game'], value_name='PlayerID')
toi = toi.drop(['variable'], axis=1)
toi.head()
Out[3]:
In [4]:
# Faceoff events only, trimmed to the columns used below
draws = manip.filter_for_event_types(pbp, 'Faceoff')
draws = draws[['Game', 'Team', 'Period', 'MinSec', 'X', 'Y']]
# Period + mm:ss -> elapsed game time (the '_Secs' column referenced below)
draws = onice.add_times_to_file(draws, periodcol='Period', timecol='MinSec', time_format='elapsed')
draws = draws.drop(['Period', 'MinSec'], axis=1).assign(Season=season)
# Infer the zone of every draw from its coordinates
# NOTE(review): focus_team is hard-coded to 'WSH' while the rest of the notebook uses `team`;
# keep the two in sync if the team is changed.
directions = manip.get_directions_for_xy_for_season(season, team)
draws = manip.infer_zones_for_faceoffs(draws, directions, 'X', 'Y', '_Secs', focus_team='WSH')
draws = draws.drop(['X', 'Y'], axis=1)
# Reduce the zone label to its first letter (N, O, D)
draws = draws.assign(Zone=draws.EventLoc.str.slice(0, 1)).drop('EventLoc', axis=1)
# Append W/L (did `team` win the draw?) to get NW, NL, DW, DL, OW, OL
draws = draws.assign(WL=draws.Team.apply(lambda x: 'W' if x == team else 'L'))
draws = draws.assign(ZS=draws.Zone + draws.WL)
draws = draws.drop(['WL', 'Zone', 'Team'], axis=1)
draws.head()
Out[4]:
In [5]:
# Attach faceoffs to the TOI rows; the right join keeps every TOI row
draws_joined = draws.rename(columns={'_Secs': 'Time'})
draws_joined = draws_joined.merge(toi, how='right', on=['Game', 'Time'])
draws_joined = draws_joined.sort_values(['Game', 'PlayerID', 'Time'])
# Stamp faceoff rows with their own time ...
draws_joined.loc[pd.notnull(draws_joined.ZS), 'LastDraw'] = draws_joined.Time
# ... then carry the draw time and the zone/result label forward within each game and player
draws_joined = draws_joined.assign(
    LastDraw=draws_joined.groupby(['Game', 'PlayerID'])['LastDraw'].ffill())
draws_joined = draws_joined.assign(
    ZS=draws_joined.groupby(['Game', 'PlayerID'])['ZS'].ffill())
draws_joined = draws_joined.assign(TimeSinceLastDraw=draws_joined.Time - draws_joined.LastDraw)
# Drop rows with no earlier draw to refer to, and the draw second itself
draws_joined = draws_joined[draws_joined.TimeSinceLastDraw.notnull()]
draws_joined = draws_joined.query("TimeSinceLastDraw > 0")
draws_joined.head()
Out[5]:
In [6]:
# Get shot attempts (the events kept by filter_for_corsi), with only the needed columns
cfca = manip.filter_for_corsi(pbp)
cfca = cfca[['Game', 'Team', 'Period', 'MinSec']]
# Convert period and mm:ss to time elapsed in game
cfca = onice.add_times_to_file(cfca, periodcol='Period', timecol='MinSec', time_format='elapsed')
cfca = cfca.drop(['Period', 'MinSec'], axis=1).rename(columns={'_Secs': 'Time'})
# Add on-ice players: one row per (shot attempt, on-ice player) after this join
cfca = cfca.merge(toi, how='left', on=['Game', 'Time'])
# Change Team to CF (attempt by `team`) or CA (attempt by the opponent).
# The column is replaced outright rather than set through .loc[:, 'Team']: writing string
# labels into the existing team-ID column in place is an incompatible-dtype set, which
# recent pandas versions deprecate.
cfca['Team'] = cfca.Team.apply(lambda x: 'CF' if x == team else 'CA')
cfca.head()
Out[6]:
In [7]:
# Join shot attempts onto the post-faceoff on-ice rows
joined = draws_joined.merge(cfca, how='left', on=['Game', 'Time', 'PlayerID'])
# Get counts of time after each draw.
# Counted from draws_joined rather than `joined`: the shot merge repeats a row once per
# shot attempt recorded in that second, which would inflate TOI (and deflate the per-60
# rates) whenever two or more attempts share a second.
time_counts = (
    draws_joined
    .drop_duplicates(subset=['Game', 'Time', 'PlayerID'])
    [['ZS', 'PlayerID', 'TimeSinceLastDraw']]
    .assign(TOI=1)
    .groupby(['ZS', 'PlayerID', 'TimeSinceLastDraw'], as_index=False)
    .count()
)
# Get counts of shots: rows without a shot have a null Team and are dropped first
shot_counts = (
    joined[['ZS', 'PlayerID', 'TimeSinceLastDraw', 'Team']]
    .dropna()
    .assign(Count=1)
    .groupby(['ZS', 'PlayerID', 'TimeSinceLastDraw', 'Team'], as_index=False)
    .count()
    .pivot_table(index=['ZS', 'PlayerID', 'TimeSinceLastDraw'], columns='Team', values='Count')
    .reset_index()
)
# Make sure both outcome columns exist (one is absent if no attempt of that kind was seen)
for outcome in ('CA', 'CF'):
    if outcome not in shot_counts.columns:
        shot_counts[outcome] = 0.0
    shot_counts[outcome] = shot_counts[outcome].fillna(0)
# Seconds without any shot attempt get zero counts
alljoined = (
    time_counts
    .merge(shot_counts, how='left', on=['ZS', 'PlayerID', 'TimeSinceLastDraw'])
    .fillna(0)
)
alljoined.head()
Out[7]:
Finally, we can replace Player IDs with names, export, and graph.
In [8]:
# Swap the player ID for the player name
alljoined = alljoined.assign(Player=players.playerlst_as_str(alljoined.PlayerID))
alljoined = alljoined.drop(['PlayerID'], axis=1)
# Per-60 rates: TOI counts seconds, so scale by 3600
alljoined = alljoined.assign(CF60=alljoined.CF * 3600 / alljoined.TOI)
alljoined = alljoined.assign(CA60=alljoined.CA * 3600 / alljoined.TOI)
# Export next to the notebook
alljoined.to_csv('time_since_last_draw_data.csv', index=False)
alljoined.head()
Out[8]:
In [9]:
# Players compared in the figures below; the plotting helpers match these names
# against the `Player` column.
comp_players = ('Nicklas Backstrom', 'Evgeny Kuznetsov', 'Lars Eller', 'Jay Beagle')
def plot_cumulative_shot_lines(df, zone, metric, ax, *comp_players):
    """Draw one cumulative per-60 rate line per player for a single faceoff type.

    :param df: dataframe with ZS, Player, TimeSinceLastDraw, TOI and `metric` columns
    :param zone: str, value of the ZS column to select (e.g. 'OW')
    :param metric: str, name of the count column to accumulate (e.g. 'CF' or 'CA')
    :param ax: axes-like object; only its plot() and set_title() methods are used
    :param comp_players: str, player names to draw, one line each

    :return: nothing
    """
    rate_col = '{0:s}60'.format(metric)
    for name in comp_players:
        selector = 'ZS == "{0:s}" & Player == "{1:s}"'.format(zone, name)
        running = (
            df.query(selector)
            .sort_values('TimeSinceLastDraw')
            # Running totals of seconds and events up to each time since the draw ...
            .assign(TOI=lambda d: d['TOI'].cumsum())
            .assign(**{metric: lambda d: d[metric].cumsum()})
            # ... turned into a rate per 3600 seconds
            .assign(**{rate_col: lambda d: d[metric] * 3600 / d['TOI']})
        )
        ax.plot(running['TimeSinceLastDraw'], running[rate_col], label=name)
    ax.set_title('Cumulative {0:s}60 after {1:s}'.format(metric, zone))
def plot_comparison(df, metric, *comp_players):
    """Draw a 2x3 grid of panels, one per faceoff zone/result, comparing players.

    Only rows with TimeSinceLastDraw <= 15 are plotted.

    :param df: dataframe passed through to the per-panel helper
    :param metric: str; 'N' dispatches to plot_cumulative_ns (which must already be
        defined when this is called), anything else goes to plot_cumulative_shot_lines
        as the count column name
    :param comp_players: str, player names to compare

    :return: nothing
    """
    zone_results = ['OW', 'OL', 'NW', 'NL', 'DW', 'DL']
    fig, grid = subplots(2, 3, sharex=True, sharey=True, figsize=[12, 8])
    early = df.query('TimeSinceLastDraw <= 15')
    for panel, zone in zip(grid.flatten(), zone_results):
        if metric != 'N':
            plot_cumulative_shot_lines(early, zone, metric, panel, *comp_players)
        else:
            plot_cumulative_ns(early, zone, panel, *comp_players)
    # pyplot-level call: the legend lands on the current axes
    legend(loc=1)
# Cumulative CF60 after each faceoff zone/result for the comparison players
plot_comparison(alljoined, 'CF', *comp_players)
In [10]:
# Same comparison for CA (the non-CF label assigned to shot attempts earlier)
plot_comparison(alljoined, 'CA', *comp_players)
In [11]:
def plot_cumulative_ns(df, zone, ax, *comp_players):
    """Draw each player's TOI count against time since the draw for one faceoff type.

    Despite the name, the TOI values are plotted as they are; no cumulative sum is taken.

    :param df: dataframe with ZS, Player, TimeSinceLastDraw and TOI columns
    :param zone: str, value of the ZS column to select (e.g. 'OW')
    :param ax: axes-like object; only its plot() and set_title() methods are used
    :param comp_players: str, player names to draw, one line each

    :return: nothing
    """
    for name in comp_players:
        selector = 'ZS == "{0:s}" & Player == "{1:s}"'.format(zone, name)
        ordered = df.query(selector).sort_values('TimeSinceLastDraw')
        ax.plot(ordered['TimeSinceLastDraw'], ordered['TOI'], label=name)
    ax.set_title('Post-faceoff time in {0:s}'.format(zone))
# 'N' selects the TOI-count panels (plot_cumulative_ns) instead of a shot rate
plot_comparison(alljoined, 'N', *comp_players)
It also might be interesting to take a look at what fraction of a player's 5v5 TOI is accounted for by the time after faceoffs.
In [12]:
# Find shift starts
shifts = toi.sort_values(['Game', 'PlayerID', 'Time'])
# If shift yields diff time in same game and player, then it's a shift start
shifts.loc[:, 'PrevT'] = shifts.Time.shift(1)
shifts.loc[:, 'PrevP'] = shifts.PlayerID.shift(1)
shifts.loc[:, 'PrevG'] = shifts.Game.shift(1)
shifts.loc[(shifts.PlayerID == shifts.PrevP) & (shifts.Game == shifts.PrevG) & (shifts.Time != shifts.PrevT + 1),
'ShiftIndex'] = 1
shifts.loc[:, 'ShiftIndex'] = shifts.ShiftIndex.fillna(0)
shifts.loc[:, 'ShiftIndex'] = shifts.ShiftIndex.cumsum()
shifts = shifts.drop({'PrevT', 'PrevP', 'PrevG'}, axis=1)
shifts.head()
Out[12]:
In [13]:
# Calculate amount of time at each point in shift
starttimes = shifts[['Time', 'ShiftIndex', 'PlayerID']] \
.groupby(['ShiftIndex', 'PlayerID'], as_index=False).min() \
.rename(columns={'Time': 'StartTime'})
shifts2 = shifts.merge(starttimes, how='left', on=['ShiftIndex', 'PlayerID'])
shifts2.loc[:, 'TimeSinceLastDraw'] = shifts2.Time - shifts2.StartTime + 1
counts = shifts2[['TimeSinceLastDraw', 'PlayerID']] \
.assign(TOI=1) \
.groupby(['TimeSinceLastDraw', 'PlayerID'], as_index=False) \
.count() \
.assign(ZS='Other')
counts.loc[:, 'Player'] = players.playerlst_as_str(counts.PlayerID)
counts = counts.drop('PlayerID', axis=1)
counts.head()
Out[13]:
In [14]:
# Add to original
alljoined2 = pd.concat([alljoined[['Player', 'ZS', 'TimeSinceLastDraw', 'TOI']], counts])
alljoined2 = alljoined2.sort_values(['Player', 'ZS', 'TimeSinceLastDraw'])
# Convert to percentages
totals = alljoined2.drop('ZS', axis=1) \
.groupby(['Player', 'TimeSinceLastDraw'], as_index=False) \
.sum() \
.rename(columns={'TOI': 'TotalTOI'})
alljoined3 = alljoined2.merge(totals, how='left', on=['Player', 'TimeSinceLastDraw'])
alljoined3.loc[:, 'TOI'] = alljoined3.TOI / alljoined3.TotalTOI
alljoined3.head()
Out[14]:
In [15]:
def plot_stacked_area(df, p, ax, limit=20):
    """Draw a stacked area chart of one player's TOI share by faceoff zone/result.

    :param df: dataframe with Player, TimeSinceLastDraw, ZS and TOI columns
    :param p: str, player name to select from the Player column
    :param ax: matplotlib axes to draw on
    :param limit: int, last value of TimeSinceLastDraw (starting from 1) to include

    :return: nothing
    """
    zones = ['OW', 'OL', 'NW', 'NL', 'DW', 'DL'][::-1]  # for DZ at bottom
    # Set colors: one hue per zone, semi-transparent for losses and solid for wins
    # (same order as the reversed `zones` list)
    color_cycle = plt.rcParams['axes.prop_cycle'].by_key()['color']
    colors = [mplc.to_rgba(color_cycle[0], alpha=0.5), color_cycle[0],
              mplc.to_rgba(color_cycle[1], alpha=0.5), color_cycle[1],
              mplc.to_rgba(color_cycle[2], alpha=0.5), color_cycle[2]]
    # Scaffold with every second 1..limit so seconds with no data still get a row
    struct = pd.DataFrame({'TimeSinceLastDraw': range(1, limit+1, 1)}).assign(Player=p)
    struct = struct.merge(df[['Player', 'TimeSinceLastDraw', 'ZS', 'TOI']],
                          how='left', on=['Player', 'TimeSinceLastDraw'])
    # One column per ZS value
    struct = struct.pivot_table(index=['Player', 'TimeSinceLastDraw'], columns='ZS', values='TOI').reset_index()
    # Carry the previous second's value into gaps; .ffill() replaces the deprecated
    # fillna(method='ffill') spelling and behaves the same
    struct = struct.ffill()
    ax.stackplot(struct.TimeSinceLastDraw, [struct[zone] for zone in zones], labels=zones, colors=colors)
    ax.set_title('TOI into shift by 5v5 shift\nstart for {0:s}'.format(p))
    ax.set_xlabel('Seconds into shift')
    # NOTE(review): pyplot-level call, so the legend lands on the current axes, which is
    # not necessarily `ax` -- confirm that is intended.
    legend(loc=2)
def plot_tois(df, *comp_players):
    """Draw one stacked-area panel per player, side by side with shared axes.

    The number of panels follows the number of players given (at least one panel is
    created), at 3 inches of width per panel, so four players give the same 12x4 figure
    as a fixed four-panel layout.

    :param df: dataframe passed through to plot_stacked_area
    :param comp_players: str, player names, one panel each

    :return: nothing
    """
    n_panels = max(len(comp_players), 1)
    # squeeze=False keeps `axes` an array even when there is a single panel
    fig, axes = subplots(1, n_panels, sharex=True, sharey=True,
                         figsize=[3 * n_panels, 4], squeeze=False)
    axes = axes.flatten()
    for ax, p in zip(axes, comp_players):
        plot_stacked_area(df, p, ax)
    # Single legend anchored outside the top-right corner of the current axes
    legend(loc=2, bbox_to_anchor=(1, 1))
    axes[0].set_ylabel('% of 5v5 shifts')
# One stacked-area panel per comparison player
plot_tois(alljoined3, *comp_players)