The purpose of this notebook is to compile zone entry and exit data for tracked games, and to generate TOI and shot data for those same games.
Here's what we'll do:
- Convert the tracked-game Excel workbooks to CSV and combine the sheets we care about
- Check that those sheets share columns across files, and keep the common ones
- Spot-check the tracked shot counts against scraped play-by-play
- Build second-by-second 5v5 TOI logs and attach on-ice players to each event
- Break down entries and exits at the team and player level, focusing on the Caps
In [1]:
from os import listdir, chdir, getcwd
import pandas as pd
from pylab import *
from tqdm import tqdm # progress bar
%matplotlib inline
current_wd = getcwd()
In [2]:
# Want to combine all files
folders = ['/Users/muneebalam/Downloads/Game Reports 1718/',
#'/Users/muneebalam/Downloads/Passing Game Archive 1718/',
'/Users/muneebalam/Downloads/Game Reports 1617/']
sheets = {'shots': 'Shot Data', 'entries': 'Raw Entries', 'exits': 'Zone Exits Raw Data'}
copy = False
if copy:
for folder in folders:
chdir(folder)
files = listdir()
files = [f for f in files if f[-5:] == '.xlsx']
for file in tqdm(files, desc='Converting to csv'):
xl = pd.ExcelFile(file)
sheetnames = xl.sheet_names
for s in sheetnames:
df = xl.parse(s)
fout = '{0:s}_{1:s}.csv'.format(file[:-5], s)
df.to_csv(fout, index=False)
print('Done with', folder)
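Each sheet lands in its own CSV named like the workbook plus an underscore and the sheet name. Since the workbook names begin with the five-digit game id, later cells can recover the game number from the first five characters and the sheet name from everything after the last underscore.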
Now, let's check whether the sheets we're interested in (listed in the cell above) contain the same columns.
In [3]:
colnames = {}
for skey, sval in sheets.items():
colnames[sval] = {}
for folder in folders:
chdir(folder)
files = listdir()
files = [f for f in files if f[f.rfind('_')+1:-4] == sval]
for file in tqdm(files, desc='Reading files'):
try:
cnames = pd.read_csv(file).columns
except Exception as e:
print(skey, sval, file, e, e.args)
continue
cnames = tuple(sorted(cnames))
if cnames not in colnames[sval]:
colnames[sval][cnames] = set()
colnames[sval][cnames].add('{0:s}-{1:s}'.format(folder[-5:-1], file[:5])) # Season and game number
print('Done with', skey, folder)
In [4]:
def intersect(*sets):
if len(sets) == 0:
return set()
if len(sets) == 1:
return sets[0]
if len(sets) == 2:
return set(sets[0]) & set(sets[1])
    return set(sets[0]) & intersect(*sets[1:])
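# e.g. intersect({'A', 'B'}, {'B', 'C'}, {'B', 'D'}) returns {'B'}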
for sval in colnames:
# Figure out column name frequency
colcount = {}
for clist in colnames[sval].keys():
for c in clist:
if c not in colcount:
colcount[c] = 0
colcount[c] += len(colnames[sval][clist])
colcount = [(k, v) for k, v in colcount.items()]
colcount = sorted(colcount, key = lambda x: x[1], reverse=True)
print(sval)
for k, v in colcount:
        # Only ones appearing at least 200 times, for brevity's sake
if v >= 200:
print(k, v)
print('')
They look like they're in decent shape; there's a lot of overlap. I'll keep columns with at least 515 occurrences.
In [5]:
cols_to_keep = {}
for sval in colnames:
# Figure out column name frequency
colcount = {}
for clist in colnames[sval].keys():
for c in clist:
if c not in colcount:
colcount[c] = 0
colcount[c] += len(colnames[sval][clist])
cols_to_keep[sval] = [k for k, v in colcount.items() if v >= 515]
print(cols_to_keep)
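To see what that 515 cutoff means, we can total up how many files each sheet spans, using the colnames dict built above; if the totals come out near 515, the kept columns appear in nearly every file:

for sval in colnames:
    total = sum(len(games) for games in colnames[sval].values())
    print(sval, 'files:', total)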
So now, we'll combine the files into one dataframe per sheet.
In [6]:
dfs = {k: [] for k in sheets.keys()}
generate = False
for skey, sval in sheets.items():
fout = skey + ' combined.csv'
if generate:
for folder in folders:
chdir(folder)
files = listdir()
files = [f for f in files if f[f.rfind('_')+1:-4] == sval]
for file in tqdm(files, desc='Reading files'):
try:
df = pd.read_csv(file)
# Exclude columns I don't want
cols = set(df.columns) - set(cols_to_keep[sval])
df = df.drop(cols, axis=1, errors='ignore')
df = df.assign(Season=2000 + int(folder[-5:-3]), Game=int(file[:5]))
dfs[skey].append(df)
except Exception as e:
print(skey, sval, file, e, e.args)
continue
print('Done with', skey, folder)
dfs[skey] = pd.concat(dfs[skey])
dfs[skey].to_csv(fout, index=False)
dfs[skey] = pd.read_csv(fout)
chdir(current_wd)
print('Done aggregating and reading files')
Here's what the dataframes look like:
In [7]:
dfs['shots'].head()
Out[7]:
In [8]:
dfs['entries'].head()
Out[8]:
In [9]:
dfs['exits'].head()
Out[9]:
In [10]:
np.random.seed(8) # Obv, for shots, pick Ovechkin
wsh_games = dfs['shots'].query('Team == "WSH"')[['Season', 'Game']].drop_duplicates().sort_values(['Season', 'Game'])
wsh_games.loc[:, 'RandomNum'] = np.random.randint(low=0, high=100, size=len(wsh_games))
wsh_games.loc[:, 'Selected'] = wsh_games.RandomNum.apply(lambda x: x <= 10)
wsh_games = wsh_games.query('Selected == True').drop('RandomNum', axis=1)
wsh_games.set_index(['Season', 'Game']) # Just for viewing purposes
Out[10]:
Here are the shot counts from the tracked data:
In [11]:
wsh_shots = dfs['shots'].merge(wsh_games[['Season', 'Game']], how='inner', on=['Season', 'Game']) \
.query('Strength == "5v5"')
wsh_shots.loc[:, 'Team'] = wsh_shots.Team.apply(lambda x: x if x == 'WSH' else 'Opp')
wsh_shots = wsh_shots[['Season', 'Game', 'Team']].assign(Count=1) \
.groupby(['Season', 'Game', 'Team'], as_index=False) \
.count()
wsh_shots = wsh_shots.pivot_table(index=['Season', 'Game'], columns='Team', values='Count')
wsh_shots
Out[11]:
Now, let's pull shot counts for those games from our scraped data.
In [12]:
from scrapenhl2.scrape import teams
df1 = teams.get_team_pbp(2016, 'WSH').assign(Season=2016)
df2 = teams.get_team_pbp(2017, 'WSH').assign(Season=2017)
df3 = pd.concat([df1, df2]).merge(wsh_games[['Season', 'Game']], how='inner', on=['Season', 'Game'])
# Go to 5v5 only
from scrapenhl2.manipulate import manipulate as manip
df3 = manip.filter_for_five_on_five(manip.filter_for_corsi(df3))
counts = df3[['Season', 'Game', 'Team']].copy()  # copy to avoid SettingWithCopyWarning on the assignment below
counts.loc[:, 'Team'] = counts.Team.apply(lambda x: 'WSH' if x == 15 else 'Opp')
counts = counts.assign(Count=1) \
.groupby(['Season', 'Game', 'Team'], as_index=False) \
.count()
counts = counts.pivot_table(index=['Season', 'Game'], columns='Team', values='Count')
counts
Out[12]:
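(The magic number 15 in the cell above is WSH's team id in the scraped data; team_info.team_as_id('WSH'), assuming that scrapenhl2 helper, would confirm it.)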
The counts are pretty close, so we'll use the tracked data.
We still need TOI, though. I'll pull a second-by-second log into a dataframe.
In [13]:
from scrapenhl2.scrape import players, team_info
dfs['toi'] = {}
team_convert = {'LA': 'LAK', 'NJ': 'NJD', 'TB': 'TBL', 'SJ': 'SJS',
'L.A': 'LAK', 'N.J': 'NJD', 'T.B': 'TBL', 'S.J': 'SJS'}
for team in dfs['shots'].Team.unique():
if team in team_convert:
team = team_convert[team]
if not isinstance(team, str) or len(team) != 3:
#print('Skipping', team)
continue
toi = []
for season in range(2016, 2018):
try:
toi.append(teams.get_team_toi(season, team).assign(Season=season))
except Exception as e:
print('Could not read', team, e, e.args)
toi = pd.concat(toi)
# Filter for appropriate games using an inner join, and filter for 5v5
toi = toi.merge(dfs['shots'][['Season', 'Game']].drop_duplicates(),
how='inner', on=['Season', 'Game'])
toi = manip.filter_for_five_on_five(toi)
# Get only certain columns
toi = toi[['Season', 'Game', 'Time', 'Team1', 'Team2', 'Team3', 'Team4', 'Team5']]
renaming = {'Team{0:d}'.format(i): '{0:s}{1:d}'.format(team, i) for i in range(1, 6)}
toi = toi.rename(columns=renaming)
# Convert to player names
for col in toi.columns[3:]:
toi.loc[:, col] = toi[col].apply(lambda x: players.player_as_str(x))
dfs['toi'][team] = toi
dfs['toi']['WSH'].head()
Out[13]:
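Each team's log now has one row per 5v5 second, with Season, Game, and Time plus the five skaters on ice in the WSH1-through-WSH5-style columns.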
One final data-prep item: let's filter the entries, exits, and shots dataframes to 5v5 using this TOI data.
In [14]:
fives = {}
from scrapenhl2.manipulate import add_onice_players as onice
dfs['shots'].loc[:, 'Time2'] = dfs['shots'].Time.str.extract(r'(\d{1,2}:\d{1,2}):\d{1,2}$')
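# Keeps the leading MM:SS of an MM:SS:cc stamp, e.g. '17:45:00' -> '17:45' (assuming that tracked-time format)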
dfs['shots'] = onice.add_times_to_file(dfs['shots'].dropna(subset=['Time']),
periodcol='Period', timecol='Time2', time_format='remaining')
fives['shots'] = dfs['shots']
for team, toi in dfs['toi'].items():
toi = toi.rename(columns={'Time': '_Secs'})
fives['shots'] = fives['shots'].merge(toi, how='left', on=['Season', 'Game', '_Secs'])
fives['shots'].head()
Out[14]:
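To make the merge key concrete: add_times_to_file presumably converts the 'remaining' clock into elapsed game seconds, which is what the TOI log's Time column counts. A minimal sketch of that conversion, assuming 20-minute regulation periods:

def clock_to_secs(period, mmss_remaining):
    m, s = (int(x) for x in mmss_remaining.split(':'))
    return 1200 * (period - 1) + (1200 - (60 * m + s))

clock_to_secs(2, '15:27')  # 1473: 273 seconds into the second period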
In [15]:
dfs['entries'].loc[:, 'Time2'] = dfs['entries'].Time.str.extract(r'(\d{1,2}:\d{1,2}):\d{1,2}$')
dfs['entries'] = onice.add_times_to_file(dfs['entries'].dropna(subset=['Time']),
periodcol='Period', timecol='Time2', time_format='remaining')
fives['entries'] = dfs['entries']
for team, toi in dfs['toi'].items():
toi = toi.rename(columns={'Time': '_Secs'})
fives['entries'] = fives['entries'].merge(toi, how='left', on=['Season', 'Game', '_Secs'])
fives['entries'].head()
Out[15]:
In [16]:
dfs['exits'].loc[:, 'Time2'] = dfs['exits'].Time.str.extract(r'(\d{1,2}:\d{1,2}):\d{1,2}$')
dfs['exits'] = onice.add_times_to_file(dfs['exits'].dropna(subset=['Time']),
periodcol='Period', timecol='Time2', time_format='remaining')
fives['exits'] = dfs['exits']
for team, toi in dfs['toi'].items():
toi = toi.rename(columns={'Time': '_Secs'})
fives['exits'] = fives['exits'].merge(toi, how='left', on=['Season', 'Game', '_Secs'])
fives['exits'].head()
Out[16]:
Let's look at how many games are included for each team.
In [17]:
teamcolnames = [x for x in fives['shots'].columns if x == x.upper() and len(x) == 4 and x[-1] == '1']
gps = []
for teamcol in teamcolnames:
gp = len(dfs['toi'][teamcol[:-1]][['Season', 'Game']].drop_duplicates())
print(teamcol[:3], gp)
gps.append(gp)
print('Total', sum(gps))
Let's reduce this to just Caps games now. Since the TOI merges above were left joins, rows outside WSH 5v5 ice time have missing on-ice columns, so dropping rows with a missing WSH1 does the filtering in one step.
In [18]:
wsh = {key: fives[key].dropna(subset=['WSH1']) for key in ['shots', 'entries', 'exits']}
wsh['shots'].head()
Out[18]:
Let's convert the player names to the number-plus-team format the tracked data uses (e.g., 8WSH). I don't have a lookup table handy, so I'll do it by hand.
In [19]:
wsh['shots'][['WSH1', 'WSH2', 'WSH3', 'WSH4', 'WSH5']] \
.melt(var_name='P', value_name='Name') \
['Name'].value_counts()
Out[19]:
In [20]:
wsh_players = {'Alex Ovechkin': 8, 'Nicklas Backstrom': 19, 'Andre Burakovsky': 65,
'T.J. Oshie': 77, 'John Carlson': 74, 'Evgeny Kuznetsov': 92,
'Dmitry Orlov': 9, 'Christian Djoos': 29, 'Devante Smith-Pelly': 25,
'Jay Beagle': 83, 'Brooks Orpik': 44, 'Chandler Stephenson': 18,
'Jakub Vrana': 13, 'Tom Wilson': 43, 'Lars Eller': 20,
'Alex Chiasson': 39, 'Brett Connolly': 10, 'Madison Bowey': 22, 'Kevin Shattenkirk': 22,
'Tyler Graovac': 91, '': 36, 'Matt Niskanen': 2,
'Aaron Ness': 55, 'Nathan Walker': 79, 'Taylor Chorney': 4,
'Zach Sanford': 12, 'Karl Alzner': 27, 'Marcus Johansson': 90,
'Zachary Sanford': 12,
'Justin Williams': 14, 'Daniel Winnik': 26, 'Nate Schmidt': 88}
for col in ['WSH1', 'WSH2', 'WSH3', 'WSH4', 'WSH5']:
for key in ['shots', 'entries', 'exits']:
wsh[key].loc[:, col] = wsh[key][col].apply(lambda x: str(wsh_players[x]) + 'WSH' if x in wsh_players else x)
wsh['shots'].head()
Out[20]:
In [21]:
# Team comparison
# Drop extra team cols
allteamcols = [x for x in fives['entries'].columns if x != x.upper()]
allteams = fives['entries'][allteamcols]
# Remove fails and faceoffs
allteams = allteams[pd.isnull(allteams.Fail)]
allteams = allteams[allteams['Entry type'] != 'FAC']
# Counts by game and team
allteams = allteams[['Season', 'Game', 'Entry by', 'Entry type']]
# Extract ending text part of entry.
import re
def extract_team(string):
    result = re.search(r'\d*(\w{2,3})$', str(string))
if result:
return result.group(1)
return string
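# e.g. extract_team('8WSH') -> 'WSH', extract_team('22TBL') -> 'TBL'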
allteams.loc[:, 'Team'] = allteams['Entry by'].apply(lambda x: extract_team(x))
# Get rid of typo teams (about 100 rows out of 77,000)
valid_teams = set(allteams.Team.value_counts().index[:31])
allteams = allteams[allteams.Team.apply(lambda x: x in valid_teams)]
allteams = allteams.drop('Entry by', axis=1) \
.assign(Count=1) \
.groupby(['Season', 'Game', 'Team', 'Entry type'], as_index=False) \
.count()
# Add opp numbers
gametotals = allteams.drop('Team', axis=1) \
.groupby(['Season', 'Game', 'Entry type'], as_index=False) \
.sum() \
.rename(columns={'Count': 'OppCount'})
fives['toi'] = pd.concat([dfs['toi'][team] \
[['Season', 'Game']] \
.assign(TOI=1) \
.groupby(['Season', 'Game'], as_index=False) \
.count() for team in dfs['toi']]).drop_duplicates()
# Sum by season and calculate per 60
allteams = allteams.merge(fives['toi'], how='left', on=['Season', 'Game'])
allteams = allteams.merge(gametotals, how='inner', on=['Season', 'Game', 'Entry type']) \
.drop('Game', axis=1) \
.groupby(['Season', 'Entry type', 'Team'], as_index=False) \
.sum()
allteams.loc[:, 'OppCount'] = allteams['OppCount'] - allteams['Count']
allteams.loc[:, 'Per60'] = allteams['Count'] / (allteams.TOI / 3600)
allteams.loc[:, 'OppPer60'] = allteams['OppCount'] / (allteams.TOI / 3600)
allteams.head()
Out[21]:
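Note the units: TOI here is a count of 5v5 seconds, so Per60 = Count / (TOI / 3600). For example, 45 entries over 2,700 seconds of 5v5 play works out to 45 / 0.75 = 60 entries per 60 minutes.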
In [22]:
f = figure(figsize=[8, 8])
tmp = allteams[allteams['Entry type'] != 'X'] \
.drop(['Count', 'OppCount', 'TOI'], axis=1) \
.pivot_table(index=['Entry type', 'Team'], columns='Season', values='Per60') \
.reset_index()
for etype in tmp['Entry type'].unique():
tmp2 = tmp[tmp['Entry type'] == etype]
scatter(tmp2.loc[:, 2016].values, tmp2.loc[:, 2017].values, label=etype, s=200, alpha=0.5)
for s, etype, t, r1, r2 in tmp.itertuples():
annotate(t, xy=(r1, r2), ha='center', va='center')
from scrapenhl2.plot import visualization_helper as vhelper
vhelper.add_good_bad_fast_slow(bottomleft='Bad', topleft='Improved', topright='Good', bottomright='Declined')
vhelper.add_cfpct_ref_lines_to_plot(ax=gca(), refs=[50])
title('Team 5v5 entry rate, 2016 vs 2017')
xlabel('2016')
ylabel('2017')
legend(loc=2, bbox_to_anchor=(1, 1))
Out[22]:
In [23]:
f = figure(figsize=[8, 8])
tmp = allteams[allteams['Entry type'] != 'X'] \
.drop(['Count', 'OppCount', 'TOI'], axis=1) \
.pivot_table(index=['Season', 'Team'], columns='Entry type', values='Per60') \
.reset_index()
tmp.loc[:, 'Ctrl%'] = tmp.C / (tmp.C + tmp.D)
tmp = tmp.drop(['C', 'D'], axis=1)
tmp = tmp.pivot_table(index='Team', columns='Season', values='Ctrl%').reset_index()
scatter(tmp.loc[:, 2016].values, tmp.loc[:, 2017].values, s=200, alpha=0.5)
for i, t, r1, r2 in tmp.itertuples():
annotate(t, xy=(r1, r2), ha='center', va='center')
vhelper.add_good_bad_fast_slow(bottomleft='Bad', topleft='Improved', topright='Good', bottomright='Declined')
vhelper.add_cfpct_ref_lines_to_plot(ax=gca(), refs=[50])
title('Team 5v5 controlled entry rate, 2016 vs 2017')
xlabel('2016')
ylabel('2017')
Out[23]:
In [24]:
fig, axes = subplots(1, 2, sharex=True, sharey=True, figsize=[12, 6])
for i, season in enumerate(allteams.Season.unique()):
tmp = allteams[(allteams.Season == season) & (allteams['Entry type'] != 'X')]
for etype in tmp['Entry type'].unique():
tmp2 = tmp[tmp['Entry type'] == etype]
axes[i].scatter(tmp2.Per60.values, tmp2.OppPer60.values, s=250, alpha=0.3, label=etype)
axes[i].set_title('Entries for and against, {0:d}'.format(season))
axes[i].set_xlabel('Entries per 60')
if i == 0:
axes[i].set_ylabel('Entries against per 60')
for _, t, r1, r2 in tmp[['Team', 'Per60', 'OppPer60']].itertuples():
axes[i].annotate(t, xy=(r1, r2), ha='center', va='center')
vhelper.add_good_bad_fast_slow(bottomleft='NZ Jam', topleft='Bad', topright='Racetrack', bottomright='Good')
vhelper.add_cfpct_ref_lines_to_plot(ax=axes[i])
legend(loc=2, bbox_to_anchor=(1, 1))
Out[24]:
In [25]:
tmp = allteams[allteams['Entry type'] != 'X'] \
.drop(['Count', 'OppCount', 'TOI'], axis=1) \
.melt(id_vars=['Season', 'Entry type', 'Team']) \
.pivot_table(index=['Season', 'Team', 'variable'], columns='Entry type', values='value') \
.reset_index()
tmp.loc[:, 'CE%'] = tmp.C / (tmp.C + tmp.D)
tmp = tmp.drop(['C', 'D'], axis=1) \
.pivot_table(index=['Season', 'Team'], columns='variable', values='CE%') \
.rename(columns={'Per60': 'TeamCE%', 'OppPer60': 'OppCE%'}) \
.reset_index()
fig, axes = subplots(1, 2, sharex=True, sharey=True, figsize=[12, 6])
for i, season in enumerate(tmp.Season.unique()):
tmp2 = tmp[(tmp.Season == season)]
axes[i].scatter(tmp2['TeamCE%'].values, tmp2['OppCE%'].values, s=250, alpha=0.3)
axes[i].set_title('Controlled entries for and against, {0:d}'.format(season))
axes[i].set_xlabel('Team CE%')
if i == 0:
axes[i].set_ylabel('Opp CE%')
for _, t, r1, r2 in tmp2[['Team', 'TeamCE%', 'OppCE%']].itertuples():
axes[i].annotate(t, xy=(r1, r2), ha='center', va='center')
vhelper.add_good_bad_fast_slow(bottomleft='NZ Jam', topleft='Bad', topright='Racetrack', bottomright='Good')
vhelper.add_cfpct_ref_lines_to_plot(ax=axes[i])
legend(loc=2, bbox_to_anchor=(1, 1))
In [26]:
entries = wsh['entries']
# Drop extra team cols
colnames = [x for x in entries.columns if x != x.upper() or x[:3] == 'WSH']
entries = entries[colnames]
# Remove fails
entries = entries[pd.isnull(entries.Fail)]
# Flag entries as WSH or Opp
entries.loc[:, 'Team'] = entries['Entry by'].apply(lambda x: 'WSH' if str(x)[-3:] == 'WSH' else 'Opp')
# Remove faceoffs
# entries2 = entries
entries2 = entries[entries['Entry type'] != 'FAC']
# Melt to long
idvars = [x for x in entries2.columns if x not in ['WSH1', 'WSH2', 'WSH3', 'WSH4', 'WSH5']]
entries2 = entries2.melt(id_vars=idvars, value_name='Player').sort_values(['Season', 'Game', '_Secs', 'variable'])
entries2.head()
Out[26]:
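After the melt, each entry appears five times, once per on-ice slot (WSH1 through WSH5), with the slot name in variable and the skater in Player. That is why the season-level counts below de-duplicate on game, period, and time before counting, while the on-ice player counts keep all five rows.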
In [27]:
# Season level
# Count by entry type
entries60 = entries2[['Season', 'Entry type', 'Team', 'Game', 'Period', 'Time']] \
.drop_duplicates() \
[['Season', 'Entry type', 'Team']] \
.assign(Count=1) \
.groupby(['Season', 'Entry type', 'Team'], as_index=False) \
.count()
# Add TOI
toi = dfs['toi']['WSH'] \
[['Season']].assign(TOI=1) \
.groupby('Season', as_index=False).count()
entries60 = entries60.merge(toi, how='left', on='Season')
entries60.loc[:, 'Per60'] = entries60.Count / (entries60.TOI / 3600)
entries60
Out[27]:
In [28]:
tmp = entries60.assign(Height=entries60.Per60).sort_values(['Season', 'Team', 'Entry type'])
tmp.loc[:, 'Left'] = tmp.Team.apply(lambda x: 0 if x == 'WSH' else 1) + tmp.Season.apply(lambda x: 0 if x == 2016 else 0.4)
tmp.loc[:, 'Bottom'] = tmp.groupby(['Season', 'Team']).Height.cumsum() - tmp.Height
for etype in tmp['Entry type'].unique():
tmp2 = tmp[tmp['Entry type'] == etype]
bar(left=tmp2.Left.values, height=tmp2.Height.values, bottom=tmp2.Bottom.values, label=etype, width=0.3)
xlabs = tmp.drop_duplicates(subset=['Season', 'Team', 'Left'])
xticks([x for x in xlabs.Left], ['{0:d} {1:s}'.format(s, t) for s, t in zip(xlabs.Season, xlabs.Team)])
legend(loc=2, bbox_to_anchor=(1, 1))
title('Zone entries per 60, Caps vs opponents')
Out[28]:
In [29]:
# Individual level, on-ice
# Count by entry type
entries60 = entries2[['Season', 'Entry type', 'Team', 'Game', 'Period', 'Time', 'Player']] \
[['Season', 'Entry type', 'Team', 'Player']] \
.assign(Count=1) \
.groupby(['Season', 'Entry type', 'Team', 'Player'], as_index=False) \
.count() \
.pivot_table(index=['Season', 'Entry type', 'Player'], columns='Team', values='Count') \
.reset_index()
# Add TOI
toi = dfs['toi']['WSH'] \
[['Season', 'WSH1', 'WSH2', 'WSH3', 'WSH4', 'WSH5']] \
.melt(id_vars='Season', value_name='Player') \
.drop('variable', axis=1) \
.assign(TOI=1) \
.groupby(['Season', 'Player'], as_index=False).count()
toi.loc[:, 'Player'] = toi['Player'].apply(lambda x: str(wsh_players[x]) + 'WSH' if x in wsh_players else x)
entries60 = entries60.merge(toi, how='left', on=['Season', 'Player']) \
.sort_values(['Player', 'Season', 'Entry type'])
entries60.loc[:, 'WSH60'] = entries60.WSH / (entries60.TOI / 3600)
entries60.loc[:, 'Opp60'] = entries60.Opp / (entries60.TOI / 3600)
entries60
Out[29]:
In [30]:
title('Entry%, 2016 vs 2017')
xlabel('2016')
ylabel('2017')
tmp = entries60[['Season', 'Player', 'WSH', 'Opp']] \
    .groupby(['Season', 'Player'], as_index=False) \
    .sum()
tmp.loc[:, 'Entry%'] = tmp.WSH / (tmp.WSH + tmp.Opp)
tmp = tmp.drop(['WSH', 'Opp'], axis=1)
tmp = tmp.pivot_table(index='Player', columns='Season', values='Entry%')
scatter(tmp.loc[:, 2016].values, tmp.loc[:, 2017].values, s=200, alpha=0.5)
for p, e1, e2 in tmp.itertuples():
annotate(p[:-3], xy=(e1, e2), ha='center', va='center')
vhelper.add_good_bad_fast_slow(bottomleft='Bad', topleft='Improved', topright='Good', bottomright='Declined')
vhelper.add_cfpct_ref_lines_to_plot(ax=gca(), refs=[50])
In [31]:
title('Entries per 60, 2016 vs 2017')
xlabel('2016')
ylabel('2017')
tmp = entries60[['Season', 'Player', 'WSH60']] \
.groupby(['Season', 'Player'], as_index=False) \
.sum()
tmp = tmp.pivot_table(index='Player', columns='Season', values='WSH60')
scatter(tmp.loc[:, 2016].values, tmp.loc[:, 2017].values, s=200, alpha=0.5)
for p, e1, e2 in tmp.itertuples():
annotate(p[:-3], xy=(e1, e2), ha='center', va='center')
vhelper.add_good_bad_fast_slow(bottomleft='Bad', topleft='Improved', topright='Good', bottomright='Declined')
vhelper.add_cfpct_ref_lines_to_plot(ax=gca(), refs=[50])
In [32]:
title('Entries against per 60, 2016 vs 2017')
xlabel('2016')
ylabel('2017')
tmp = entries60[['Season', 'Player', 'Opp60']] \
.groupby(['Season', 'Player'], as_index=False) \
.sum()
tmp = tmp.pivot_table(index='Player', columns='Season', values='Opp60')
scatter(tmp.loc[:, 2016].values, tmp.loc[:, 2017].values, s=200, alpha=0.5)
for p, e1, e2 in tmp.itertuples():
annotate(p[:-3], xy=(e1, e2), ha='center', va='center')
vhelper.add_good_bad_fast_slow(bottomleft='Good', topleft='Declined', topright='Bad', bottomright='Improved')
vhelper.add_cfpct_ref_lines_to_plot(ax=gca(), refs=[50])
ylim(bottom=xlim()[0])
xlim(right=ylim()[1])
Out[32]:
In [33]:
sums = entries60[['Season', 'Player', 'WSH', 'Opp']] \
.groupby(['Season', 'Player'], as_index=False) \
.sum()
cep = entries60[entries60['Entry type'] == 'C'] \
[['Season', 'Player', 'WSH', 'Opp']] \
.merge(sums, how='inner', on=['Season', 'Player'], suffixes=['', '_Tot'])
cep.loc[:, 'CE%'] = cep.WSH / cep.WSH_Tot
cep.loc[:, 'Opp CE%'] = cep.Opp / cep.Opp_Tot
ce = cep[['Season', 'Player', 'CE%']] \
.pivot_table(index='Player', columns='Season', values='CE%')
oppce = cep[['Season', 'Player', 'Opp CE%']] \
.pivot_table(index='Player', columns='Season', values='Opp CE%')
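So, for example, a player on ice for 30 controlled entries out of 75 total entries gets CE% = 30 / 75 = 0.40.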
In [34]:
title('On-ice controlled entry%, 2016 vs 2017')
xlabel('2016')
ylabel('2017')
scatter(ce.loc[:, 2016].values, ce.loc[:, 2017].values, s=200, alpha=0.5)
for p, e1, e2 in ce.itertuples():
annotate(p[:-3], xy=(e1, e2), ha='center', va='center')
vhelper.add_good_bad_fast_slow(bottomleft='Bad', topleft='Improved', topright='Good', bottomright='Declined')
vhelper.add_cfpct_ref_lines_to_plot(ax=gca(), refs=[50])
In [35]:
title('Opp on-ice controlled entry%, 2016 vs 2017')
xlabel('2016')
ylabel('2017')
scatter(oppce.loc[:, 2016].values, oppce.loc[:, 2017].values, s=200, alpha=0.5)
for p, e1, e2 in oppce.itertuples():
annotate(p[:-3], xy=(e1, e2), ha='center', va='center')
vhelper.add_good_bad_fast_slow(bottomleft='Good', topleft='Declined', topright='Bad', bottomright='Improved')
vhelper.add_cfpct_ref_lines_to_plot(ax=gca(), refs=[50])
xlim(left=0.2)
Out[35]:
In [36]:
# Individual level, individual
# Count by entry type
ientries60 = entries2[pd.notnull(entries2['Entry by'])]
ientries60 = ientries60[ientries60['Entry by'].str.contains('WSH')] \
[['Season', 'Entry type', 'Team', 'Game', 'Period', 'Time', 'Entry by', 'Fen total']] \
.drop_duplicates() \
[['Season', 'Entry type', 'Team', 'Entry by', 'Fen total']] \
.assign(Count=1) \
.groupby(['Season', 'Entry type', 'Team', 'Entry by'], as_index=False) \
.count() \
.pivot_table(index=['Season', 'Entry type', 'Entry by'], columns='Team', values='Count') \
.reset_index()
# Add TOI
toi = dfs['toi']['WSH'] \
[['Season', 'WSH1', 'WSH2', 'WSH3', 'WSH4', 'WSH5']] \
.melt(id_vars='Season', value_name='Entry by') \
.drop('variable', axis=1) \
.assign(TOI=1) \
.groupby(['Season', 'Entry by'], as_index=False).count()
toi.loc[:, 'Entry by'] = toi['Entry by'].apply(lambda x: str(wsh_players[x]) + 'WSH' if x in wsh_players else x)
ientries60 = ientries60.merge(toi, how='left', on=['Season', 'Entry by']) \
.sort_values(['Entry by', 'Season', 'Entry type'])
ientries60.loc[:, 'WSH60'] = ientries60.WSH / (ientries60.TOI / 3600)
ientries60.drop(['TOI', 'WSH'], axis=1).pivot_table(index=['Entry by', 'Season'], columns='Entry type', values='WSH60')
Out[36]:
In [37]:
tmp = entries60[['Season', 'Entry type', 'Player', 'WSH']] \
.merge(ientries60[['Season', 'Entry type', 'Entry by', 'WSH']] \
.rename(columns={'Entry by': 'Player', 'WSH': 'iWSH'}),
how='left', on=['Season', 'Entry type', 'Player']) \
.drop('Entry type', axis=1) \
.groupby(['Season', 'Player'], as_index=False) \
.sum()
tmp.loc[:, 'iE%'] = tmp.iWSH / tmp.WSH
tmp = tmp.drop(['WSH', 'iWSH'], axis=1) \
.pivot_table(index='Player', columns='Season', values='iE%') \
.fillna(0)
title('Individual entry share, 2016 vs 2017')
xlabel('2016')
ylabel('2017')
scatter(tmp.loc[:, 2016].values, tmp.loc[:, 2017].values, s=200, alpha=0.5)
for p, e1, e2 in tmp.itertuples():
annotate(p[:-3], xy=(e1, e2), ha='center', va='center')
vhelper.add_good_bad_fast_slow(bottomleft='Bad', topleft='Improved', topright='Good', bottomright='Declined')
vhelper.add_cfpct_ref_lines_to_plot(ax=gca(), refs=[50])
In [38]:
# Season level
# Count by entry type
spe = entries2[['Season', 'Entry type', 'Team', 'Game', 'Period', 'Time', 'Fen total']] \
.drop_duplicates() \
[['Season', 'Entry type', 'Team', 'Fen total']] \
.groupby(['Season', 'Entry type', 'Team'], as_index=False) \
.mean() \
.pivot_table(index=['Season', 'Entry type'], columns='Team', values='Fen total') \
.reset_index() \
.sort_values(['Entry type', 'Season'])
spe
Out[38]:
In [39]:
# Individual level, on-ice
# Count by entry type
spe = entries2[['Season', 'Entry type', 'Team', 'Game', 'Period', 'Time', 'Player', 'Fen total']] \
[['Season', 'Entry type', 'Team', 'Player', 'Fen total']] \
.assign(Count=1) \
.groupby(['Season', 'Entry type', 'Team', 'Player'], as_index=False) \
.mean() \
.pivot_table(index=['Season', 'Entry type', 'Player'], columns='Team', values='Fen total') \
.reset_index() \
.sort_values(['Player', 'Season', 'Entry type'])
spe
Out[39]:
In [40]:
tmp = spe[spe['Entry type'] != 'X'] \
.drop('Opp', axis=1) \
.pivot_table(index=['Player', 'Entry type'], columns='Season', values='WSH') \
.fillna(0) \
.reset_index()
for etype in tmp['Entry type'].unique():
tmp2 = tmp[tmp['Entry type'] == etype]
scatter(tmp2.loc[:, 2016].values, tmp2.loc[:, 2017].values, label=etype, s=200, alpha=0.5)
for s, p, etype, e1, e2 in tmp.itertuples():
annotate(p[:-3], xy=(e1, e2), ha='center', va='center')
vhelper.add_good_bad_fast_slow(bottomleft='Bad', topleft='Improved', topright='Good', bottomright='Declined')
vhelper.add_cfpct_ref_lines_to_plot(ax=gca(), refs=[50])
title('On-ice shots per entry')
xlabel('2016')
ylabel('2017')
legend(loc=2, bbox_to_anchor=(1, 1))
Out[40]:
In [41]:
tmp = spe[spe['Entry type'] != 'X'] \
.drop('WSH', axis=1) \
.pivot_table(index=['Player', 'Entry type'], columns='Season', values='Opp') \
.fillna(0) \
.reset_index()
for etype in tmp['Entry type'].unique():
tmp2 = tmp[tmp['Entry type'] == etype]
scatter(tmp2.loc[:, 2016].values, tmp2.loc[:, 2017].values, label=etype, s=200, alpha=0.5)
for s, p, etype, e1, e2 in tmp.itertuples():
annotate(p[:-3], xy=(e1, e2), ha='center', va='center')
vhelper.add_good_bad_fast_slow(bottomleft='Good', topleft='Declined', topright='Bad', bottomright='Improved')
vhelper.add_cfpct_ref_lines_to_plot(ax=gca(), refs=[50])
title('On-ice shots against per entry')
xlabel('2016')
ylabel('2017')
legend(loc=2, bbox_to_anchor=(1, 1))
Out[41]:
In [42]:
# Individual level, individual
# Count by entry type
spe = entries2[pd.notnull(entries2['Entry by'])]
spe = spe[spe['Entry by'].str.contains('WSH')] \
[['Season', 'Entry type', 'Team', 'Game', 'Period', 'Time', 'Entry by', 'Fen total']] \
.drop_duplicates() \
[['Season', 'Entry type', 'Team', 'Entry by', 'Fen total']] \
.assign(Count=1) \
.groupby(['Season', 'Entry type', 'Team', 'Entry by'], as_index=False) \
.mean() \
.pivot_table(index=['Season', 'Entry type', 'Entry by'], columns='Team', values='Fen total') \
.reset_index() \
.sort_values(['Entry by', 'Season', 'Entry type'])
spe.pivot_table(index=['Entry by', 'Season'], columns='Entry type', values='WSH')
Out[42]:
In [43]:
exits = wsh['exits']
# Drop extra team cols
colnames = [x for x in exits.columns if x != x.upper() or x[:3] == 'WSH']
exits = exits[colnames]
# Flag exits as WSH or Opp
exits.loc[:, 'Team'] = exits['Attempt'].apply(lambda x: 'WSH' if str(x)[-3:] == 'WSH' else 'Opp')
# Melt to long
idvars = [x for x in exits.columns if x not in ['WSH1', 'WSH2', 'WSH3', 'WSH4', 'WSH5']]
exits2 = exits.melt(id_vars=idvars, value_name='Player').sort_values(['Season', 'Game', '_Secs', 'variable'])
exits2.head()
Out[43]:
In [44]:
# Season level
# Count by exit type
exits60 = exits2[['Season', 'Result', 'Team', 'Game', 'Period', 'Time']] \
.drop_duplicates() \
[['Season', 'Result', 'Team']] \
.assign(Count=1) \
.groupby(['Season', 'Result', 'Team'], as_index=False) \
.count() \
.pivot_table(index=['Season', 'Result'], columns='Team', values='Count') \
.reset_index() \
.sort_values(['Result', 'Season'])
# Add TOI
toi = dfs['toi']['WSH'] \
[['Season']].assign(TOI=1) \
.groupby('Season', as_index=False).count()
exits60 = exits60.merge(toi, how='left', on='Season')
exits60.loc[:, 'WSH60'] = exits60.WSH / (exits60.TOI / 3600)
exits60.loc[:, 'Opp60'] = exits60.Opp / (exits60.TOI / 3600)
exits60
Out[44]:
In [45]:
# Individual level, on-ice
# Count by exit type
exits60 = exits2[['Season', 'Result', 'Team', 'Game', 'Period', 'Time', 'Player']] \
[['Season', 'Result', 'Team', 'Player']] \
.assign(Count=1) \
.groupby(['Season', 'Result', 'Team', 'Player'], as_index=False) \
.count() \
.pivot_table(index=['Season', 'Result', 'Player'], columns='Team', values='Count') \
.reset_index() \
.sort_values(['Result', 'Season', 'Player'])
# Add TOI
toi = dfs['toi']['WSH'] \
[['Season', 'WSH1', 'WSH2', 'WSH3', 'WSH4', 'WSH5']] \
.melt(id_vars='Season', value_name='Player') \
.drop('variable', axis=1) \
.assign(TOI=1) \
.groupby(['Season', 'Player'], as_index=False).count()
toi.loc[:, 'Player'] = toi['Player'].apply(lambda x: str(wsh_players[x]) + 'WSH' if x in wsh_players else x)
exits60 = exits60.merge(toi, how='left', on=['Season', 'Player'])
exits60.loc[:, 'WSH60'] = exits60.WSH / (exits60.TOI / 3600)
exits60.loc[:, 'Opp60'] = exits60.Opp / (exits60.TOI / 3600)
exits60
Out[45]:
In [46]:
# Individual level, individual
# Count by exit type
exits60 = exits2[exits2.Attempt.str.contains('WSH')] \
[['Season', 'Result', 'Team', 'Game', 'Period', 'Time', 'Attempt']] \
.drop_duplicates() \
[['Season', 'Result', 'Team', 'Attempt']] \
.assign(Count=1) \
.groupby(['Season', 'Result', 'Team', 'Attempt'], as_index=False) \
.count() \
.pivot_table(index=['Season', 'Result', 'Attempt'], columns='Team', values='Count') \
.reset_index() \
.sort_values(['Result', 'Season', 'Attempt'])
# Add TOI
toi = dfs['toi']['WSH'] \
[['Season', 'WSH1', 'WSH2', 'WSH3', 'WSH4', 'WSH5']] \
.melt(id_vars='Season', value_name='Attempt') \
.drop('variable', axis=1) \
.assign(TOI=1) \
.groupby(['Season', 'Attempt'], as_index=False).count()
toi.loc[:, 'Attempt'] = toi['Attempt'].apply(lambda x: str(wsh_players[x]) + 'WSH' if x in wsh_players else x)
exits60 = exits60.merge(toi, how='left', on=['Season', 'Attempt'])
exits60.loc[:, 'WSH60'] = exits60.WSH / (exits60.TOI / 3600)
exits60.pivot_table(index=['Attempt', 'Season'], columns='Result', values='WSH60')
Out[46]: