Download, prepare, and wrangle data into a format suitable for analyzing player shot attempts. In particular, the analysis I have in mind focuses on shot selection and frequency - however, the final data frame output will contain all shot attempt data suitable for general shot-based analysis (rebounds, assists, etc.).
The data will include, along with basic shot information:
One last note - I made the decision to not include technical free throws here, as I don't believe they're directly related to player shot frequency in terms of strategy/player behavior.
In [460]:
from bs4 import BeautifulSoup
import urllib2
import pandas as pd
import numpy as np
import re
In [2]:
# Example basketball-reference play-by-play page used as the test input.
test_url = 'http://www.basketball-reference.com/boxscores/pbp/201411230OKC.html'
In [62]:
# Output is a dataframe for play by play data
def play_by_play(url):
    """Download a play-by-play page and return its rows as a DataFrame.

    The page at ``url`` is fetched and parsed with BeautifulSoup, and the
    table with class ``no_highlight stats_table`` inside the element with
    id ``page_content`` is walked row by row.

    Each row with exactly six ``<td>`` cells becomes one record with keys:

    * ``quarter`` - running count of header rows whose text contains
      'Quarter' seen so far
    * ``time`` - text of the first cell
    * ``away_team_action`` / ``home_team_action`` - the second / sixth cell
      itself (the bs4 tag, kept as HTML for later parsing), or NaN when the
      cell's text is a lone non-breaking space
    * ``away_team_score_change`` / ``score`` / ``home_team_score_change`` -
      text of the third / fourth / fifth cell

    Any remaining non-breaking-space values are replaced with NaN.

    Raises:
        ValueError: if the expected table cannot be found in the page.
    """
    # Imports stay local so the function still works when copied on its own.
    from contextlib import closing

    from bs4 import BeautifulSoup
    import pandas as pd
    import numpy as np
    try:
        from urllib2 import urlopen  # Python 2
    except ImportError:
        from urllib.request import urlopen  # Python 3

    # closing() releases the connection even if read() raises.
    with closing(urlopen(url)) as response:
        html = response.read()

    # NOTE(review): no parser is named, so bs4 picks whichever one is
    # installed; pin one explicitly once the intended parser is confirmed.
    soup = BeautifulSoup(html)

    page_content = soup.find(id='page_content')
    pbp = None
    if page_content is not None:
        pbp = page_content.find(
            'table', attrs={'class': 'no_highlight stats_table'})
    if pbp is None:
        raise ValueError('No play-by-play table found at %s' % url)

    empty = u'\xa0'
    quarter = 0
    rows = []
    for row in pbp.find_all('tr'):
        tds = row.find_all('td')
        # The number of td cells determines what kind of row this is
        if len(tds) == 6:
            rows.append({
                'quarter': quarter,
                'time': tds[0].text,
                'away_team_action':
                    tds[1] if tds[1].text != empty else np.nan,
                'away_team_score_change': tds[2].text,
                'score': tds[3].text,
                'home_team_score_change': tds[4].text,
                'home_team_action':
                    tds[5] if tds[5].text != empty else np.nan,
            })
        elif len(tds) == 1:
            # Single-cell row: treated as a jump ball and ignored for now.
            # A parenthesised single string prints identically on Python 2
            # and Python 3.
            print("Jump ball! Ignoring for now")
        elif len(row.find_all('th')) == 1:
            # New quarter - crude check for now, come back to this.
            # NOTE(review): header rows without the word 'Quarter' (overtime
            # headers, presumably) do not advance the counter - confirm.
            if 'Quarter' in row.th.text:
                quarter += 1

    return pd.DataFrame(rows).replace(empty, np.nan)
In [412]:
# Fetch and parse the sample game into the working DataFrame.
df = play_by_play(test_url)
In [ ]:
In [413]:
# Reshape into a more workable format: collapse the away/home action columns
# into a single action column plus a home/away flag.
# The action is kept as HTML (not text) so it can be parsed further.
away_action = df['away_team_action']
home_action = df['home_team_action']

# Take the away action when present, otherwise the home action.
df['action_html'] = away_action.fillna(home_action)
# A row counts as a home row whenever the home action cell is populated.
df['is_home'] = pd.notnull(home_action)
In [414]:
# Get gametime from the "time" column.
# "time" is converted to a timedelta and subtracted from the 12-minute
# quarter length, then offset by 12 min * (quarter - 1).
# To convert to a timedelta, the string needs to be in hh:mm:ss format, so the
# clock text is normalised first.


def _clock_to_timedelta(clock):
    """Convert one clock string from the "time" column to a pd.Timedelta.

    Every '.0' in the string is removed; a remaining 5-character value
    (mm:ss) is prefixed with '00:' and a 4-character value (m:ss) with
    '00:0' so the result reads hh:mm:ss.
    """
    # Plain str.replace is used instead of Series.str.replace so the result
    # does not depend on whether the installed pandas treats the pattern as
    # a regex by default.
    clock = clock.replace('.0', '')
    if len(clock) == 5:
        clock = '00:' + clock
    elif len(clock) == 4:
        clock = '00:0' + clock
    # str() gives pd.Timedelta a native string on both Python 2 and 3
    # (encoding to UTF-8 bytes only works on Python 2).
    return pd.Timedelta(str(clock))


df['time'] = df['time'].apply(_clock_to_timedelta)

# NOTE(review): every period is assumed to be 12 minutes long; overtime
# periods would need separate handling - confirm.
minute = np.timedelta64(1, 'm')
df['gametime'] = (df['quarter'] - 1) * 12 * minute + \
    (12 * minute - df['time'])
In [415]:
# Split out score to home/away score columns.
# str.split is used rather than a regex str.replace so the result does not
# depend on whether the installed pandas treats the pattern as a regex.
# NOTE(review): the number before the dash is stored as the home score. The
# row parser reads the away team's cells to the left of the score cell, so
# the score text may actually be "away-home" - confirm against the page.
score_parts = df['score'].str.split('-')
df['home_score'] = score_parts.str[0]
df['away_score'] = score_parts.str[-1]
Now we want to split out/wrangle the data into a dataframe where each row represents a shot attempt. Ideally, the columns would be:
In the case of an and-1, shot attempt type/success will be 2/3 and true, and FTA/FTM will be populated. Otherwise, FTA/FTM should only be populated where attempt type = fouled and success = false.
One note - we're ignoring technical free throws here since the end goal of this munging is to do some analysis on player shot frequency
One last note - probably separately from this dataframe, would like to know when a player entered/exited the game to account for bench time. Will likely keep that separate
In [492]:
# First step - easy stuff - get out shot attempt and player.
# Note that free throw rows are still here; later cells fold them into
# columns on the row they belong to.
# Rebounds are pulled up from the following row with shift(-1), since that's
# a little easier than free throws.


def _second_link_href(cell):
    """Return the href of the first 'a:nth-of-type(2)' match in cell, else NaN."""
    links = cell.select('a:nth-of-type(2)')
    if links:
        return links[0]['href']
    return np.nan


def _shooter_href(cell):
    """Return the href identifying the player credited with the attempt.

    For 'makes'/'misses' rows this is the first link in the cell; for
    shooting fouls it is the second link (the player the foul was drawn by).
    """
    text = cell.text
    # The parentheses matter: `and` binds tighter than `or`, so without them
    # a 'misses' row with no link would subscript None.
    if cell.a and ('makes' in text or 'misses' in text):
        return cell.a['href']
    lowered = text.lower()
    if 'drawn by' in lowered and 'shooting foul' in lowered:
        return _second_link_href(cell)
    return np.nan


def _shot_attempt_type(cell):
    """Classify the row as '2', '3', 'free throw', 'fouled' or NaN."""
    lowered = cell.text.lower()
    if '2-pt' in lowered:
        return '2'
    if '3-pt' in lowered:
        return '3'
    if 'free throw' in lowered:
        return 'free throw'
    if 'shooting foul' in lowered and 'drawn by' in lowered:
        return 'fouled'
    return np.nan


def _shot_made(cell):
    """Return True for 'makes', False for 'misses', NaN otherwise."""
    text = cell.text
    if 'makes' in text:
        return True
    if 'misses' in text:
        return False
    return np.nan


def _assister_href(cell):
    """Return the second link's href when the row mentions an assist."""
    if 'assist' in cell.text.lower():
        return _second_link_href(cell)
    return np.nan


def _rebounder_href(cell):
    """Return the first link's href when the row is a rebound with a link."""
    if cell.a and 'rebound' in cell.text.lower():
        return cell.a['href']
    return np.nan


def _rebound_type(cell):
    """Return 'offensive' or 'defensive' for rebound rows, NaN otherwise."""
    lowered = cell.text.lower()
    if 'rebound' not in lowered:
        return np.nan
    if 'offensive' in lowered:
        return 'offensive'
    if 'defensive' in lowered:
        return 'defensive'
    return np.nan


df['player'] = df['action_html'].apply(_shooter_href)
df['shot_attempt_type'] = df['action_html'].apply(_shot_attempt_type)
df['shot_made'] = df['action_html'].apply(_shot_made)
df['assisted_by'] = df['action_html'].apply(_assister_href)
# shift(-1) moves each rebound onto the row immediately before it.
df['rebounded_by'] = df['action_html'].apply(_rebounder_href).shift(-1)
df['rebound_type'] = df['action_html'].apply(_rebound_type).shift(-1)
# Compiled once; captures only the digits so that a later " ft" in the same
# description cannot be swallowed into the match.
_SHOT_DISTANCE_RE = re.compile(r'from\s+(\d+)\s+ft')


def extract_shot_distance(text):
    """Return the shot distance in feet parsed from a play description.

    Returns 0 when the text contains 'at rim', the integer N when it
    contains 'from N ft', and None when neither is present.

    Note - bball ref fouls don't track distance, so those come back as None.
    """
    if 'at rim' in text:
        return 0
    match = _SHOT_DISTANCE_RE.search(text)
    if match is None:
        return None
    return int(match.group(1))
def _row_shot_distance(cell):
    """Return the parsed distance for 'makes'/'misses' rows, NaN otherwise."""
    description = cell.text
    if 'makes' in description or 'misses' in description:
        return extract_shot_distance(description)
    return np.nan


# Only made/missed attempts carry a distance; every other row gets NaN.
df['shot_distance'] = df['action_html'].apply(_row_shot_distance)
In [494]:
# Attach free throw makes/misses/attempts to the 'fouled' row they belong to.
# We know that each player has to perform his own shot attempts, so we can group by the player and
# by gametime since the game clock doesn't change during FT attempts.
# Within each (player, gametime) window we keep only the 'free throw' rows and
# count how many were made (True) vs missed (False).
ft_made = df[ (df['shot_attempt_type'] == 'free throw') | \
(df['shot_attempt_type'] == 'fouled')].groupby(\
['player', 'gametime']).apply(lambda x: \
x[(x['shot_attempt_type'] == 'free throw')]['shot_made'].value_counts()).reset_index()
# NOTE - there is definitely a better way to do this. A shot attempt type
# column is added so the counts merge only onto 'fouled' rows. Also, given how
# groupby + value_counts() returns data, the made and missed counts are merged
# back onto the data frame as separate columns (ftm / ft missed), and fta is
# then calculated from them. Again, almost definitely a better way to do it;
# this was the first thought I had that was easy to implement.
ft_made['shot_attempt_type'] = 'fouled'
# NOTE(review): the rename below is positional - it assumes reset_index()
# yields exactly player, gametime, the shot_made value and its count, in that
# order - TODO confirm for the pandas version in use.
ft_made.columns = ['player', 'gametime', 'shot_made', 'num', 'shot_attempt_type']
# NOTE(review): assigning a column of the merge result back to df relies on
# the left merge returning one row per df row, in df's order, with an index
# that lines up with df's - confirm.
df['ftm'] = df.merge(ft_made[ft_made['shot_made'] == True][\
['player', 'gametime', 'num', 'shot_attempt_type']], \
on=['player', 'gametime', 'shot_attempt_type'], \
how='left')['num']
df['ft missed'] = df.merge(ft_made[ft_made['shot_made'] == False][\
['player', 'gametime', 'num', 'shot_attempt_type']], \
on=['player', 'gametime', 'shot_attempt_type'], \
how='left')['num']
# 'fouled' rows with no matching count get 0 rather than NaN so the sum below
# is defined for them; all other rows keep NaN.
df.loc[ (pd.isnull(df['ft missed'])) & (df.shot_attempt_type == 'fouled'), \
'ft missed'] = 0
df.loc[ (pd.isnull(df['ftm'])) & (df.shot_attempt_type == 'fouled'), 'ftm'] = 0
# Attempts = makes + misses.
df['fta'] = df['ftm'] + df['ft missed']
In [444]:
In [ ]:
In [495]:
# Last step is team score and opponent score - need to switch based on
# home/away action.
# Series.where picks the first series where is_home is True and the second
# elsewhere. This replaces a chained `df[col].fillna(..., inplace=True)`,
# which mutates a possibly temporary Series and may leave df unchanged.
df['team_score'] = df['home_score'].where(df['is_home'], df['away_score'])
df['opponent_score'] = df['away_score'].where(df['is_home'], df['home_score'])
In [ ]:
In [496]:
# Columns kept in the final one-row-per-shot-attempt frame.
shot_columns = [
    'gametime', 'quarter', 'is_home', 'team_score', 'opponent_score',
    'player', 'shot_attempt_type', 'shot_distance', 'shot_made',
    'assisted_by', 'rebounded_by', 'rebound_type', 'ftm', 'fta',
]

# Keep rows that have a shot attempt type, excluding the individual free
# throw rows.
has_attempt_type = pd.notnull(df.shot_attempt_type)
not_free_throw = df.shot_attempt_type != 'free throw'
game_shots = df[has_attempt_type & not_free_throw][shot_columns].copy()
In [498]:
# Preview the first 15 shot attempts.
game_shots.head(15)
Out[498]:
Check a few stats here against box score stats. Even better - turn that into an automated method/test - download/parse box score stats and calculate/compare!
Stats to check for a given player:
In [499]:
# To be continued
In [ ]:
In [195]:
In [196]:
In [197]:
In [ ]:
In [199]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: