Building on the last two blog posts, in this post we will collect career data for the 2017-2018 MVP finalists. On May 17th, the NBA announced the 2018-2019 MVP finalists: Giannis Antetokounmpo, James Harden, and Paul George.
In [1]:
import os
import urllib.request
import webbrowser
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
In [2]:
url_2018_mvp_finalist = "https://www.basketball-reference.com/awards/awards_2018.html"
In [3]:
webbrowser.open_new_tab(url_2018_mvp_finalist)
Out[3]:
In [4]:
html_finalist = urllib.request.urlopen(url_2018_mvp_finalist)
soup_finalist = BeautifulSoup(html_finalist, "lxml")
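If the plain urlopen call ever comes back blocked or empty, a common workaround is to send a browser-like User-Agent header. Here is a minimal sketch, assuming the site accepts such requests (the header string is an arbitrary example, not from the original post):

import urllib.request
from bs4 import BeautifulSoup

# build a Request object that carries a browser-like User-Agent header
request = urllib.request.Request(
    url_2018_mvp_finalist,
    headers={"User-Agent": "Mozilla/5.0 (compatible; nba-scraper-demo)"},
)
html_finalist = urllib.request.urlopen(request)
soup_finalist = BeautifulSoup(html_finalist, "lxml")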
In [5]:
# grab the text from the first 30 table header (th) cells on the page
column_headers_finalist = [th.get_text() for th in soup_finalist.find_all('th', limit=30)]
# drop the empty header strings
column_headers_finalist = [header for header in column_headers_finalist if len(header) != 0]
# drop the leading header cell
column_headers_finalist = column_headers_finalist[1:]
In [6]:
print(f"raw column names in finalist table: {column_headers_finalist}")
column_headers_finalist = [header for header in column_headers_finalist if header not in ('Shooting', 'Advanced', 'Per Game')][:-4]
print(f"formatted column names in finalist table: {column_headers_finalist}")
print(f"{len(column_headers_finalist)} columns in finalist table")
Next, we grab all of the tr elements within the first tbody element:
In [7]:
table_rows_finalist = soup_finalist.find("tbody").find_all("tr")
print(f"the subset soup object is of type: {type(table_rows_finalist)}")
table_rows_finalist[-1]
Out[7]:
Each row is a BeautifulSoup object, so we can pull the player's profile link (the href attribute of its first a element) straight out of it:
In [8]:
player_href = table_rows_finalist[-1].find_all("a")[0]["href"]
player_href
Out[8]:
In [9]:
base_basketball_ref_url = "https://www.basketball-reference.com"
In [10]:
player_link = base_basketball_ref_url + player_href
player_link
Out[10]:
In [11]:
webbrowser.open_new_tab(player_link)
Out[11]:
In [12]:
def extract_finalist_data(table_rows):
    """
    Extract and return the desired information from the td elements within the table rows.
    :param table_rows: list of soup tr elements
    :return: list of player-year MVP finalist observations
    """
    base_basketball_ref_url = "https://www.basketball-reference.com"
    # create the empty list to store the player data
    player_data = []
    for row in table_rows:  # for each row do the following
        # get the text for each table data (td) element in the row
        player_list = [td.get_text() for td in row.find_all("td")]
        # there are some empty table rows, which are the repeated column headers
        # in the table; we skip over those rows and continue the for loop
        if not player_list:
            continue
        # the rank lives in the row's th element, and the profile link in its first a element
        player_rank = [th.get_text() for th in row.find_all("th")]
        player_href = row.find_all("a")[0]["href"]
        player_link = [base_basketball_ref_url + player_href]
        # now append the individual player data to the list of all player data
        player_info = player_rank + player_list + player_link
        player_data.append(player_info)
    return player_data
Now we can call this function on the table rows and store the extracted data in a pandas DataFrame:
In [13]:
extracted_finalist_2018_data = extract_finalist_data(table_rows_finalist)
mvp_finalist_2018_data = pd.DataFrame(extracted_finalist_2018_data)
mvp_finalist_2018_data.columns = column_headers_finalist+["player_link"]
print(f"the MVP finalist dataframe has {mvp_finalist_2018_data.shape[0]} rows (player-year observations) and {mvp_finalist_2018_data.shape[1]} columns")
mvp_finalist_2018_data.tail(6)
Out[13]:
To figure out how to extract each player's career data, we first request a single player's profile page and create a BeautifulSoup object from it:
In [14]:
player_test_table = "https://www.basketball-reference.com/players/h/hardeja01.html#per_game::none"
player_profile_request = urllib.request.urlopen(player_test_table)
player_profile_soup = BeautifulSoup(player_profile_request, "lxml")
In [15]:
column_headers_player = [th.get_text() for th in player_profile_soup.find_all('th', limit=30)]
column_headers_player = [header for header in column_headers_player if len(header) != 0]
column_headers_player = column_headers_player[1:]
print(f"the columns in the career data tables are: \n {column_headers_player} \n")
print(f"there are {len(column_headers_player)} total columns in the career table")
In [16]:
player_name = player_profile_soup.find_all("h1")[0].text
player_years_active = [th.get_text() for th in player_profile_soup.find_all("tbody")[0].find_all("th")]
player_career_data = [td.get_text() for td in player_profile_soup.find_all("td")]
In [17]:
print(f"player name: {player_name}")
print(f"player years active: {player_years_active}")
print(f"player career data: {player_career_data}")
In [18]:
def slice_per(source, step):
    """Slice source into step sub-lists, each taking every step-th element."""
    return [source[i::step] for i in range(step)]
We slice the player_career_data object every 29 steps (using the language of the slice_per function) to create a year of data, since each season row of the per-game table contains 29 td cells.
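To see what slice_per does, here is a tiny worked example on a toy list: with step=3, sub-list i collects positions i, i+3, i+6, ... of the source, so transposing the result turns the flat list back into rows.

toy = [1, 2, 3, 4, 5, 6]
print(slice_per(toy, 3))  # [[1, 4], [2, 5], [3, 6]]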
In [19]:
sliced_player_data = slice_per(player_career_data, 29)
print(type(sliced_player_data))
player_career_df = pd.DataFrame(sliced_player_data).transpose()
player_career_df.columns = column_headers_player
player_career_df
Out[19]:
In [20]:
def extract_career_data(player_link):
    """
    Extract and return a player's career per-game data from their profile page.
    :param player_link: url of a player's basketball-reference profile page
    :return: DataFrame of player-year per-game observations
    """
    player_profile_request = urllib.request.urlopen(player_link)
    # create the BeautifulSoup object
    player_profile_soup = BeautifulSoup(player_profile_request, "lxml")
    # the first tbody element holds the per-game table
    extracted_player_data = player_profile_soup.find_all("tbody")
    player_name = player_profile_soup.find_all("h1")[0].text
    player_years_active = [th.get_text() for th in extracted_player_data[0].find_all("th")]
    player_career_data = [td.get_text() for td in extracted_player_data[0].find_all("td")]
    # slice the flat list of stats into one 29-item list per season
    sliced_player_data = slice_per(player_career_data, 29)
    player_career_df = pd.DataFrame(sliced_player_data).transpose()
    player_career_df.insert(0, "Player", player_name)
    player_career_df.insert(1, "Year", player_years_active)
    return player_career_df
Now we can apply this function to every player link stored in mvp_finalist_2018_data:
In [21]:
all_player_career_data = [extract_career_data(player_link) for player_link in mvp_finalist_2018_data["player_link"]]
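One caveat worth noting: this list comprehension fires one HTTP request per player with no pause in between. A hedged variant that sleeps between requests (the 2-second delay is an arbitrary, polite choice, not from the original post):

import time

all_player_career_data = []
for player_link in mvp_finalist_2018_data["player_link"]:
    all_player_career_data.append(extract_career_data(player_link))
    time.sleep(2)  # pause so we don't hammer basketball-reference's servers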
We then use the Pandas concat function to combine all of the finalist career data into one DataFrame:
In [22]:
mvp_finalist_2019_career_data = pd.concat(all_player_career_data, axis=0)
print(f"the MVP finalist career dataframe has {mvp_finalist_2019_career_data.shape[0]} rows (player-year observations) and {mvp_finalist_2019_career_data.shape[1]} columns")
mvp_finalist_2019_career_data.head()
Out[22]:
The concatenated frame lacks proper column names, so we set them using column_headers_player along with the two new columns (Player and Year) we added in our extract_career_data function:
In [23]:
mvp_finalist_2019_career_data.columns = ["Player", "Year"] + column_headers_player
mvp_finalist_2019_career_data.columns
Out[23]:
Let's also take a look at the tail end of the mvp_finalist_2019_career_data data:
In [24]:
mvp_finalist_2019_career_data.tail()
Out[24]:
Now that we've fixed up the necessary columns, let's write the raw data out to a CSV file.
In [25]:
os.makedirs('../data/raw_data', exist_ok=True)
os.makedirs('../data/clean_data', exist_ok=True)
In [26]:
mvp_finalist_2019_career_data.to_csv("../data/raw_data/mvp_finalist_2019_career_data.csv", index=False)
In [27]:
mvp_finalist_2019_df_clean = pd.read_csv("../data/raw_data/mvp_finalist_2019_career_data.csv", encoding="Latin-1")
mvp_finalist_2019_df_clean.head()
Out[27]:
In [28]:
mvp_finalist_2019_df_clean.columns
Out[28]:
In [29]:
mvp_finalist_columns_dict = {'Player':'player', 'Year':'year', 'Age': 'age', 'Tm': 'team', 'Lg': 'league',
'Pos': 'position', 'G': 'games_played', 'GS': 'games_started', 'MP': 'avg_minutes',
'FG': 'field_goals_made_per_game', 'FGA': 'field_goals_attempted_per_game',
'FG%': 'field_goal_pct', '3P': 'three_pt_fg_made_per_game', '3PA': 'three_pt_fg_attempted_per_game',
'3P%': 'three_pt_pct', '2P': 'two_pt_fg_made_per_game', '2PA': 'two_pt_fg_attempted_per_game',
'2P%': 'two_pt_fg_pct', 'eFG%': 'effective_fg_pct',
'FT': 'free_throws_made_per_game', 'FTA': 'free_throws_attempted_per_game',
'FT%': 'free_throw_pct', 'ORB': 'offensive_rebounds_per_game', 'DRB': 'defensive_rebounds_per_game',
'TRB': 'total_rebounds_per_game', 'AST': 'assists_per_game', 'STL': 'steals_per_game',
'BLK': 'blocks_per_game', 'TOV': 'turnovers_per_game', 'PF': 'fouls_committed_per_game', 'PTS': 'points_per_game'
}
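Because DataFrame.rename silently ignores keys that don't match an existing column, a quick sanity check (my addition, not in the original post) can surface header mismatches before they slip through:

# any dict keys that won't actually rename anything?
missing_keys = set(mvp_finalist_columns_dict) - set(mvp_finalist_2019_df_clean.columns)
print(f"rename keys not found in the data: {missing_keys}")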
In [30]:
mvp_finalist_2019_df_clean.rename(index=str, columns=mvp_finalist_columns_dict, inplace=True)
mvp_finalist_2019_df_clean.head()
Out[30]:
In [31]:
mvp_finalist_2019_df_clean.columns
Out[31]:
In [32]:
mvp_finalist_2019_df_clean = mvp_finalist_2019_df_clean.apply(pd.to_numeric, errors="ignore")
mvp_finalist_2019_df_clean.info()
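A side note: errors="ignore" for pd.to_numeric is deprecated in recent pandas releases. A version-robust sketch of the same per-column idea (the helper name is mine, not from the original post):

def to_numeric_if_possible(series):
    """Convert a column to numeric, leaving it unchanged if conversion fails."""
    try:
        return pd.to_numeric(series)
    except (ValueError, TypeError):
        return series

# equivalent to .apply(pd.to_numeric, errors="ignore") on older pandas
mvp_finalist_2019_df_clean = mvp_finalist_2019_df_clean.apply(to_numeric_if_possible)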
In [33]:
num_cols_finalist = mvp_finalist_2019_df_clean.columns[mvp_finalist_2019_df_clean.dtypes != object]
mvp_finalist_2019_df_clean.loc[:, num_cols_finalist] = mvp_finalist_2019_df_clean.loc[:, num_cols_finalist].fillna(0)
mvp_finalist_2019_df_clean.info()
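An equivalent, arguably more idiomatic way to pick out the numeric columns is select_dtypes (an alternative sketch, not the original post's approach):

num_cols_finalist = mvp_finalist_2019_df_clean.select_dtypes(exclude="object").columns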
In [34]:
mvp_finalist_2019_df_clean = mvp_finalist_2019_df_clean[pd.notnull(mvp_finalist_2019_df_clean['player'])]
mvp_finalist_2019_df_clean.sort_values(['year'], ascending=False, axis=0, inplace=True)
In [35]:
mvp_finalist_2019_df_clean.to_csv("../data/clean_data/mvp_finalist_2019_df_clean.csv", index=False)
print(f" the dimensions for the final data are: {mvp_finalist_2019_df_clean.shape} (rows, columns)")
mvp_finalist_2019_df_clean.head()
Out[35]:
In [36]:
import sys
import bs4
print(f'last updated: {datetime.now().strftime("%Y-%m-%d %H:%M")} \n')
print(f"System and module version information: \n")
print(f"Python version: {sys.version_info}")
print(f"urllib.request version: {urllib.request.__version__}")
print(f"pandas version: {pd.__version__}")
print(f"Beautiful Soup version: {bs4.__version__}")