In [1]:
import os
import urllib.request
import webbrowser
import pandas as pd
from bs4 import BeautifulSoup
In [2]:
url = 'http://www.pro-football-reference.com/years/2015/passing.htm'
webbrowser.open_new_tab(url)
Out[2]:
In [3]:
# The url we will be scraping
url_2015 = "http://www.pro-football-reference.com/years/2015/passing.htm"
# get the html
html = urllib.request.urlopen(url_2015)
# create the BeautifulSoup object
soup = BeautifulSoup(html, "lxml")
In [4]:
# Extract the necessary values for the column headers from the table
# and store them as a list
column_headers = [th.getText() for th in soup.findAll('th', limit=30)]
column_headers = [s for s in column_headers if len(s) != 0]
column_headers = column_headers[1:]
column_headers
Out[4]:
In [5]:
len(column_headers)
Out[5]:
In [6]:
# The data is found within the table rows
# We want every row after the header row (the first row)
table_rows = soup.find_all("tr")[1:]
print(type(table_rows))
table_rows[0] # take a look at the first row
Out[6]:
The data we want for each player is found within the td (or table data) elements. Below I've created a function that extracts the data we want from table_rows. The comments should walk you through what each part of the function does.
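To see that extraction in isolation first, here is a minimal sketch on a made-up HTML snippet (not actual Pro-Football-Reference markup), just to illustrate how find_all("td") and get_text() pull the cell text out of a single row:

from bs4 import BeautifulSoup

# a made-up table row for illustration only
toy_html = "<table><tr><th>1</th><td>Drew Brees</td><td>NOR</td><td>36</td></tr></table>"
toy_row = BeautifulSoup(toy_html, "lxml").find("tr")
print([td.get_text() for td in toy_row.find_all("td")])
# ['Drew Brees', 'NOR', '36']

Rows that contain only th elements (like the repeated header rows in the table) produce an empty list, which is what the function below relies on to skip them.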
In [7]:
def extract_player_data(table_rows):
    """
    Extract and return the desired information from the td elements within
    the table rows.
    """
    # create the empty list to store the player data
    player_data = []

    for row in table_rows:  # for each row do the following
        # Get the text for each table data (td) element in the row
        player_list = [td.get_text() for td in row.find_all("td")]

        # there are some empty table rows, which are the repeated
        # column headers in the table; we skip over those rows
        # and continue the for loop
        if not player_list:
            continue

        # Now append the data to the list of player data
        player_data.append(player_list)

    return player_data
Now we can create a DataFrame with the 2015 passing data.
In [8]:
# extract the data we want
data = extract_player_data(table_rows)
# and then store it in a DataFrame
df_2015 = pd.DataFrame(data)
In [9]:
data[1]
Out[9]:
In [10]:
df_2015.columns = ['Player_name']+column_headers
In [11]:
df_2015.head()
Out[11]:
Scraping the passing data since 1932 is essentially the same process as above, just repeated for each year using a for loop.
As we loop over the years, we will create a DataFrame for each year of passing data and append it to a large list of DataFrames that contains all the passing data. We will also keep a separate list that will contain any errors and the url associated with each error. This will let us know if there are any issues with our scraper, and which url is causing the error.
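Just to make the loop concrete before running it, here is a quick look at the urls the template below generates (using the same template string as the scraping code):

url_template = "http://www.pro-football-reference.com/years/{year}/passing.htm"

for year in (1932, 1933, 2016):
    print(url_template.format(year=year))
# http://www.pro-football-reference.com/years/1932/passing.htm
# http://www.pro-football-reference.com/years/1933/passing.htm
# http://www.pro-football-reference.com/years/2016/passing.htm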
In [12]:
# Create an empty list that will contain all the dataframes
# (one dataframe for each year of passing data)
passing_dfs_list = []
# a list to store any errors that may come up while scraping
errors_list = []
In [13]:
# The url template into which we insert the year of the passing data
url_template = "http://www.pro-football-reference.com/years/{year}/passing.htm"
# for each year from 1932 to (and including) 2016
for year in range(1932, 2017):
    # Use try/except block to catch and inspect any urls that cause an error
    try:
        # get the passing data url
        url = url_template.format(year=year)
        # get the html
        html = urllib.request.urlopen(url)
        # create the BeautifulSoup object
        soup = BeautifulSoup(html, "lxml")
        # get the column headers
        column_headers = [th.getText() for th in soup.findAll('th', limit=30)]
        column_headers = [s for s in column_headers if len(s) != 0]
        column_headers = column_headers[1:]
        # select the data from the table
        table_rows = soup.find_all("tr")[1:]
        # extract the player data from the table rows
        player_data = extract_player_data(table_rows)
        # create the dataframe for the current year's passing data
        year_df = pd.DataFrame(player_data)
        # add the year of the passing data to the dataframe
        year_df.insert(0, "Year", year)
        # append the current dataframe to the list of dataframes
        passing_dfs_list.append(year_df)
    except Exception as e:
        # Store the url and the error it causes in a list
        error = [url, e]
        # then append it to the list of errors
        errors_list.append(error)
In [14]:
print(len(errors_list))
errors_list
Out[14]:
We don't get any errors, so that's good.
Now we can concatenate all the DataFrames we scraped and create one large DataFrame containing all the passing data.
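As a quick side note on how pd.concat behaves, here is a small made-up example: by default the original row indexes are preserved, which is why each year's rows will still start at index 0 in the combined DataFrame (pass ignore_index=True if you would rather have a single 0..n-1 index):

import pandas as pd

df_a = pd.DataFrame({"name": ["A", "B"]})
df_b = pd.DataFrame({"name": ["C", "D"]})

print(pd.concat([df_a, df_b], axis=0).index.tolist())             # [0, 1, 0, 1]
print(pd.concat([df_a, df_b], ignore_index=True).index.tolist())  # [0, 1, 2, 3]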
In [15]:
type(passing_dfs_list)
Out[15]:
In [16]:
passing_dfs_list[0:1]
Out[16]:
In [17]:
column_headers.insert(0, "Player_name")
column_headers.insert(0, "Year")
print(column_headers)
print(len(column_headers))
In [18]:
# store all passing data in one DataFrame
passing_df = pd.concat(passing_dfs_list, axis=0)
passing_df.columns = column_headers
In [19]:
# Take a look at the rows with index label 0 (the first row from each year's data)
passing_df.loc[0,:]
Out[19]:
Now that we fixed up the necessary columns, let's write out the raw data to a CSV file.
In [20]:
if not os.path.exists('../data/raw_data'):
    os.makedirs('../data/raw_data')

if not os.path.exists('../data/clean_data'):
    os.makedirs('../data/clean_data')
In [21]:
passing_df.head()
Out[21]:
In [22]:
# Write out the raw passing data to the raw_data folder in the data folder
passing_df.to_csv("../data/raw_data/pfr_nfl_passing_data_RAW.csv", index=False)
In [23]:
passing_df_clean = pd.read_csv("../data/raw_data/pfr_nfl_passing_data_RAW.csv")
In [24]:
passing_columns = {
    'Year': 'year', 'Player_name': 'name', 'Tm': 'team', 'Age': 'age',
    'Pos': 'position', 'G': 'games_played', 'GS': 'games_started',
    'QBrec': 'record', 'Cmp': 'completions', 'Att': 'attempts',
    'Cmp%': 'completionPct', 'Yds': 'passing_yards', 'TD': 'passing_TD',
    'TD%': 'passing_TDPct', 'Int': 'passing_INT', 'Int%': 'passing_INTPct',
    'Lng': 'passing_long', 'Y/A': 'passing_ydsAtt', 'AY/A': 'passing_airydsAtt',
    'Y/C': 'passing_ydsComp', 'Y/G': 'passing_ydsGame', 'Rate': 'passing_rating',
    'QBR': 'QBR', 'Sk': 'passing_sacks', 'Yds.1': 'passing_sacksyds',
    'NY/A': 'netydsAtt', 'ANY/A': 'passing_airnetydsAtt',
    'Sk%': 'passing_sackPct', '4QC': 'FourthQtrComebacks',
    'GWD': 'gamewinningdrives'
}
ordered_columns = [
'year', 'name', 'team', 'age', 'position', 'wins',
'losses', 'games_played', 'games_started', 'completions',
'attempts', 'completionPct', 'passing_yards',
'passing_TD', 'passing_TDPct', 'passing_INT',
'passing_INTPct', 'passing_long', 'passing_ydsAtt',
'passing_airydsAtt', 'passing_ydsComp',
'passing_ydsGame', 'passing_rating', 'passing_sacks',
'passing_sacksyds', 'passing_airnetydsAtt',
'passing_sackPct', 'FourthQtrComebacks',
'gamewinningdrives'
]
In [25]:
passing_df_clean.rename(index=str,columns=passing_columns, inplace=True)
passing_df_clean.head()
Out[25]:
In [26]:
# make sure the record and position columns are strings
passing_df_clean['record'] = passing_df_clean['record'].astype('str')
passing_df_clean['position'] = passing_df_clean['position'].astype('str')
passing_df_clean['position'] = passing_df_clean['position'].str.upper()
# replace the stray header rows and missing records with a default "0-0-0" record
passing_df_clean.loc[passing_df_clean.record == 'QBrec', 'record'] = "0-0-0"
passing_df_clean.loc[passing_df_clean.record == 'nan', 'record'] = "0-0-0"
# split the win-loss-tie record into separate wins and losses columns
unformatted_record = passing_df_clean['record'].str.replace("-", "/")
passing_df_clean['wins'] = unformatted_record.str.split("/").str[0]
passing_df_clean['losses'] = unformatted_record.str.split("/").str[1]
# strip the "*" and "+" markers from the player names
passing_df_clean['name'] = passing_df_clean['name'].str.replace('[+|*]', "")
passing_df_clean['wins'] = passing_df_clean['wins'].astype('float')
# keep (and reorder) only the columns listed in ordered_columns
passing_df_clean = pd.DataFrame(data=passing_df_clean, columns=ordered_columns)
In [27]:
# convert the data to proper numeric types
passing_df_clean = passing_df_clean.apply(pd.to_numeric, errors="ignore")
passing_df_clean.info()
We are not done yet. A lot of our numeric columns are missing data because players didn't accumulate any of those stats. For example, some players didn't score a TD or even play a game. Let's select the columns with numeric data and then replace the NaNs (the current value that represents the missing data) with 0s, as that is a more appropriate value.
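Here is a tiny illustration of the same idea on made-up data: select the numeric columns by dtype, then fill their NaNs with 0 while leaving the string columns alone:

import numpy as np
import pandas as pd

toy = pd.DataFrame({"name": ["A", "B"], "passing_TD": [3.0, np.nan]})
num_cols = toy.columns[toy.dtypes != object]
toy.loc[:, num_cols] = toy.loc[:, num_cols].fillna(0)
print(toy)
#   name  passing_TD
# 0    A         3.0
# 1    B         0.0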
In [28]:
# Get the column names for the numeric columns
num_cols = passing_df_clean.columns[passing_df_clean.dtypes != object]
# Replace all NaNs with 0
passing_df_clean.loc[:, num_cols] = passing_df_clean.loc[:, num_cols].fillna(0)
In [29]:
passing_df_clean.info()
We are finally done cleaning the data and now we can save it to a CSV file.
In [30]:
passing_df_clean = passing_df_clean[pd.notnull(passing_df_clean['name'])]
passing_df_clean.sort_values('wins', ascending=False, axis=0, inplace=True)
In [31]:
passing_df_clean.to_csv("../data/clean_data/pfr_nfl_passing_data_CLEAN.csv", index=False)
passing_df_clean.head(10)
Out[31]:
In [32]:
import sys
import bs4
print("System and module version information: \n")
print('Python version:', sys.version_info)
print('urllib.request version:', urllib.request.__version__)
print('pandas version:', pd.__version__)
print('Beautiful Soup version:', bs4.__version__)