In [178]:
import pybaseball as pyball
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [179]:
# Fetch the schedule-and-record table of every NL Central club for one season.
year = 2018
brewers, cardinals, cubs, pirates, reds = [
    pyball.schedule_and_record(year, abbrev)
    for abbrev in ('MIL', 'STL', 'CHC', 'PIT', 'CIN')  # same order as the names on the left
]
A quick glance at the summary statistics for each team in order of their ending division rank.
Looking at these statistics, you can see the similarity in overall performance. Mean Runs are very similar for the top three teams with the #3 Cardinals slightly edging out the Brewers and Cubs. However, in Runs Allowed, the #2 Cubs come out in front, with the Brewers second, and the Cardinals third.
Interestingly, the hottest streak belongs to the Pirates, who ended the season ranked 4th in the division.
In [180]:
# Summary statistics for the Brewers' schedule-and-record frame
brewers.describe()
Out[180]:
In [181]:
# Summary statistics for the Cubs' schedule-and-record frame
cubs.describe()
Out[181]:
In [182]:
# Summary statistics for the Cardinals' schedule-and-record frame
cardinals.describe()
Out[182]:
In [183]:
# Summary statistics for the Pirates' schedule-and-record frame
pirates.describe()
Out[183]:
In [184]:
# Summary statistics for the Reds' schedule-and-record frame
reds.describe()
Out[184]:
First we gather each team's Wins, Losses, and Win Percentages at the time of each game
In [185]:
# Gets wins-to-date, losses-to-date, and win_percent-to-date for each team
# 'W/L' codes counted as a win / as a loss; the '-wo' variants are presumably
# walk-off results -- confirm against the data source.
WIN_RESULTS = ('W', 'W-wo')
LOSS_RESULTS = ('L', 'L-wo')


def add_record_columns(team):
    """Add running 'Wins', 'Losses' and 'Win_Percentage' columns to `team` in place.

    Parameters
    ----------
    team : pandas.DataFrame
        A team frame holding a 'W/L' column with one result code per game.

    Returns
    -------
    pandas.DataFrame
        The same frame, returned for convenience.
    """
    # Rows whose code is in neither tuple add nothing to either running total.
    team['Wins'] = team['W/L'].isin(WIN_RESULTS).astype(int).cumsum()
    team['Losses'] = team['W/L'].isin(LOSS_RESULTS).astype(int).cumsum()
    team['Win_Percentage'] = team['Wins'] / (team['Wins'] + team['Losses'])
    return team


for team in (brewers, cubs, cardinals, pirates, reds):
    add_record_columns(team)
Here we can see that the Brewers and Cubs were in a tightly contested race all year long. At about game 120, the Brewers went into a small slump while the Cardinals ascended, leading to a short period where the Brewers dropped to 3rd place. This didn't last long, however, as the Brewers got back into a groove and found their way to 2nd and ultimately 1st place.
In [186]:
# Line chart of cumulative wins per game, one line per NL Central team
plt.rcParams["figure.figsize"] = (8,6)

# (frame, legend label, line colour) -- drawn in this order
team_styles = (
    (brewers, 'Brewers', 'navy'),
    (cardinals, 'Cardinals', '#b30000'),
    (cubs, 'Cubs', 'blue'),
    (pirates, 'Pirates', 'gold'),
    (reds, 'Reds', 'red'),
)
for frame, label, color in team_styles:
    plt.plot(frame['Wins'], label=label, c=color)

# Tick every 10th game; the Brewers' frame length sets the x range
plt.xticks(np.arange(0, len(brewers.index), step=10))
plt.xlabel('Game')
plt.ylabel('Wins')
plt.legend(loc='lower right')
plt.title("NL Central Wins Comparison ({})".format(year))
Out[186]:
In [187]:
# Line chart of running win percentage per game, one line per NL Central team
plt.rcParams["figure.figsize"] = (8,6)

# (frame, legend label, line colour) -- drawn in this order
team_styles = (
    (brewers, 'Brewers', 'navy'),
    (cardinals, 'Cardinals', '#b30000'),
    (cubs, 'Cubs', 'blue'),
    (pirates, 'Pirates', 'gold'),
    (reds, 'Reds', 'red'),
)
for frame, label, color in team_styles:
    plt.plot(frame['Win_Percentage'], label=label, c=color)

# Tick every 10th game; the Brewers' frame length sets the x range
plt.xticks(np.arange(0, len(brewers.index), step=10))
plt.xlabel('Game')
plt.ylabel('Win Percentage')
plt.legend(loc='lower right')
plt.title("NL Central Win Percentage Comparison ({})".format(year))
Out[187]:
First we create a modified column to numerically represent 'Games Back'
In [188]:
# Helper for the 'int_GB' column: GB expressed as a negative number (games behind)
# or 0.0 (in first place or tied for first).
def modify_gb(gb):
    """Convert one raw 'GB' (games back) entry into a non-positive float.

    Parameters
    ----------
    gb : str or number
        'Tied', a lead containing 'up' (e.g. 'up 2.0'), or the number of
        games behind, either as text ('3.5') or already numeric (3.5).

    Returns
    -------
    float
        0.0 when tied or leading, otherwise the games-behind value negated.

    Raises
    ------
    ValueError
        If a text entry is none of the above and cannot be parsed as a number.
    """
    # An already-numeric entry carries no 'Tied'/'up' marker, so it can only
    # mean games behind; the substring test below would fail on a non-string.
    if not isinstance(gb, str):
        return -float(gb)

    text = gb.strip()
    # The size of a lead is deliberately discarded: any team in front is
    # placed on the 0.0 line so the top of the chart is always first place.
    if text == 'Tied' or 'up' in text:
        return 0.0
    return -float(text)
# Build the numeric 'int_GB' column for every team; rows with a missing 'GB'
# are dropped before conversion.
for club in (brewers, cardinals, cubs, pirates, reds):
    club['int_GB'] = club['GB'].dropna().apply(modify_gb)
Graphing 'Games Back' gives an even clearer image of how the NL Central was contended throughout the year.
The Reds fell to an early 10 Games Back, and never recovered. They ended the year at a season low of 28 Games Back.
The Pirates, while contending early on, fell to ~13 Games Back mid-season. They showed signs of life in games 90-110, even surpassing the Cardinals for 3rd place. However, they would soon fall to a season low of ~15 Games Back.
The Brewers and Cubs were consistently within ~4 Games of each other for most of the season. While the Brewers had a rough patch in games 110-135 -- dropping to a season low of 6 Games Back, and falling behind the Cardinals for 2nd place -- they quickly recovered in the final games of the season. They would go on to defeat the Cubs in a Game 163 Divisional Tie-Breaker, becoming the NL Central Champions.
In [189]:
# Line chart of games back (plotted as negative values) per game, one line per team
plt.rcParams["figure.figsize"] = (8,6)

# (frame, legend label, line colour) -- drawn in this order
team_styles = (
    (brewers, 'Brewers', 'navy'),
    (cardinals, 'Cardinals', '#b30000'),
    (cubs, 'Cubs', 'blue'),
    (pirates, 'Pirates', 'gold'),
    (reds, 'Reds', 'red'),
)
for frame, label, color in team_styles:
    plt.plot(frame['int_GB'], label=label, c=color)

# Tick every 10th game on x and every 2 games (downwards from 0) on y
plt.xticks(np.arange(0, len(brewers.index), step=10))
plt.yticks(np.arange(0, -30, step=-2))
plt.xlabel('Game')
plt.ylabel('Games Back (in negative)')
plt.title("NL Central Games Back Comparison ({})".format(year))
plt.legend(loc='lower left')
Out[189]:
First we create new columns for Total Runs, and Total Runs Allowed
In [190]:
# Running totals of runs scored ('R') and runs allowed ('RA') after each game, per team
for club in (brewers, cubs, cardinals, pirates, reds):
    club['Total_Runs'] = club['R'].cumsum()
    club['Total_RA'] = club['RA'].cumsum()
Graphing Runs and Runs Allowed illustrates some of the causal factors in each team's divisional rank.
The Reds were scoring Runs at a rate similar to every team but the Cubs for most of the season. However, they were the worst in Runs Allowed by a large margin. By the end of the season, the Reds had given up over 800 runs while no other team broke the 700 mark.
The Pirates had a similar problem. While they scored at comparable rates to the rest of the division, their pitching didn't fare so well. They were the 2nd worst in Runs Allowed almost all season.
The Cardinals were also scoring at rates comparable to the rest of the division. However, in Runs Allowed they consistently outpaced the Brewers and Cubs until late season.
You can see the decreased rate of Runs Allowed for the Cardinals from games 110-130, while in the same stretch of time the Brewers saw an increase in their rate of Runs Allowed. This coincides with the time period in which the Cardinals and Brewers swapped for 2nd place.
The Cubs consistently outscored the entire division right up to the end of the season. At the same time, their Runs Allowed were close to the Brewers all year. By these two graphs alone, I would have predicted the Cubs won the division. It would, however, come down to the final tie-breaker game, with the Brewers coming away victorious.
In [191]:
# Two side-by-side panels: cumulative Runs (left) and cumulative Runs Allowed (right)
plt.rcParams["figure.figsize"] = (16,8)

# (frame, legend label, line colour) -- drawn in this order on both panels
team_styles = (
    (brewers, 'Brewers', 'navy'),
    (cardinals, 'Cardinals', '#b30000'),
    (cubs, 'Cubs', 'blue'),
    (pirates, 'Pirates', 'gold'),
    (reds, 'Reds', 'red'),
)
# Tick every 10th game; the Brewers' frame length sets the x range
game_ticks = np.arange(0, len(brewers.index), step=10)

# Left panel: runs scored
plt.subplot(1,2,1)
for frame, label, color in team_styles:
    plt.plot(frame['Total_Runs'], label=label, c=color)
plt.xticks(game_ticks)
plt.xlabel('Game')
plt.ylabel('Runs')
plt.title("NL Central Runs Comparison ({})".format(year))
plt.legend(loc='lower right')

# Right panel: runs allowed
plt.subplot(1,2,2)
for frame, label, color in team_styles:
    plt.plot(frame['Total_RA'], label=label, c=color)
plt.xticks(game_ticks)
plt.xlabel('Game')
plt.ylabel('Runs Allowed')
plt.title("NL Central Runs Allowed Comparison ({})".format(year))
plt.legend(loc='lower right')
Out[191]:
First we get each team's total home_attendance-to-date
In [192]:
# Running home attendance per team: cumulative sum of 'Attendance' over rows
# flagged 'Home', skipping home games with no attendance recorded.
for club in (brewers, cubs, cardinals, pirates, reds):
    home_games = club[club['Home_Away'] == 'Home']
    club['Total_Home_Attendance'] = home_games['Attendance'].dropna().cumsum()
The Reds and Pirates saw dramatically fewer people come to the ballpark compared to the rest of the division, with the Pirates coming in last with ~1.5 million attendees.
The Cubs beat out the Brewers by about 500k attendees. Given Miller Park's capacity (41,900) and Wrigley Field's capacity (41,649), this seems to show a greater interest from Cubs fans than Brewers fans.
The Cardinals come in first at ~3.4 million attendees. This may be caused by Busch Stadium's slightly larger capacity of 49,676.
In [193]:
# Line chart of cumulative home attendance, one line per NL Central team
plt.rcParams["figure.figsize"] = (8,8)

# Keep only the rows that carry a running total and renumber them 0..n-1,
# so the x axis counts home games rather than overall game numbers.
brewers_attendance = brewers['Total_Home_Attendance'].dropna().reset_index(drop=True)
cardinals_attendance = cardinals['Total_Home_Attendance'].dropna().reset_index(drop=True)
cubs_attendance = cubs['Total_Home_Attendance'].dropna().reset_index(drop=True)
pirates_attendance = pirates['Total_Home_Attendance'].dropna().reset_index(drop=True)
reds_attendance = reds['Total_Home_Attendance'].dropna().reset_index(drop=True)

# (series, legend label, line colour) -- drawn in this order
attendance_styles = (
    (brewers_attendance, 'Brewers', 'navy'),
    (cardinals_attendance, 'Cardinals', '#b30000'),
    (cubs_attendance, 'Cubs', 'blue'),
    (pirates_attendance, 'Pirates', 'gold'),
    (reds_attendance, 'Reds', 'red'),
)
for series, label, color in attendance_styles:
    plt.plot(series, label=label, c=color)

plt.xlabel('Game')
plt.ylabel('Total Season Attendance')
plt.title("NL Central Total Home Attendance Comparison ({})".format(year))
plt.legend(loc='lower right')
Out[193]: