In [9]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
filename = 'baseball_core/TeamsFranchises.csv'
franchises = pd.read_csv(filename)
filename1 = 'baseball_core/Teams.csv'
team_perf_historical = pd.read_csv(filename1)
# Player Specific Statistics
filename2 = 'baseball_core/Salaries.csv'
salaries_historical = pd.read_csv(filename2)
filename3 = 'baseball_core/Batting.csv'
batting_historical = pd.read_csv(filename3)
filename4 = 'baseball_core/Pitching.csv'
pitching_historical = pd.read_csv(filename4)
filename5 = 'baseball_core/Fielding.csv'
fielding_historical = pd.read_csv(filename5)
filename6 = 'baseball_core/Master.csv'
master = pd.read_csv(filename6)
filename7 = 'baseball_core/AllstarFull.csv'
allstar_historical = pd.read_csv(filename7)
filename8 = 'baseball_core/SalaryRange.csv'
salaryrange_historical = pd.read_csv(filename8)
In [10]:
# ----- Extract Player IDs for Debugging -------------
# temp = master[master['nameLast']=='Sale']
# print temp[temp['nameGiven']=='Christopher']
# print master[master['playerID']=='alexasc01']
# ----- Results -----
# David Ortiz - ortizda01
# Chris Sale - salech01
# alexasc01
# --------- Does the Data Make Sense? ----------------
temp_b = batting_historical[batting_historical['playerID']=='salech01']
# print temp_b[temp_b['yearID']==2016]
temp_p = pitching_historical[pitching_historical['playerID']=='salech01']
# print temp_p[['HR','yearID','G', 'H','WP']]
# print temp_p[temp_p['yearID']==2016]
temp_f = fielding_historical[fielding_historical['playerID']=='salech01']
# print temp_f[temp_f['yearID']==2016]
temp_as = allstar_historical[allstar_historical['playerID']=='salech01']
# print temp_as[temp_as['yearID']==2016]
# print salaryrange_historical
In [11]:
# This section is going to be used to clean-up
# column names before creating the stitched
# player profiles.
# Renaming Batting table columns to help with merging
batting_historical = batting_historical.rename(columns={'R':'Runs', 'H':'Hits', 'HR':'HomerunsScored', \
'G':'GamesPlayed','SO':'Strikeouts', 'BB':'BaseOnBalls',\
'2B':'Doubles', '3B':'Triples','CS':'CSBatting'})
# Renaming Pitching table columns to help with merging
pitching_historical = pitching_historical.rename(columns={'G':'GamesPlayed', 'W':'Wins', 'L':'Losses', \
'CG':'GamesCompleted', 'H':'HitsAgainst', 'HR':'HomerunsAllowed',\
'R':'RunsAllowed', 'BB':'Walks', 'SH':'SHAgainst', \
'HBP':'BattersHBP', 'GIDP':'GIDPAllowed', 'SH':'SHAllowed', \
'IBB':'IBBAllowed', 'SF':'SFAllowed', 'GS':'GamesStarted'})
# Renaming Fielding table columns to help with merging
fielding_historical = fielding_historical.rename(columns={'SB':'CatcherSB', 'G':'GamesPlayed'})
# Cleanup to fill NANs with 0's to prevent merge from failing
# for metrics like Wild Pitches and Zone Rating
fielding_historical = fielding_historical.fillna(0)
# The allstar table will be cleaned up to represent a couple things nicely as follows:
# 1) We will be using GP column as gameNum attmpts to account\
# for years with 2 allstar games between 1959-mid 60's
# 2) We're setting GP to 0 where the player was selected to play but didn't\
# actually play. This is to prevent the player from impacting analysis
allstar_historical = allstar_historical.rename(columns={'GP':'GamesPlayedAS', 'startingPos':'startingPosAS'})
allstar_historical = allstar_historical[['yearID','playerID','teamID','lgID','GamesPlayedAS','startingPosAS']]
allstar_historical.ix[pd.isnull(allstar_historical['startingPosAS']), 'GamesPlayedAS'] = 0
allstar_historical['startingPosAS'] = allstar_historical['startingPosAS'].fillna(0)
# We will need this for fixing salaries on stitched player profiles
years = salaryrange_historical.yearID.unique()
salaryrange_historical = salaryrange_historical.set_index(['yearID'])
In [12]:
# Verify that Column adjustments worked
print batting_historical[batting_historical['yearID']>2015].head(n=1)
print pitching_historical[pitching_historical['yearID']>2015].head(n=1)
print fielding_historical[fielding_historical['yearID']>2015].head(n=1)
print allstar_historical[allstar_historical['yearID']>2015].head(n=1)
print salaries_historical[salaries_historical['yearID']>2015].head(n=1)
In [13]:
# Here we're going to make a decision to only use data from
# 1967 onwards. This is primarily because structured data for
# salaries is available only from that year onwards
batting_historical = batting_historical[batting_historical['yearID'] > 1966]
pitching_historical = pitching_historical[pitching_historical['yearID'] > 1966]
fielding_historical = fielding_historical[fielding_historical['yearID'] > 1966]
allstar_historical = allstar_historical[allstar_historical['yearID'] > 1966]
salaries_historical = salaries_historical[salaries_historical['yearID'] > 1966]
# Begin stitching together stats:
# 1) Merge Batting and Pitching
player_profile = pd.merge(batting_historical, pitching_historical, on=['yearID','playerID','teamID','lgID','GamesPlayed','stint'], how='inner')
In [14]:
# 2) Merge Step 1. with Fielding
player_profile = pd.merge(player_profile, fielding_historical, on=['yearID','playerID','teamID','lgID','GamesPlayed','stint'], how='inner')
# Cleaning up merge
player_profile['WildPitches'] = player_profile['WP_x'].add(player_profile['WP_y']).fillna(0)
del player_profile['WP_x']
del player_profile['WP_y']
In [15]:
# 3) Merge Step 2. with AllStar data
player_profile = pd.merge(player_profile, allstar_historical, on=['yearID','playerID','teamID','lgID'], how='left')
player_profile[['startingPosAS','GamesPlayedAS']] = player_profile[['startingPosAS','GamesPlayedAS']].fillna(0)
In [16]:
# 4) Merge Step 3. with Salary data
player_profile = pd.merge(player_profile, salaries_historical, on=['yearID','playerID','teamID','lgID'], how='left')
# It turns out that we don't have salary data
# for some players in the MLB. However, we do
# have minimum and average salary data for the
# MLB from 1967 onwards though. For players with
# no salary data, we will be using the minimum
# salary for that year
for year in years:
swap = player_profile[player_profile['yearID']==year]
swap.ix[pd.isnull(swap['salary']), 'salary'] = salaryrange_historical.loc[year]['minSalary']
player_profile[player_profile['yearID']==year] = swap
In [17]:
print player_profile.columns
print player_profile.head(n=5)
In [ ]:
In [ ]: