In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
filename = 'baseball_core/TeamsFranchises.csv'
franchises = pd.read_csv(filename)
filename1 = 'baseball_core/Teams.csv'
team_perf_historical = pd.read_csv(filename1)
# Player Specific Statistics
filename2 = 'baseball_core/Salaries.csv'
salaries_historical = pd.read_csv(filename2)
filename3 = 'baseball_core/Batting.csv'
batting_historical = pd.read_csv(filename3)
filename4 = 'baseball_core/Pitching.csv'
pitching_historical = pd.read_csv(filename4)
filename5 = 'baseball_core/Fielding.csv'
fielding_historical = pd.read_csv(filename5)
filename6 = 'baseball_core/Master.csv'
master = pd.read_csv(filename6)
filename7 = 'baseball_core/AllstarFull.csv'
allstar_historical = pd.read_csv(filename7)
In [2]:
# ----- Extract Player IDs for Debugging -------------
# temp = master[master['nameLast']=='Sale']
# print temp[temp['nameGiven']=='Christopher']
# ----- Results -----
# David Ortiz - ortizda01
# Chris Sale - salech01
# --------- Does the Data Make Sense? ----------------
temp_b = batting_historical[batting_historical['playerID']=='salech01']
# print temp_b[temp_b['yearID']==2016]
temp_p = pitching_historical[pitching_historical['playerID']=='salech01']
# print temp_p[['HR','yearID','G', 'H','WP']]
# print temp_p[temp_p['yearID']==2016]
temp_f = fielding_historical[fielding_historical['playerID']=='salech01']
# print temp_f[temp_f['yearID']==2016]
In [3]:
# This section is going to be used to clean-up
# column names before creating the stitched
# player profiles.
batting_historical = batting_historical.rename(columns={'R':'Runs', 'H':'Hits', 'HR':'HomerunsScored', \
'G':'GamesPlayed','SO':'Strikeouts', 'BB':'BaseOnBalls',\
'2B':'Doubles', '3B':'Triples','CS':'CSBatting'})
pitching_historical = pitching_historical.rename(columns={'G':'GamesPlayed', 'W':'Wins', 'L':'Losses', \
'CG':'GamesCompleted', 'H':'HitsAgainst', 'HR':'HomerunsAllowed',\
'R':'RunsAllowed', 'BB':'Walks', 'SH':'SHAgainst', \
'HBP':'BattersHBP', 'GIDP':'GIDPAllowed', 'SH':'SHAllowed', \
'IBB':'IBBAllowed', 'SF':'SFAllowed', 'GS':'GamesStarted'})
fielding_historical = fielding_historical.rename(columns={'SB':'CatcherSB', 'G':'GamesPlayed'})
# Cleanup to fill NANs with 0's to prevent merge from failing
# for metrics like Wild Pitches and Zone Rating
fielding_historical = fielding_historical.fillna(0)
allstar_historical = allstar_historical.rename(columns={'gameNum':'GamesPlayedAS'})
In [ ]:
# Verify that Column adjustments worked
print batting_historical[batting_historical['yearID']>2015].head(n=1)
print pitching_historical[pitching_historical['yearID']>2015].head(n=1)
print fielding_historical[fielding_historical['yearID']>2015].head(n=1)
print allstar_historical[allstar_historical['yearID']>2015].head(n=1)
In [5]:
# temp_sal = salaries_historical[salaries_historical['yearID'] > 2015]
temp_bat = batting_historical[batting_historical['yearID'] > 2015]
temp_pitch = pitching_historical[pitching_historical['yearID'] > 2015]
temp_field = fielding_historical[fielding_historical['yearID'] > 2015]
temp1 = pd.merge(temp_bat, temp_pitch, on=['yearID','playerID','teamID','lgID','GamesPlayed','stint'], how='inner')
In [6]:
temp2 = pd.merge(temp1, temp_field, on=['yearID','playerID','teamID','lgID','GamesPlayed','stint'], how='inner')
# Cleaning up merge
temp2['WildPitches'] = temp2['WP_x'].add(temp2['WP_y']).fillna(0)
del temp2['WP_x']
del temp2['WP_y']
# Add all star column to bat+pitch+field table
temp2['isAllStar'] = 0
# print temp2.head(n=2)
print
In [ ]: