In [9]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

filename = 'baseball_core/TeamsFranchises.csv'
franchises = pd.read_csv(filename)

filename1 = 'baseball_core/Teams.csv'
team_perf_historical = pd.read_csv(filename1)

# Player Specific Statistics
filename2 = 'baseball_core/Salaries.csv'
salaries_historical = pd.read_csv(filename2)

filename3 = 'baseball_core/Batting.csv'
batting_historical = pd.read_csv(filename3)

filename4 = 'baseball_core/Pitching.csv'
pitching_historical = pd.read_csv(filename4)

filename5 = 'baseball_core/Fielding.csv'
fielding_historical = pd.read_csv(filename5)

filename6 = 'baseball_core/Master.csv'
master = pd.read_csv(filename6)

filename7 = 'baseball_core/AllstarFull.csv'
allstar_historical = pd.read_csv(filename7)

filename8 = 'baseball_core/SalaryRange.csv'
salaryrange_historical = pd.read_csv(filename8)

In [10]:
# ----- Extract Player IDs for Debugging -------------
# temp = master[master['nameLast']=='Sale']
# print temp[temp['nameGiven']=='Christopher']
# print master[master['playerID']=='alexasc01']
# ----- Results ----- 
# David Ortiz - ortizda01
# Chris Sale - salech01
# alexasc01

# --------- Does the Data Make Sense? ----------------
temp_b = batting_historical[batting_historical['playerID']=='salech01']
# print temp_b[temp_b['yearID']==2016]
temp_p = pitching_historical[pitching_historical['playerID']=='salech01']
# print temp_p[['HR','yearID','G', 'H','WP']]
# print temp_p[temp_p['yearID']==2016]
temp_f = fielding_historical[fielding_historical['playerID']=='salech01']
# print temp_f[temp_f['yearID']==2016]
temp_as = allstar_historical[allstar_historical['playerID']=='salech01']
# print temp_as[temp_as['yearID']==2016]
# print salaryrange_historical

In [11]:
# This section is going to be used to clean-up
# column names before creating the stitched 
# player profiles.

# Renaming Batting table columns to help with merging
batting_historical = batting_historical.rename(columns={'R':'Runs', 'H':'Hits', 'HR':'HomerunsScored', \
                                                        'G':'GamesPlayed','SO':'Strikeouts', 'BB':'BaseOnBalls',\
                                                       '2B':'Doubles', '3B':'Triples','CS':'CSBatting'})

# Renaming Pitching table columns to help with merging
pitching_historical = pitching_historical.rename(columns={'G':'GamesPlayed', 'W':'Wins', 'L':'Losses', \
                                                          'CG':'GamesCompleted', 'H':'HitsAgainst', 'HR':'HomerunsAllowed',\
                                                         'R':'RunsAllowed', 'BB':'Walks', 'SH':'SHAgainst', \
                                                          'HBP':'BattersHBP', 'GIDP':'GIDPAllowed', 'SH':'SHAllowed', \
                                                         'IBB':'IBBAllowed', 'SF':'SFAllowed', 'GS':'GamesStarted'})

# Renaming Fielding table columns to help with merging
fielding_historical = fielding_historical.rename(columns={'SB':'CatcherSB', 'G':'GamesPlayed'})
# Cleanup to fill NANs with 0's to prevent merge from failing
# for metrics like Wild Pitches and Zone Rating
fielding_historical = fielding_historical.fillna(0)

# The allstar table will be cleaned up to represent a couple things nicely as follows:
#     1) We will be using GP column as gameNum attmpts to account\
#         for years with 2 allstar games between 1959-mid 60's
#     2) We're setting GP to 0 where the player was selected to play but didn't\
#         actually play. This is to prevent the player from impacting analysis
allstar_historical = allstar_historical.rename(columns={'GP':'GamesPlayedAS', 'startingPos':'startingPosAS'})
allstar_historical = allstar_historical[['yearID','playerID','teamID','lgID','GamesPlayedAS','startingPosAS']]
allstar_historical.ix[pd.isnull(allstar_historical['startingPosAS']), 'GamesPlayedAS'] = 0
allstar_historical['startingPosAS'] = allstar_historical['startingPosAS'].fillna(0)

# We will need this for fixing salaries on stitched player profiles
years = salaryrange_historical.yearID.unique()
salaryrange_historical = salaryrange_historical.set_index(['yearID'])

In [12]:
# Verify that Column adjustments worked
print batting_historical[batting_historical['yearID']>2015].head(n=1)
print pitching_historical[pitching_historical['yearID']>2015].head(n=1)
print fielding_historical[fielding_historical['yearID']>2015].head(n=1)
print allstar_historical[allstar_historical['yearID']>2015].head(n=1)
print salaries_historical[salaries_historical['yearID']>2015].head(n=1)


        playerID  yearID  stint teamID lgID  GamesPlayed  AB  Runs  Hits  \
101333  abadfe01    2016      1    MIN   AL           39   1     0     0   

        Doubles  ...   RBI  SB  CSBatting  BaseOnBalls  Strikeouts  IBB  HBP  \
101333        0  ...     0   0          0            0           1    0    0   

        SH  SF  GIDP  
101333   0   0     0  

[1 rows x 22 columns]
       playerID  yearID  stint teamID lgID  Wins  Losses  GamesPlayed  \
44139  abadfe01    2016      1    MIN   AL     1       4           39   

       GamesStarted  GamesCompleted     ...       IBBAllowed  WP  BattersHBP  \
44139             0               0     ...                2   0           0   

       BK  BFP  GF  RunsAllowed  SHAllowed  SFAllowed  GIDPAllowed  
44139   1  138   8           11          0          1            6  

[1 rows x 30 columns]
        playerID  yearID  stint teamID lgID POS  GamesPlayed  GS  InnOuts  PO  \
134862  abadfe01    2016      1    MIN   AL   P           39   0      102   0   

        A  E  DP  PB  WP  CatcherSB  CS  ZR  
134862  3  0   1   0   0          0   0   0  
      yearID   playerID teamID lgID  GamesPlayedAS  startingPosAS
5069    2016  altuvjo01    HOU   AL              1              4
       yearID teamID lgID   playerID  salary
25575    2016    ARI   NL  ahmedni01  521600

In [13]:
# Here we're going to make a decision to only use data from 
# 1967 onwards. This is primarily because structured data for
# salaries is available only from that year onwards

batting_historical = batting_historical[batting_historical['yearID'] > 1966]
pitching_historical = pitching_historical[pitching_historical['yearID'] > 1966]
fielding_historical = fielding_historical[fielding_historical['yearID'] > 1966]
allstar_historical = allstar_historical[allstar_historical['yearID'] > 1966]
salaries_historical = salaries_historical[salaries_historical['yearID'] > 1966]

# Begin stitching together stats:
# 1) Merge Batting and Pitching
player_profile = pd.merge(batting_historical, pitching_historical, on=['yearID','playerID','teamID','lgID','GamesPlayed','stint'], how='inner')

In [14]:
# 2) Merge Step 1. with Fielding
player_profile = pd.merge(player_profile, fielding_historical, on=['yearID','playerID','teamID','lgID','GamesPlayed','stint'], how='inner')

# Cleaning up merge
player_profile['WildPitches'] = player_profile['WP_x'].add(player_profile['WP_y']).fillna(0)
del player_profile['WP_x']
del player_profile['WP_y']

In [15]:
# 3) Merge Step 2. with AllStar data
player_profile = pd.merge(player_profile, allstar_historical, on=['yearID','playerID','teamID','lgID'], how='left')
player_profile[['startingPosAS','GamesPlayedAS']] = player_profile[['startingPosAS','GamesPlayedAS']].fillna(0)

In [16]:
# 4) Merge Step 3. with Salary data
player_profile = pd.merge(player_profile, salaries_historical, on=['yearID','playerID','teamID','lgID'], how='left')

# It turns out that we don't have salary data
# for some players in the MLB. However, we do 
# have minimum and average salary data for the
# MLB from 1967 onwards though. For players with
# no salary data, we will be using the minimum
# salary for that year

for year in years:
    swap = player_profile[player_profile['yearID']==year]
    swap.ix[pd.isnull(swap['salary']), 'salary'] = salaryrange_historical.loc[year]['minSalary'] 
    player_profile[player_profile['yearID']==year] = swap

In [17]:
print player_profile.columns
print player_profile.head(n=5)


Index([u'playerID', u'yearID', u'stint', u'teamID', u'lgID', u'GamesPlayed',
       u'AB', u'Runs', u'Hits', u'Doubles', u'Triples', u'HomerunsScored',
       u'RBI', u'SB', u'CSBatting', u'BaseOnBalls', u'Strikeouts', u'IBB',
       u'HBP', u'SH', u'SF', u'GIDP', u'Wins', u'Losses', u'GamesStarted',
       u'GamesCompleted', u'SHO', u'SV', u'IPouts', u'HitsAgainst', u'ER',
       u'HomerunsAllowed', u'Walks', u'SO', u'BAOpp', u'ERA', u'IBBAllowed',
       u'BattersHBP', u'BK', u'BFP', u'GF', u'RunsAllowed', u'SHAllowed',
       u'SFAllowed', u'GIDPAllowed', u'POS', u'GS', u'InnOuts', u'PO', u'A',
       u'E', u'DP', u'PB', u'CatcherSB', u'CS', u'ZR', u'WildPitches',
       u'GamesPlayedAS', u'startingPosAS', u'salary'],
      dtype='object')
    playerID  yearID  stint teamID lgID  GamesPlayed  AB  Runs  Hits  Doubles  \
0  abernte02    1967      1    CIN   NL           70  17     0     1        0   
1  adamsmi01    1967      1    BAL   AL            3   2     1     1        0   
2  aguirha01    1967      1    DET   AL           31   2     0     1        0   
3   akerja01    1967      1    KC1   AL           57   8     0     1        0   
4  allenbo04    1967      1    CLE   AL           47   0     0     0        0   

    ...    E  DP  PB  CatcherSB  CS  ZR  WildPitches  GamesPlayedAS  \
0   ...    4   0   0          0   0   0            1              0   
1   ...    0   0   0          0   0   0            1              0   
2   ...    1   0   0          0   0   0            1              0   
3   ...    0   3   0          0   0   0            1              0   
4   ...    0   0   0          0   0   0            4              0   

   startingPosAS  salary  
0              0    6000  
1              0    6000  
2              0    6000  
3              0    6000  
4              0    6000  

[5 rows x 60 columns]

In [ ]:


In [ ]: