In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

filename = 'baseball_core/TeamsFranchises.csv'
franchises = pd.read_csv(filename)

filename1 = 'baseball_core/Teams.csv'
team_perf_historical = pd.read_csv(filename1)

# Player Specific Statistics
filename2 = 'baseball_core/Salaries.csv'
salaries_historical = pd.read_csv(filename2)

filename3 = 'baseball_core/Batting.csv'
batting_historical = pd.read_csv(filename3)

filename4 = 'baseball_core/Pitching.csv'
pitching_historical = pd.read_csv(filename4)

filename5 = 'baseball_core/Fielding.csv'
fielding_historical = pd.read_csv(filename5)

filename6 = 'baseball_core/Master.csv'
master = pd.read_csv(filename6)

filename7 = 'baseball_core/AllstarFull.csv'
allstar_historical = pd.read_csv(filename7)

In [2]:
# ----- Extract Player IDs for Debugging -------------
# temp = master[master['nameLast']=='Sale']
# print temp[temp['nameGiven']=='Christopher']
# ----- Results ----- 
# David Ortiz - ortizda01
# Chris Sale - salech01

# --------- Does the Data Make Sense? ----------------
temp_b = batting_historical[batting_historical['playerID']=='salech01']
# print temp_b[temp_b['yearID']==2016]
temp_p = pitching_historical[pitching_historical['playerID']=='salech01']
# print temp_p[['HR','yearID','G', 'H','WP']]
# print temp_p[temp_p['yearID']==2016]
temp_f = fielding_historical[fielding_historical['playerID']=='salech01']
# print temp_f[temp_f['yearID']==2016]


        playerID  yearID  stint teamID lgID POS   G  GS  InnOuts  PO   A  E  \
136451  salech01    2016      1    CHA   AL   P  32  32      680  12  19  0   

        DP  PB  WP  SB  CS  ZR  
136451   1 NaN NaN NaN NaN NaN  

In [3]:
# This section is going to be used to clean-up
# column names before creating the stitched 
# player profiles.

batting_historical = batting_historical.rename(columns={'R':'Runs', 'H':'Hits', 'HR':'HomerunsScored', \
                                                        'G':'GamesPlayed','SO':'Strikeouts', 'BB':'BaseOnBalls',\
                                                       '2B':'Doubles', '3B':'Triples','CS':'CSBatting'})

pitching_historical = pitching_historical.rename(columns={'G':'GamesPlayed', 'W':'Wins', 'L':'Losses', \
                                                          'CG':'GamesCompleted', 'H':'HitsAgainst', 'HR':'HomerunsAllowed',\
                                                         'R':'RunsAllowed', 'BB':'Walks', 'SH':'SHAgainst', \
                                                          'HBP':'BattersHBP', 'GIDP':'GIDPAllowed', 'SH':'SHAllowed', \
                                                         'IBB':'IBBAllowed', 'SF':'SFAllowed', 'GS':'GamesStarted'})

fielding_historical = fielding_historical.rename(columns={'SB':'CatcherSB', 'G':'GamesPlayed'})
# Cleanup to fill NANs with 0's to prevent merge from failing
# for metrics like Wild Pitches and Zone Rating
fielding_historical = fielding_historical.fillna(0)

allstar_historical = allstar_historical.rename(columns={'gameNum':'GamesPlayedAS'})

In [ ]:
# Verify that Column adjustments worked
print batting_historical[batting_historical['yearID']>2015].head(n=1)
print pitching_historical[pitching_historical['yearID']>2015].head(n=1)
print fielding_historical[fielding_historical['yearID']>2015].head(n=1)
print allstar_historical[allstar_historical['yearID']>2015].head(n=1)

In [5]:
# temp_sal = salaries_historical[salaries_historical['yearID'] > 2015]
temp_bat = batting_historical[batting_historical['yearID'] > 2015]
temp_pitch = pitching_historical[pitching_historical['yearID'] > 2015]
temp_field = fielding_historical[fielding_historical['yearID'] > 2015]

temp1 = pd.merge(temp_bat, temp_pitch, on=['yearID','playerID','teamID','lgID','GamesPlayed','stint'], how='inner')

In [6]:
temp2 = pd.merge(temp1, temp_field, on=['yearID','playerID','teamID','lgID','GamesPlayed','stint'], how='inner')

# Cleaning up merge
temp2['WildPitches'] = temp2['WP_x'].add(temp2['WP_y']).fillna(0)
del temp2['WP_x']
del temp2['WP_y']

# Add all star column to bat+pitch+field table
temp2['isAllStar'] = 0
# print temp2.head(n=2)
print


   playerID  yearID  stint teamID lgID  GamesPlayed  AB  Runs  Hits  Doubles  \
0  abadfe01    2016      1    MIN   AL           39   1     0     0        0   
1  abadfe01    2016      2    BOS   AL           18   0     0     0        0   

     ...      PO  A  E  DP  PB  CatcherSB  CS  ZR  WildPitches  isAllStar  
0    ...       0  3  0   1   0          0   0   0            0          0  
1    ...       0  1  0   0   0          0   0   0            1          0  

[2 rows x 58 columns]

In [ ]: