In [1]:
import pandas as pd
import numpy as np
The data comes from the MLB PitchFX dataset. It's publicly available and is updated very frequently.
I used `mlb_terminal` to collect two months' worth of data from the 2013 season. You can see how in the bash script `scrape-mlb.sh`.
In [4]:
# Shell escape: hand one raw inning XML URL to the OS `open` command so the
# source format can be inspected by eye.
# NOTE(review): `open` is presumably the macOS launcher; other platforms need a
# different command (e.g. `xdg-open`) — confirm before running elsewhere.
! open http://gd2.mlb.com/components/game/mlb/year_2012/month_06/day_01/gid_2012_06_01_arimlb_sdnmlb_1/inning/inning_2.xml
In [30]:
# Read the scraped pitch CSV into a DataFrame, then show its first five rows.
df = pd.read_csv(filepath_or_buffer="./baseball-pitches.csv")
df.head(n=5)
Out[30]:
Let's limit this to fewer columns.
Cleaning the pitch_name column.
In [31]:
# Lookup rows for the pitch_name cleanup, one "ABBREVIATION,Full name" string
# per pitch-type abbreviation. Several abbreviations share one full name.
lu = [
    "FA,Fastball",
    "FF,Fastball",
    "FT,Fastball",
    "FC,Cut fastball",
    "FS,Fastball (sinker|split-fingered)",
    "SI,Fastball (sinker|split-fingered)",
    "SF,Fastball (sinker|split-fingered)",
    "SL,Slider",
    "CH,Changeup",
    "CB,Curveball",
    "CU,Curveball",
    "KC,Curveball",
    "KN,Knuckleball",
    "EP,Eephus",
    "UN,Unidentified",
    "XX,Unidentified",
    "PO,Pitch out",
    "FO,Pitch out",
]
In [32]:
# Expand the pitch-type abbreviations in `pitch_name` to their full names.
# Each `lu` entry is an "ABBREVIATION,Full name" string; build the mapping once
# and apply it in a single vectorized replace rather than rewriting the whole
# column once per lookup row. Values without a lookup entry are left unchanged.
abbrv_to_name = dict(row.split(',', 1) for row in lu)
df['pitch_name'] = df['pitch_name'].replace(abbrv_to_name)
In [33]:
# Preview the first 10 columns (by position) of the first five rows.
# `.ix` was removed in pandas 1.0; `.iloc` is the positional replacement.
df.iloc[:, :10].head()
Out[33]:
In [34]:
# Preview the columns from position 25 onward for the first five rows.
# `.ix` was removed in pandas 1.0; `.iloc` is the positional replacement.
df.iloc[:, 25:].head()
Out[34]:
In [35]:
# Preview the columns from position 25 onward for the first five rows.
# `.ix` was removed in pandas 1.0; `.iloc` is the positional replacement.
# NOTE(review): this slice is identical to the one in the preceding cell; a
# different column range may have been intended — confirm.
df.iloc[:, 25:].head()
Out[35]:
In [37]:
# Keep only rows whose pitch_name is present and is not one of the excluded
# labels ("IN", "Pitch out", "SC").
excluded_pitch = df.pitch_name.isin(["IN", "Pitch out", "SC"])
has_pitch_name = df.pitch_name.notnull()
df = df[~excluded_pitch & has_pitch_name]
In [38]:
# Write the cleaned pitches to a new CSV, leaving the row index out of the file.
df.to_csv(path_or_buf="./baseball-pitches-clean.csv", index=False)