In [1]:

    
import pandas as pd
import numpy as np

The Data

The data comes from the MLB PitchFX dataset. It's publicly available and is updated very frequently.

I used mlb_terminal to collect two months' worth of data from the 2013 season. The collection commands are in the bash script scrape-mlb.sh.



In [4]:

    
# Shell escape: hands a sample per-inning XML URL to the system `open` command
# (presumably macOS, where it launches the default browser -- TODO confirm).
# NOTE(review): this URL points at a 2012 game, while the text above says the
# data was collected from the 2013 season.
! open http://gd2.mlb.com/components/game/mlb/year_2012/month_06/day_01/gid_2012_06_01_arimlb_sdnmlb_1/inning/inning_2.xml



In [30]:

    
# Read the pitch data from a CSV file in the current working directory.
df = pd.read_csv("./baseball-pitches.csv")
# Show the first rows as the cell's output.
df.head()









    Out[30]:




<class 'pandas.core.frame.DataFrame'>
Int64Index: 5 entries, 0 to 4
Data columns (total 36 columns):
pitch_time         5  non-null values
inning             5  non-null values
top_or_bottom      5  non-null values
pitcher_name       5  non-null values
hitter_name        5  non-null values
pitch_type         5  non-null values
x                  5  non-null values
y                  5  non-null values
start_speed        5  non-null values
end_speed          5  non-null values
sz_top             5  non-null values
sz_bottom          5  non-null values
pfx_x              5  non-null values
pfx_z              5  non-null values
px                 5  non-null values
pz                 5  non-null values
x0                 5  non-null values
y0                 5  non-null values
ax                 5  non-null values
ay                 5  non-null values
az                 5  non-null values
z0                 5  non-null values
vx0                5  non-null values
vy0                5  non-null values
vz0                5  non-null values
break_y            5  non-null values
break_angle        5  non-null values
break_length       5  non-null values
pitch_name         5  non-null values
type_confidence    5  non-null values
zone               5  non-null values
nasty              5  non-null values
spin_dir           5  non-null values
spin_rate          5  non-null values
comments           0  non-null values
unk                0  non-null values
dtypes: float64(28), int64(1), object(7)

Let's limit this to fewer columns.

Cleaning the pitch_name column.



In [31]:

    
# Lookup rows for pitch labels. Each entry is one "ABBREVIATION,Readable name"
# string; the part before the comma is the code, the part after is the name.
lu = [
    "FA,Fastball",
    "FF,Fastball",
    "FT,Fastball",
    "FC,Cut fastball",
    "FS,Fastball (sinker|split-fingered)",
    "SI,Fastball (sinker|split-fingered)",
    "SF,Fastball (sinker|split-fingered)",
    "SL,Slider",
    "CH,Changeup",
    "CB,Curveball",
    "CU,Curveball",
    "KC,Curveball",
    "KN,Knuckleball",
    "EP,Eephus",
    "UN,Unidentified",
    "XX,Unidentified",
    "PO,Pitch out",
    "FO,Pitch out",
]



In [32]:

    
# Turn the "ABBR,Name" rows of `lu` into an {abbreviation: readable name}
# mapping and translate the pitch_name column in a single replace call.
# Values that are not keys of the mapping (including missing values) are left
# unchanged. This replaces a loop that reassigned the column once per lookup
# row and ended with a no-op self-assignment.
pitch_name_by_abbrv = dict(row.split(',', 1) for row in lu)
df['pitch_name'] = df['pitch_name'].replace(pitch_name_by_abbrv)



In [33]:

    
# Preview the first ten columns by position. `.iloc` is used because `.ix` is
# deprecated and has been removed from current pandas releases.
df.iloc[:, :10].head()









    Out[33]:






  
    
      
      pitch_time
      inning
      top_or_bottom
      pitcher_name
      hitter_name
      pitch_type
      x
      y
      start_speed
      end_speed
    
  
  
    
      0
       2013-10-01 20:07:43 -0400
       1
       Top
       Francisco Liriano
       Shin-Soo Choo
       B
        78.97
       164.92
       93.2
       85.3
    
    
      1
       2013-10-01 20:07:57 -0400
       1
       Top
       Francisco Liriano
       Shin-Soo Choo
       S
        82.40
       131.24
       93.4
       85.6
    
    
      2
       2013-10-01 20:08:12 -0400
       1
       Top
       Francisco Liriano
       Shin-Soo Choo
       S
        96.14
       161.47
       89.1
       82.8
    
    
      3
       2013-10-01 20:08:31 -0400
       1
       Top
       Francisco Liriano
       Shin-Soo Choo
       S
       106.44
       163.19
       90.0
       83.3
    
    
      4
       2013-10-01 20:09:09 -0400
       1
       Top
       Francisco Liriano
        Ryan Ludwick
       B
       163.95
       194.28
       87.7
       81.6



In [34]:

    
# Preview the columns from position 25 onward. `.iloc` is used because `.ix` is
# deprecated and has been removed from current pandas releases.
df.iloc[:, 25:].head()









    Out[34]:






  
    
      
      break_y
      break_angle
      break_length
      pitch_name
      type_confidence
      zone
      nasty
      spin_dir
      spin_rate
      comments
      unk
    
  
  
    
      0
       23.8
      -41.3
       6.3
       Fastball
       0.894
        9
       65
       120.583
       2541.561
       NaN
      NaN
    
    
      1
       23.8
      -44.6
       5.4
       Fastball
       0.895
       12
       62
       128.371
       2589.087
       NaN
      NaN
    
    
      2
       23.8
      -10.4
       5.8
         Slider
       0.931
        8
       32
       148.073
       1133.227
       NaN
      NaN
    
    
      3
       23.8
        2.6
       6.8
         Slider
       0.926
        8
       34
       189.793
        430.593
       NaN
      NaN
    
    
      4
       23.8
       -3.1
       7.3
         Slider
       0.915
       13
       55
       140.567
        482.080
       NaN
      NaN



In [35]:

    
# Preview the columns from position 25 onward (same view as the previous cell).
# `.iloc` is used because `.ix` is deprecated and has been removed from current
# pandas releases.
df.iloc[:, 25:].head()









    Out[35]:






  
    
      
      break_y
      break_angle
      break_length
      pitch_name
      type_confidence
      zone
      nasty
      spin_dir
      spin_rate
      comments
      unk
    
  
  
    
      0
       23.8
      -41.3
       6.3
       Fastball
       0.894
        9
       65
       120.583
       2541.561
       NaN
      NaN
    
    
      1
       23.8
      -44.6
       5.4
       Fastball
       0.895
       12
       62
       128.371
       2589.087
       NaN
      NaN
    
    
      2
       23.8
      -10.4
       5.8
         Slider
       0.931
        8
       32
       148.073
       1133.227
       NaN
      NaN
    
    
      3
       23.8
        2.6
       6.8
         Slider
       0.926
        8
       34
       189.793
        430.593
       NaN
      NaN
    
    
      4
       23.8
       -3.1
       7.3
         Slider
       0.915
       13
       55
       140.567
        482.080
       NaN
      NaN



In [37]:

    
# Keep only rows that have a pitch_name and whose pitch_name is not one of the
# excluded labels ("IN", "Pitch out", "SC").
df = df[~(df.pitch_name.isin(["IN", "Pitch out", "SC"]) | df.pitch_name.isnull())]



In [38]:

    
# Write the current frame to a new CSV file; index=False leaves the row index
# out of the file.
df.to_csv("./baseball-pitches-clean.csv", index=False)

	pitch_time	inning	top_or_bottom	pitcher_name	hitter_name	pitch_type	x	y	start_speed	end_speed
0	2013-10-01 20:07:43 -0400	1	Top	Francisco Liriano	Shin-Soo Choo	B	78.97	164.92	93.2	85.3
1	2013-10-01 20:07:57 -0400	1	Top	Francisco Liriano	Shin-Soo Choo	S	82.40	131.24	93.4	85.6
2	2013-10-01 20:08:12 -0400	1	Top	Francisco Liriano	Shin-Soo Choo	S	96.14	161.47	89.1	82.8
3	2013-10-01 20:08:31 -0400	1	Top	Francisco Liriano	Shin-Soo Choo	S	106.44	163.19	90.0	83.3
4	2013-10-01 20:09:09 -0400	1	Top	Francisco Liriano	Ryan Ludwick	B	163.95	194.28	87.7	81.6

	break_y	break_angle	break_length	pitch_name	type_confidence	zone	nasty	spin_dir	spin_rate	comments	unk
0	23.8	-41.3	6.3	Fastball	0.894	9	65	120.583	2541.561	NaN	NaN
1	23.8	-44.6	5.4	Fastball	0.895	12	62	128.371	2589.087	NaN	NaN
2	23.8	-10.4	5.8	Slider	0.931	8	32	148.073	1133.227	NaN	NaN
3	23.8	2.6	6.8	Slider	0.926	8	34	189.793	430.593	NaN	NaN
4	23.8	-3.1	7.3	Slider	0.915	13	55	140.567	482.080	NaN	NaN