In [1]:
from ggplot import *
import pandas as pd
import numpy as np

In [2]:
# Load the pitch-by-pitch table (tab-separated). Only this file is read: a load whose
# result is immediately overwritten would be wasted I/O.
# NOTE(review): absolute path from the original author's machine — point it at your
# local copy of pitches.tsv before running.
df = pd.read_csv("/Users/glamp/repos/yhat/blog/code-exp/22/pitches.tsv", sep="\t")
# Preview the first 15 columns by position; .iloc is the positional indexer that
# replaces the deprecated .ix.
df.iloc[:, :15].head()


Out[2]:
pitch_time inning top_or_bottom pitcher_name hitter_name pitch_type x y start_speed end_speed sz_top sz_bottom pfx_x pfx_z px
0 2013-10-01 20:07:43 -0400 1 Top Francisco Liriano Shin-Soo Choo B 78.97 164.92 93.2 85.3 3.10 1.53 11.01 6.47 0.628
1 2013-10-01 20:07:57 -0400 1 Top Francisco Liriano Shin-Soo Choo S 82.40 131.24 93.4 85.6 3.06 1.56 10.14 7.99 0.545
2 2013-10-01 20:08:12 -0400 1 Top Francisco Liriano Shin-Soo Choo S 96.14 161.47 89.1 82.8 3.25 1.53 3.11 4.95 0.120
3 2013-10-01 20:08:31 -0400 1 Top Francisco Liriano Shin-Soo Choo S 106.44 163.19 90.0 83.3 3.25 1.53 -0.38 2.15 -0.229
4 2013-10-01 20:09:09 -0400 1 Top Francisco Liriano Ryan Ludwick B 163.95 194.28 87.7 81.6 3.62 1.78 1.62 1.93 -1.917

In [3]:
# Preview the columns from position 16 onward (.iloc replaces the deprecated .ix).
# NOTE(review): the preceding preview stops at position 14 and this one starts at 16,
# so the column at position 15 is never displayed — confirm whether that is intended.
df.iloc[:, 16:].head()


Out[3]:
x0 y0 ax ay az z0 vx0 vy0 vz0 break_y break_angle break_length pitch_type.1 type_confidence zone nasty spin_dir spin_rate comments unk
0 1.757 50 5.472 -6.862 -136.338 -6.799 20.562 32.261 -20.022 23.8 -41.3 6.3 FT 0.894 9 65 120.583 2541.561 NaN NaN
1 1.711 50 5.650 -6.693 -136.724 -3.800 19.130 31.090 -17.028 23.8 -44.6 5.4 FT 0.895 12 62 128.371 2589.087 NaN NaN
2 1.559 50 5.792 -4.763 -130.525 -5.690 5.392 26.060 -23.521 23.8 -10.4 5.8 SL 0.931 8 32 148.073 1133.227 NaN NaN
3 1.172 50 5.832 -3.519 -131.909 -5.404 -0.666 28.298 -28.316 23.8 2.6 6.8 SL 0.926 8 34 189.793 430.593 NaN NaN
4 0.194 50 5.578 -5.886 -128.348 -7.342 2.709 26.078 -28.880 23.8 -3.1 7.3 SL 0.915 13 55 140.567 482.080 NaN NaN

In [4]:
# Peek at the first few raw values of the `pitch_type` column.
df.pitch_type.head()


Out[4]:
0    B
1    S
2    S
3    S
4    B
Name: pitch_type, dtype: object

In [5]:
# Translate the single-letter codes in `pitch_type` into readable labels:
# "B" -> "ball", "S" -> "strike", "X" -> "hit"; any other code becomes None.
# Built innermost-first so each step supplies the fallback for the next one.
codes = df['pitch_type']
hit_or_none = np.where(codes == "X", "hit", None)
strike_or_later = np.where(codes == "S", "strike", hit_or_none)
df['f_pitch_result'] = np.where(codes == "B", "ball", strike_or_later)

In [6]:
# Lookup rows in "ABBREVIATION,Readable name" form, one string per pitch code.
lu = [
    "FA,Fastball",
    "FF,Fastball",
    "FT,Fastball",
    "FC,Cut fastball",
    "FS,Fastball (sinker|split-fingered)",
    "SI,Fastball (sinker|split-fingered)",
    "SF,Fastball (sinker|split-fingered)",
    "SL,Slider",
    "CH,Changeup",
    "CB,Curveball",
    "CU,Curveball",
    "KC,Curveball",
    "KN,Knuckleball",
    "EP,Eephus",
    "UN,Unidentified",
    "XX,Unidentified",
    "PO,Pitch out",
    "FO,Pitch out",
]

In [7]:
# Build the abbreviation -> readable-name mapping once and translate the column in a
# single pass, rather than rescanning the whole column for every lookup row.
pitch_names = dict(row.split(',', 1) for row in lu)
df['pitch_type.1'] = df['pitch_type.1'].replace(pitch_names)
df['f_pitch_type'] = df['pitch_type.1']
# Keep only rows whose pitch type is among the 8 most frequent. The .copy() makes the
# filtered result an independent frame, so adding columns to it later does not
# trigger SettingWithCopyWarning.
top_pitch_types = df['f_pitch_type'].value_counts().head(8).index
df = df[df['f_pitch_type'].isin(top_pitch_types)].copy()

In [ ]:
# (Stray, never-executed fragment `df.pi` removed: none of the columns shown in this
# notebook is named `pi`, so evaluating it would raise AttributeError on Run All.)

In [8]:
# Density of `start_speed`, one colored curve per `f_pitch_type`.
(
    ggplot(aes(x='start_speed', color='f_pitch_type'), data=df)
    + geom_density()
    + scale_color_brewer(type='qual')
)


Out[8]:
<ggplot: (285549633)>

In [9]:
# Density of `start_speed` colored by the `pitch_type` code, one panel per `f_pitch_type`.
(
    ggplot(aes(x='start_speed', color='pitch_type'), data=df)
    + geom_density()
    + scale_color_brewer(type='qual')
    + facet_wrap("f_pitch_type")
)


Out[9]:
<ggplot: (284992757)>

In [10]:
# Re-inspect the first 15 columns by position (.iloc replaces the deprecated .ix).
df.iloc[:, :15].head()


Out[10]:
pitch_time inning top_or_bottom pitcher_name hitter_name pitch_type x y start_speed end_speed sz_top sz_bottom pfx_x pfx_z px
0 2013-10-01 20:07:43 -0400 1 Top Francisco Liriano Shin-Soo Choo B 78.97 164.92 93.2 85.3 3.10 1.53 11.01 6.47 0.628
1 2013-10-01 20:07:57 -0400 1 Top Francisco Liriano Shin-Soo Choo S 82.40 131.24 93.4 85.6 3.06 1.56 10.14 7.99 0.545
2 2013-10-01 20:08:12 -0400 1 Top Francisco Liriano Shin-Soo Choo S 96.14 161.47 89.1 82.8 3.25 1.53 3.11 4.95 0.120
3 2013-10-01 20:08:31 -0400 1 Top Francisco Liriano Shin-Soo Choo S 106.44 163.19 90.0 83.3 3.25 1.53 -0.38 2.15 -0.229
4 2013-10-01 20:09:09 -0400 1 Top Francisco Liriano Ryan Ludwick B 163.95 194.28 87.7 81.6 3.62 1.78 1.62 1.93 -1.917

In [11]:
# Inspect column positions 16 through 29 (.iloc replaces the deprecated .ix).
df.iloc[:, 16:30].head()


Out[11]:
x0 y0 ax ay az z0 vx0 vy0 vz0 break_y break_angle break_length pitch_type.1 type_confidence
0 1.757 50 5.472 -6.862 -136.338 -6.799 20.562 32.261 -20.022 23.8 -41.3 6.3 Fastball 0.894
1 1.711 50 5.650 -6.693 -136.724 -3.800 19.130 31.090 -17.028 23.8 -44.6 5.4 Fastball 0.895
2 1.559 50 5.792 -4.763 -130.525 -5.690 5.392 26.060 -23.521 23.8 -10.4 5.8 Slider 0.931
3 1.172 50 5.832 -3.519 -131.909 -5.404 -0.666 28.298 -28.316 23.8 2.6 6.8 Slider 0.926
4 0.194 50 5.578 -5.886 -128.348 -7.342 2.709 26.078 -28.880 23.8 -3.1 7.3 Slider 0.915

In [12]:
# Jittered scatter of `px` against `pz` for every pitch, with red reference lines at
# the mean `sz_top` / `sz_bottom` and at x = -1 and x = 1 (presumably the strike-zone
# edges — confirm). Axes are clipped to [-3, 3] x [0, 6] with equal scaling.
(
    ggplot(aes('px', 'pz'), data=df)
    + geom_jitter()
    + geom_hline(color='red', yintercept=[df.sz_top.mean(), df.sz_bottom.mean()])
    + geom_vline(color='red', xintercept=[-1, 1])
    + xlim(-3, 3)
    + ylim(0, 6)
    + coord_equal()
)


Out[12]:
<ggplot: (285205393)>

In [13]:
# The ten pitchers with the most rows in the data.
df['pitcher_name'].value_counts().head(10)


Out[13]:
David Price         762
Justin Verlander    755
Chris Tillman       733
Andy Pettitte       718
Ubaldo Jimenez      698
Yu Darvish          695
Jason Vargas        691
Wade Miley          677
Jon Lester          674
J.A. Happ           672
dtype: int64

In [14]:
# Restrict to the rows thrown by David Price.
pitcher = df[df['pitcher_name'] == 'David Price']

In [15]:
# Jittered `start_speed` per `inning` for the selected pitcher.
(
    ggplot(aes(x='inning', y='start_speed'), data=pitcher)
    + geom_jitter()
)


Out[15]:
<ggplot: (284984901)>

In [17]:
# Same scatter, with points colored by `f_pitch_type`.
(
    ggplot(aes(x='inning', y='start_speed', color='f_pitch_type'), data=pitcher)
    + geom_jitter()
)


Out[17]:
<ggplot: (285250425)>

In [18]:
# `start_speed` per `inning`, split into one panel per `f_pitch_type`.
(
    ggplot(aes(x='inning', y='start_speed'), data=pitcher)
    + geom_jitter()
    + facet_wrap("f_pitch_type")
)


Out[18]:
<ggplot: (285279221)>

In [19]:
# Faceted scatter as before, with a blue `lm` smoother overlaid in each panel.
(
    ggplot(aes(x='inning', y='start_speed'), data=pitcher)
    + geom_jitter()
    + stat_smooth(method='lm', color='blue')
    + facet_wrap("f_pitch_type")
)


Out[19]:
<ggplot: (285735221)>

In [20]:
# Number each pitcher's rows 1, 2, 3, ... in row order.
# Seed every row with 1, then take a running total within each pitcher. cumsum() on
# the grouped column returns a Series carrying the original row index, so it aligns
# cleanly on assignment; groupby(...).apply(...) can return a pitcher-keyed
# MultiIndex on current pandas, which does not.
df['pitch_count'] = 1
df['pitch_count'] = df.groupby("pitcher_name")['pitch_count'].cumsum()

In [21]:
# Rebind `pitcher` to C.J. Wilson's rows (it previously held another pitcher's rows).
pitcher = df[df.pitcher_name=='C.J. Wilson']
# Function-call form of print runs unchanged on both Python 2 and Python 3.
print(len(pitcher))
# Show the running pitch number next to the pitcher name.
pitcher[['pitcher_name', 'pitch_count']].head()


653
Out[21]:
pitcher_name pitch_count
10972 C.J. Wilson 1
10973 C.J. Wilson 2
10974 C.J. Wilson 3
10975 C.J. Wilson 4
10976 C.J. Wilson 5

In [22]:
# `start_speed` against the running `pitch_count` for the selected pitcher.
(
    ggplot(aes(x='pitch_count', y='start_speed'), data=pitcher)
    + geom_point()
)


Out[22]:
<ggplot: (285105337)>

In [28]:
# Same axes, colored by `f_pitch_type`: faint points plus a smoother with its
# confidence band switched off (se=False).
(
    ggplot(aes(x='pitch_count', y='start_speed', color='f_pitch_type'), data=pitcher)
    + geom_point(alpha=0.3)
    + stat_smooth(se=False)
    + scale_color_brewer(type='qual')
)


Out[28]:
<ggplot: (285256485)>

In [32]:
# `px` vs `pz` for the selected pitcher: color encodes the `pitch_type` code, marker
# shape encodes `f_pitch_type`. Red lines mark the pitcher's mean `sz_bottom` /
# `sz_top` and x = -1, 1.
(
    ggplot(aes(x='px', y='pz', color='pitch_type', shape='f_pitch_type'), data=pitcher)
    + geom_jitter()
    + geom_hline(yintercept=[pitcher.sz_bottom.mean(), pitcher.sz_top.mean()], color='red')
    + geom_vline(xintercept=[-1, 1], color='red')
    + coord_equal()
)


Out[32]:
<ggplot: (285122721)>

In [33]:
# Histogram of `break_length` for the selected pitcher, bins one unit wide.
(
    ggplot(aes(x='break_length'), data=pitcher)
    + geom_histogram(binwidth=1)
)


Out[33]:
<ggplot: (291470841)>

In [34]:
# Same histogram with `f_pitch_type` mapped to the `color` aesthetic.
(
    ggplot(aes(x='break_length', color='f_pitch_type'), data=pitcher)
    + geom_histogram(binwidth=1)
)


Out[34]:
<ggplot: (285247893)>

In [35]:
# Same histogram with `f_pitch_type` mapped to the `fill` aesthetic instead.
(
    ggplot(aes(x='break_length', fill='f_pitch_type'), data=pitcher)
    + geom_histogram(binwidth=1)
)


Out[35]:
<ggplot: (285141541)>

In [36]:
# Filled histogram again, now semi-transparent (alpha=0.3) with black bar outlines.
(
    ggplot(aes(x='break_length',fill='f_pitch_type'), data=pitcher)
    + geom_histogram(binwidth=1, alpha=0.3, color='black')
)


Out[36]:
<ggplot: (285550089)>

In [37]:
# Count each `pitch_type` code within every (start_speed, end_speed) pair.
df_agg = df.groupby(['start_speed', 'end_speed'])['pitch_type'].value_counts()

In [38]:
# Pivot the innermost index level into columns and fill the missing combinations
# with 0.
# NOTE(review): this rebinds `df_agg` from itself, so running the cell twice
# unstacks a second time.
df_agg = df_agg.unstack(level=-1).fillna(value=0)

In [39]:
# Bin both speed columns into 5-unit-wide intervals with edges 45, 50, ..., 95.
# The end-speed bucket is cut from `end_speed` (not `start_speed`), so the two
# bucket columns describe different measurements.
# NOTE(review): the last edge is 95, so values above 95 fall outside every bin and
# become NaN — widen the range if those rows should be kept.
df['start_speed_bucket'] = pd.cut(df.start_speed, range(45, 100, 5))
df['end_speed_bucket'] = pd.cut(df.end_speed, range(45, 100, 5))

In [40]:
# Bar chart of row counts per `start_speed_bucket`.
(
    ggplot(aes(x='start_speed_bucket'), data=df)
    + geom_bar()
)


Out[40]:
<ggplot: (285142629)>

In [41]:
# Bar chart per `start_speed_bucket`, faceted by `f_pitch_type`.
# NOTE: the installed ggplot emits a RuntimeWarning here because facetting is not
# supported with geom_bar (see the warning recorded in this cell's output).
(
    ggplot(aes(x='start_speed_bucket'), data=df)
    + geom_bar()
    + facet_wrap("f_pitch_type")
)


/usr/local/Cellar/python/2.7.5/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ggplot-0.5.7-py2.7.egg/ggplot/ggplot.py:198: RuntimeWarning: Facetting is currently not supported with geom_bar. See
                    https://github.com/yhat/ggplot/issues/196 for more information
  warnings.warn(msg, RuntimeWarning)
Out[41]:
<ggplot: (285141701)>

In [42]:
# Scatter of the `x` / `y` columns for every row, with equal axis scaling.
(
    ggplot(aes(x='x', y='y'), data=df)
    + geom_point()
    + coord_equal()
)


Out[42]:
<ggplot: (285276201)>

In [224]:
# Jittered `x` / `y` scatter colored by the `pitch_type` code.
(
    ggplot(aes(x='x', y='y', color='pitch_type'), data=df)
    + geom_jitter()
    + coord_equal()
)


Out[224]:
<ggplot: (275114721)>

In [43]:
# The ten hitters with the most rows in the data.
df['hitter_name'].value_counts().head(10)


Out[43]:
Mike Trout          624
Joey Votto          588
Christian Yelich    587
Carlos Santana      585
Matt Carpenter      574
Kyle Seager         559
Jayson Werth        554
Evan Longoria       554
Shin-Soo Choo       554
Brian Dozier        551
dtype: int64

In [44]:
# Restrict to the rows where Mike Trout is the hitter.
hitter = df[df['hitter_name'] == 'Mike Trout']

In [45]:
# `px` vs `pz` for the selected hitter, colored by the `pitch_type` code. Red lines
# mark the hitter's mean `sz_bottom` / `sz_top` and x = -1, 1; the view is clipped to
# [-2, 2] x [0, 4].
(
    ggplot(aes(x='px', y='pz', color='pitch_type'), data=hitter)
    + geom_point()
    + geom_hline(yintercept=[hitter.sz_bottom.mean(), hitter.sz_top.mean()], color='red')
    + geom_vline(xintercept=[-1, 1], color='red')
    + xlim(-2, 2)
    + ylim(0, 4)
    + coord_equal()
)


Out[45]:
<ggplot: (284987949)>

In [47]:
# Same location plot for the selected hitter, colored by `f_pitch_type` instead.
(
    ggplot(aes(x='px', y='pz', color='f_pitch_type'), data=hitter)
    + geom_point()
    + geom_hline(yintercept=[hitter.sz_bottom.mean(), hitter.sz_top.mean()], color='red')
    + geom_vline(xintercept=[-1, 1], color='red')
    + xlim(-2, 2)
    + ylim(0, 4)
    + coord_equal()
)


Out[47]:
<ggplot: (285108209)>

In [49]:
# 2-D binned counts of `px` / `pz` for the selected hitter, with the same red
# reference lines and axis limits as the scatter versions.
(
    ggplot(aes(x='px', y='pz'), data=hitter)
    + stat_bin2d()
    + geom_hline(yintercept=[hitter.sz_bottom.mean(), hitter.sz_top.mean()], color='red')
    + geom_vline(xintercept=[-1, 1], color='red')
    + xlim(-2, 2)
    + ylim(0, 4)
    + coord_equal()
)


Out[49]:
<ggplot: (285129233)>

In [50]:
# 2-D binned `px` / `pz` counts for the selected hitter, one panel per `f_pitch_type`
# with fixed (shared) scales across panels.
(
    ggplot(aes(x='px', y='pz'), data=hitter)
    + stat_bin2d()
    + geom_hline(yintercept=[hitter.sz_bottom.mean(), hitter.sz_top.mean()], color='red')
    + geom_vline(xintercept=[-1, 1], color='red')
    + xlim(-2, 2)
    + ylim(0, 4)
    + facet_wrap("f_pitch_type", scales="fixed")
    + coord_equal()
)


Out[50]:
<ggplot: (285290661)>

In [51]:
# Restrict to the rows labelled 'Fastball' in `f_pitch_type`.
fastballs = df[df['f_pitch_type'] == 'Fastball']

In [55]:
# `start_speed` vs `end_speed` for the first 10000 fastball rows, colored by the
# difference between the two, using the default color gradient.
(
    ggplot(aes(x='start_speed', y='end_speed', color='start_speed - end_speed'), data=fastballs.head(10000))
    + geom_point()
    + scale_color_gradient()
)


Out[55]:
<ggplot: (285546549)>

In [73]:
# Same plot with an explicit blue-to-red gradient and equal axis scaling.
(
    ggplot(aes(x='start_speed', y='end_speed', color='start_speed - end_speed'), data=fastballs.head(10000))
    + geom_point()
    + scale_color_gradient(low="blue", high="red")
    + coord_equal()
)


Out[73]:
<ggplot: (290971149)>

In [256]:
# `x` / `y` positions of all fastball rows, shaded yellow-to-red by `start_speed`.
(
    ggplot(aes(x='x', y='y', color='start_speed'), data=fastballs)
    + geom_point()
    + scale_color_gradient(low='yellow', high='red')
    + coord_equal()
)


Out[256]:
<ggplot: (274941157)>

In [259]:
# Histogram of `start_speed` over all rows. No binwidth is passed, so ggplot falls
# back to range/30 (see the message recorded in this cell's output).
(
    ggplot(aes(x='start_speed'), data=df)
    + geom_histogram()
)


binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
Out[259]:
<ggplot: (275111153)>

In [262]:
# Row counts per `pitch_type` code.
(
    ggplot(aes(x='pitch_type'), data=df)
    + geom_bar()
)


Out[262]:
<ggplot: (278153613)>

In [264]:
# Row counts per readable pitch name (`f_pitch_type`).
(
    ggplot(aes(x='f_pitch_type'), data=df)
    + geom_bar()
)


Out[264]:
<ggplot: (275102617)>

In [271]:
# Bar chart of row counts per `f_pitch_type`.
# NOTE(review): this cell is identical to the one before it — likely a leftover
# duplicate that can be dropped.
(
    ggplot(aes(x='f_pitch_type'), data=df)
    + geom_bar()
)


Out[271]:
<ggplot: (276519045)>

In [279]:
# Density of `start_speed` with one curve per inning (inning treated as a factor),
# restricted to rows with inning < 10 and drawn with a sequential palette.
(
    ggplot(aes(x='start_speed', color='factor(inning)'), data=df[df.inning < 10])
    + geom_density()
    + scale_color_brewer(type='seq')
)


Out[279]:
<ggplot: (275944537)>

In [ ]: