In [1]:
from ggplot import *
import pandas as pd
import numpy as np
In [2]:
# Load the tab-separated pitch data that every later cell reads as `df`.
# The original cell first read baseball-data.csv into `df` and then immediately
# overwrote it with pitches.tsv, so that first read was dead work and is dropped.
# DATA_DIR is the single place to edit when running from another machine.
DATA_DIR = "/Users/glamp/repos/yhat/blog/code-exp/22"
df = pd.read_table(DATA_DIR + "/pitches.tsv")
# Preview the first 15 columns by position. `.iloc` replaces `.ix`, which was
# deprecated and later removed from pandas.
df.iloc[:, :15].head()
Out[2]:
In [3]:
# Preview the columns after the first 15. The slice starts at position 15
# (the original started at 16, so the column at position 15 was never shown).
# `.iloc` replaces `.ix`, which has been removed from pandas.
df.iloc[:, 15:].head()
Out[3]:
In [4]:
# Show the first few values of the `pitch_type` column.
df.loc[:, 'pitch_type'].head()
Out[4]:
In [5]:
# Derive a readable label from the single-letter codes in `pitch_type`:
# "B" -> "ball", "S" -> "strike", "X" -> "hit"; any other code becomes None.
# Built from the innermost fallback outwards so each step stays a flat call.
_code = df['pitch_type']
_hit_or_none = np.where(_code == "X", "hit", None)
_strike_or_later = np.where(_code == "S", "strike", _hit_or_none)
df['f_pitch_result'] = np.where(_code == "B", "ball", _strike_or_later)
In [6]:
# Lookup table of pitch abbreviations, one "ABBR,Readable name" entry per line.
# Several abbreviations deliberately share one readable name.
_PITCH_CODE_TABLE = """FA,Fastball
FF,Fastball
FT,Fastball
FC,Cut fastball
FS,Fastball (sinker|split-fingered)
SI,Fastball (sinker|split-fingered)
SF,Fastball (sinker|split-fingered)
SL,Slider
CH,Changeup
CB,Curveball
CU,Curveball
KC,Curveball
KN,Knuckleball
EP,Eephus
UN,Unidentified
XX,Unidentified
PO,Pitch out
FO,Pitch out"""

# `lu` is the list of raw "ABBR,Name" rows consumed by the recoding step.
lu = _PITCH_CODE_TABLE.splitlines()
In [7]:
# Translate pitch abbreviations into readable names, then keep only the most
# common pitch types.
# Each `lu` entry is "ABBR,Name". Building the whole lookup once lets the
# column be rewritten in a single pass instead of once per abbreviation.
pitch_names = dict(row.split(',', 1) for row in lu)
df['pitch_type.1'] = df['pitch_type.1'].replace(pitch_names)
df['f_pitch_type'] = df['pitch_type.1']

# Keep rows whose pitch type is among the N most frequent ones. `.copy()`
# makes the result an independent frame so that later column assignments on
# `df` do not trigger pandas' SettingWithCopyWarning.
N_TOP_PITCH_TYPES = 8
top_pitch_types = df.f_pitch_type.value_counts().head(N_TOP_PITCH_TYPES).index
df = df[df.f_pitch_type.isin(top_pitch_types)].copy()
In [ ]:
# (Removed the incomplete scratch expression `df.pi`: it was never executed and
# raises AttributeError, which would stop a Restart-and-Run-All.)
In [8]:
# Density of `start_speed`, one coloured curve per `f_pitch_type`.
(
    ggplot(aes(x='start_speed', color='f_pitch_type'), data=df)
    + geom_density()
    + scale_color_brewer(type='qual')
)
Out[8]:
In [9]:
# Density of `start_speed` coloured by `pitch_type`, with one panel per
# `f_pitch_type`.
(
    ggplot(aes(x='start_speed', color='pitch_type'), data=df)
    + geom_density()
    + scale_color_brewer(type='qual')
    + facet_wrap("f_pitch_type")
)
Out[9]:
In [10]:
# Preview the first 15 columns by position after the recoding and filtering
# steps. `.iloc` replaces `.ix`, which has been removed from pandas.
df.iloc[:, :15].head()
Out[10]:
In [11]:
# Preview columns at positions 15-29. The slice starts at 15 (the original
# started at 16, so the column at position 15 was never shown). `.iloc`
# replaces `.ix`, which has been removed from pandas.
df.iloc[:, 15:30].head()
Out[11]:
In [12]:
# Jittered scatter of `px` against `pz` for every pitch, with red reference
# lines: horizontal at the mean `sz_top` and mean `sz_bottom`, vertical at
# x = -1 and x = 1. Axes are clipped to [-3, 3] x [0, 6] with equal scaling.
(
    ggplot(aes('px', 'pz'), data=df)
    + geom_jitter()
    + geom_hline(color='red', yintercept=[df.sz_top.mean(), df.sz_bottom.mean()])
    + geom_vline(color='red', xintercept=[-1, 1])
    + xlim(-3, 3)
    + ylim(0, 6)
    + coord_equal()
)
Out[12]:
In [13]:
# The ten pitcher names with the most rows in `df`.
df['pitcher_name'].value_counts().head(10)
Out[13]:
In [14]:
# Restrict to the rows thrown by one pitcher for the per-pitcher plots below.
pitcher = df[df['pitcher_name'] == 'David Price']
In [15]:
# `start_speed` against `inning` for the selected pitcher, jittered.
(
    ggplot(aes(x='inning', y='start_speed'), data=pitcher)
    + geom_jitter()
)
Out[15]:
In [17]:
# Same scatter, with points coloured by `f_pitch_type`.
(
    ggplot(aes(x='inning', y='start_speed', color='f_pitch_type'), data=pitcher)
    + geom_jitter()
)
Out[17]:
In [18]:
# `start_speed` against `inning`, split into one panel per `f_pitch_type`.
(
    ggplot(aes(x='inning', y='start_speed'), data=pitcher)
    + geom_jitter()
    + facet_wrap("f_pitch_type")
)
Out[18]:
In [19]:
# Faceted scatter as before, plus a blue linear-model smoother in each panel.
(
    ggplot(aes(x='inning', y='start_speed'), data=pitcher)
    + geom_jitter()
    + stat_smooth(method='lm', color='blue')
    + facet_wrap("f_pitch_type")
)
Out[19]:
In [20]:
# Running pitch number per pitcher: 1 for a pitcher's first row in `df`, 2 for
# the second, and so on, in the frame's current row order.
# `cumcount()` returns a Series indexed like `df`, so the assignment aligns
# row-for-row. The original `groupby(...).apply(np.cumsum ...)` yields a result
# indexed by (pitcher_name, row), which does not align with `df.index`.
# NOTE(review): assumes rows are already in chronological order, and the count
# spans every row for that pitcher rather than restarting per game — confirm.
df['pitch_count'] = df.groupby("pitcher_name").cumcount() + 1
In [21]:
# Switch the per-pitcher subset to another pitcher (taken after `pitch_count`
# was added, so the subset carries that column).
pitcher = df[df.pitcher_name == 'C.J. Wilson']
# Function-call form of print works on both Python 2 and Python 3; the bare
# `print x` statement is a syntax error on Python 3.
print(len(pitcher))
pitcher[['pitcher_name', 'pitch_count']].head()
Out[21]:
In [22]:
# `start_speed` against the running `pitch_count` for the selected pitcher.
(
    ggplot(aes(x='pitch_count', y='start_speed'), data=pitcher)
    + geom_point()
)
Out[22]:
In [28]:
# Same relationship coloured by `f_pitch_type`: faint points plus a smoother
# per colour, drawn without its confidence band (`se=False`).
(
    ggplot(aes(x='pitch_count', y='start_speed', color='f_pitch_type'), data=pitcher)
    + geom_point(alpha=0.3)
    + stat_smooth(se=False)
    + scale_color_brewer(type='qual')
)
Out[28]:
In [32]:
# Pitch locations for the selected pitcher: colour encodes `pitch_type`, marker
# shape encodes `f_pitch_type`. Red lines mark this pitcher's mean `sz_bottom`
# and `sz_top`, and x = -1 / x = 1.
(
    ggplot(aes(x='px', y='pz', color='pitch_type', shape='f_pitch_type'), data=pitcher)
    + geom_jitter()
    + geom_hline(yintercept=[pitcher.sz_bottom.mean(), pitcher.sz_top.mean()], color='red')
    + geom_vline(xintercept=[-1, 1], color='red')
    + coord_equal()
)
Out[32]:
In [33]:
# Histogram of `break_length` for the selected pitcher, in bins of width 1.
(
    ggplot(aes(x='break_length'), data=pitcher)
    + geom_histogram(binwidth=1)
)
Out[33]:
In [34]:
# Same histogram with the `color` aesthetic mapped to `f_pitch_type`.
(
    ggplot(aes(x='break_length', color='f_pitch_type'), data=pitcher)
    + geom_histogram(binwidth=1)
)
Out[34]:
In [35]:
# Same histogram with the `fill` aesthetic mapped to `f_pitch_type`.
(
    ggplot(aes(x='break_length', fill='f_pitch_type'), data=pitcher)
    + geom_histogram(binwidth=1)
)
Out[35]:
In [36]:
# Filled histogram again, now semi-transparent with black bar outlines.
(
    ggplot(aes(x='break_length',fill='f_pitch_type'), data=pitcher)
    + geom_histogram(binwidth=1, alpha=0.3, color='black')
)
Out[36]:
In [37]:
# Count how often each `pitch_type` value occurs for every
# (start_speed, end_speed) combination.
_speed_pairs = df.groupby(['start_speed', 'end_speed'])
df_agg = _speed_pairs['pitch_type'].value_counts()
In [38]:
# Pivot the last index level of `df_agg` into columns and fill the gaps left by
# combinations that never occur with 0.
# NOTE: this rebinds `df_agg` from its own previous value, so re-running this
# cell on its own unstacks a second time.
df_agg = df_agg.unstack(level=-1).fillna(value=0)
In [39]:
# Bucket start and end speeds into 5-unit bins with edges 45, 50, ..., 95.
# Fix: `end_speed_bucket` is now cut from `end_speed`; the original cut
# `start_speed` for both columns, so the two buckets were identical.
# NOTE(review): values outside (45, 95] fall outside every bin and become NaN —
# confirm that these edges cover the range you want for both columns.
speed_bin_edges = list(range(45, 100, 5))
df['start_speed_bucket'] = pd.cut(df.start_speed, speed_bin_edges)
df['end_speed_bucket'] = pd.cut(df.end_speed, speed_bin_edges)
In [40]:
# Bar chart of row counts per `start_speed_bucket`.
(
    ggplot(aes(x='start_speed_bucket'), data=df)
    + geom_bar()
)
Out[40]:
In [41]:
# Bar chart of `start_speed_bucket`, one panel per `f_pitch_type`.
(
    ggplot(aes(x='start_speed_bucket'), data=df)
    + geom_bar()
    + facet_wrap("f_pitch_type")
)
Out[41]:
In [42]:
# Scatter of the `x` and `y` columns for all rows, with equal axis scaling.
(
    ggplot(aes(x='x', y='y'), data=df)
    + geom_point()
    + coord_equal()
)
Out[42]:
In [224]:
# Jittered `x` / `y` scatter coloured by `pitch_type`, with equal axis scaling.
(
    ggplot(aes(x='x', y='y', color='pitch_type'), data=df)
    + geom_jitter()
    + coord_equal()
)
Out[224]:
In [43]:
# The ten hitter names with the most rows in `df`.
df['hitter_name'].value_counts().head(10)
Out[43]:
In [44]:
# Restrict to the rows faced by one hitter for the per-hitter plots below.
hitter = df[df['hitter_name'] == 'Mike Trout']
In [45]:
# Pitch locations seen by the selected hitter, coloured by `pitch_type`.
# Red lines mark this hitter's mean `sz_bottom` / `sz_top` and x = -1 / x = 1;
# the view is clipped to [-2, 2] x [0, 4] with equal scaling.
(
    ggplot(aes(x='px', y='pz', color='pitch_type'), data=hitter)
    + geom_point()
    + geom_hline(yintercept=[hitter.sz_bottom.mean(), hitter.sz_top.mean()], color='red')
    + geom_vline(xintercept=[-1, 1], color='red')
    + xlim(-2, 2)
    + ylim(0, 4)
    + coord_equal()
)
Out[45]:
In [47]:
# Same per-hitter location plot, coloured by `f_pitch_type` instead.
(
    ggplot(aes(x='px', y='pz', color='f_pitch_type'), data=hitter)
    + geom_point()
    + geom_hline(yintercept=[hitter.sz_bottom.mean(), hitter.sz_top.mean()], color='red')
    + geom_vline(xintercept=[-1, 1], color='red')
    + xlim(-2, 2)
    + ylim(0, 4)
    + coord_equal()
)
Out[47]:
In [49]:
# 2-D binned view of the pitch locations seen by the selected hitter, with the
# same red reference lines and axis limits as the scatter versions.
(
    ggplot(aes(x='px', y='pz'), data=hitter)
    + stat_bin2d()
    + geom_hline(yintercept=[hitter.sz_bottom.mean(), hitter.sz_top.mean()], color='red')
    + geom_vline(xintercept=[-1, 1], color='red')
    + xlim(-2, 2)
    + ylim(0, 4)
    + coord_equal()
)
Out[49]:
In [50]:
# 2-D binned pitch locations for the selected hitter, one panel per
# `f_pitch_type`, with all panels sharing the same fixed scales.
(
    ggplot(aes(x='px', y='pz'), data=hitter)
    + stat_bin2d()
    + geom_hline(yintercept=[hitter.sz_bottom.mean(), hitter.sz_top.mean()], color='red')
    + geom_vline(xintercept=[-1, 1], color='red')
    + xlim(-2, 2)
    + ylim(0, 4)
    + facet_wrap("f_pitch_type", scales="fixed")
    + coord_equal()
)
Out[50]:
In [51]:
# Subset of rows whose readable pitch type is exactly 'Fastball'.
fastballs = df[df['f_pitch_type'] == 'Fastball']
In [55]:
# `end_speed` against `start_speed` for the first 10000 fastball rows, coloured
# on a gradient by the difference `start_speed - end_speed`.
(
    ggplot(aes(x='start_speed', y='end_speed', color='start_speed - end_speed'), data=fastballs.head(10000))
    + geom_point()
    + scale_color_gradient()
)
Out[55]:
In [73]:
# Same plot with an explicit blue-to-red gradient and equal axis scaling.
(
    ggplot(aes(x='start_speed', y='end_speed', color='start_speed - end_speed'), data=fastballs.head(10000))
    + geom_point()
    + scale_color_gradient(low="blue", high="red")
    + coord_equal()
)
Out[73]:
In [256]:
# `x` / `y` scatter of all fastball rows, coloured from yellow (low) to red
# (high) by `start_speed`, with equal axis scaling.
(
    ggplot(aes(x='x', y='y', color='start_speed'), data=fastballs)
    + geom_point()
    + scale_color_gradient(low='yellow', high='red')
    + coord_equal()
)
Out[256]:
In [259]:
# Histogram of `start_speed` over all rows, using the default binning.
(
    ggplot(aes(x='start_speed'), data=df)
    + geom_histogram()
)
Out[259]:
In [262]:
# Bar chart of row counts per `pitch_type` value.
(
    ggplot(aes(x='pitch_type'), data=df)
    + geom_bar()
)
Out[262]:
In [264]:
# Bar chart of row counts per `f_pitch_type` value.
(
    ggplot(aes(x='f_pitch_type'), data=df)
    + geom_bar()
)
Out[264]:
In [271]:
# Bar chart of row counts per `f_pitch_type` value.
# NOTE(review): this cell repeats the plot in the cell directly before it;
# consider deleting one of the two.
(
    ggplot(aes(x='f_pitch_type'), data=df)
    + geom_bar()
)
Out[271]:
In [279]:
# Density of `start_speed` with one curve per inning (treated as a discrete
# factor), limited to rows where `inning` is below 10, on a sequential palette.
(
    ggplot(aes(x='start_speed', color='factor(inning)'), data=df[df.inning < 10])
    + geom_density()
    + scale_color_brewer(type='seq')
)
Out[279]:
In [ ]: