In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

Data import

Assumes data is in DATA_DIR and includes the directory and file structure of data as downloaded from retrosheet.org.


In [2]:
DATA_DIR = '../data'

In [3]:
import glob

event_files = glob.glob("{}/*seve/*.EV*".format(DATA_DIR))

Constrain to years of interest


In [4]:
years = list(range(2010, 2016))

In [5]:
year_files = [f for f in event_files if int(f.split('/')[-1][:4]) in years]

Parse game information, including plays and lineup changes.


In [6]:
from io import StringIO

def parse_event_file(filename):
    game_info_io, game_play_io, lineup_io = str_io_list = [StringIO() for _ in range(3)]
    with open(filename) as f:
        game_id = ''
        lineup_id = 0
        new_game = True
        for line in f:
            line = line.rstrip('\n')
            if line.startswith('id'):
                game_id = line.split(',')[-1]
            elif line.startswith('start'):
                if not new_game:
                    new_game = True
                    lineup_id = 0
                lineup_io.write(','.join([game_id, str(lineup_id)] 
                                         + line.split(',')[1:]) + '\n')
            elif line.startswith('info'):
                game_info_io.write(','.join([game_id] + line.split(',')[1:]) + '\n')
            elif line.startswith('play'):
                game_play_io.write(','.join([game_id, str(lineup_id)] 
                                            + line.split(',')[1:]) + '\n')
            elif line.startswith('sub'):
                new_game = False
                lineup_id += 1
                lineup_io.write(','.join([game_id, str(lineup_id)] 
                                         + line.split(',')[1:]) + '\n')
                
    
    # "rewind" to the beginning of the StringIO object
    for str_io in str_io_list:
        str_io.seek(0)
    
    return (pd.read_csv(game_info_io, header=None, 
                    names=['Game_ID','Var','Value']).pivot('Game_ID','Var','Value'),
            pd.read_csv(game_play_io, header=None, index_col=False,
                    names=['Game_ID', 'Lineup_ID', 'Inning', 'Home', 'Retrosheet_ID', 
                           'Count', 'Pitches', 'Play']),
           pd.read_csv(lineup_io, header=None, index_col=False,
                      names=['Game_ID', 'Lineup_ID', 'Retrosheet_ID', 'Name', 'Home', 
                             'Order', 'Position']))

Parse all files


In [7]:
parsed_files = [parse_event_file(f) for f in year_files]

Concatenate game info, plays and lineup data


In [8]:
games = pd.concat([df[0] for df in parsed_files], ignore_index=True)
games.shape


Out[8]:
(14579, 27)

In [9]:
plays = pd.concat([df[1] for df in parsed_files], ignore_index=True)
plays.shape


Out[9]:
(1304164, 8)

In [10]:
plays.loc[301:320]


Out[10]:
Game_ID Lineup_ID Inning Home Retrosheet_ID Count Pitches Play
301 ANA201004080 2 7 0 hudso001 1 C+3X 43/G
302 ANA201004080 2 7 0 mauej001 21 BBCX 53/G
303 ANA201004080 2 7 1 aybae001 0 NaN NP
304 ANA201004080 3 7 1 aybae001 12 .LFFFBT K
305 ANA201004080 3 7 1 abreb001 22 CFBFBFX 8/F
306 ANA201004080 3 7 1 huntt001 2 CFFS K
307 ANA201004080 3 8 0 mornj001 0 NaN NP
308 ANA201004080 4 8 0 mornj001 32 .CBBBCFFB W
309 ANA201004080 4 8 0 cuddm001 10 BX S7/G.1-2
310 ANA201004080 4 8 0 thomj002 0 X HR/89/F.2-H;1-H
311 ANA201004080 4 8 0 kubej002 2 FCS K
312 ANA201004080 4 8 0 hardj003 21 BSBX S8/G
313 ANA201004080 4 8 0 harrb001 22 B*BFFFS K
314 ANA201004080 4 8 0 spand001 11 BCX 43/G
315 ANA201004080 4 8 1 matsh001 0 NaN NP
316 ANA201004080 5 8 1 matsh001 0 NaN NP
317 ANA201004080 6 8 1 matsh001 20 ..BBX S57/G+
318 ANA201004080 6 8 1 morak001 12 BCFS K
319 ANA201004080 6 8 1 rivej001 2 FCS K
320 ANA201004080 6 8 1 kendh001 32 BFBCFB>S K

In [11]:
lineups = pd.concat([df[2] for df in parsed_files], ignore_index=True)
lineups.shape


Out[11]:
(435719, 7)

Create hierarchical index for lineups


In [12]:
lineups_hi = lineups.set_index(['Game_ID', 'Lineup_ID', 'Home', 'Order'])
lineups_hi.head(25)


Out[12]:
Retrosheet_ID Name Position
Game_ID Lineup_ID Home Order
ANA201004050 0 0 1 spand001 Denard Span 8
2 hudso001 Orlando Hudson 4
3 mauej001 Joe Mauer 2
4 mornj001 Justin Morneau 3
5 cuddm001 Michael Cuddyer 9
6 kubej002 Jason Kubel 10
7 yound003 Delmon Young 7
8 hardj003 J.J. Hardy 6
9 puntn001 Nick Punto 5
0 bakes002 Scott Baker 1
1 1 aybae001 Erick Aybar 6
2 abreb001 Bobby Abreu 9
3 huntt001 Torii Hunter 8
4 matsh001 Hideki Matsui 10
5 morak001 Kendry Morales 3
6 rivej001 Juan Rivera 7
7 kendh001 Howard Kendrick 4
8 woodb003 Brandon Wood 5
9 mathj001 Jeff Mathis 2
0 weavj003 Jered Weaver 1
1 0 0 craij001 Jesse Crain 1
2 1 0 jepsk001 Kevin Jepsen 1
3 0 9 thomj002 Jim Thome 11
4 0 9 harrb001 Brendan Harris 5
5 0 0 mijaj001 Jose Mijares 1

Function for constructing a given lineup from lineup changes


In [13]:
def get_lineup(game_id, lineup_id, data=lineups_hi):
    game_data = data.loc[game_id]
    current_lineup = game_data.loc[0].copy()
    try:
        for l in range(lineup_id):
            lineup_change = game_data.loc[l+1]
            current_lineup.loc[lineup_change.index] = lineup_change
    except IndexError:
        print('Invalid lineup number', lineup_id)
        return None
    return current_lineup

For example, pick arbitrary play and reconstruct the lineup at the time:


In [14]:
plays.loc[24776]


Out[14]:
Game_ID          BAL201006220
Lineup_ID                   4
Inning                      8
Home                        0
Retrosheet_ID        stanm004
Count                      11
Pitches                  .CBX
Play                     S7/G
Name: 24776, dtype: object

In [15]:
get_lineup('BAL201006220', 4)


Out[15]:
Retrosheet_ID Name Position
Home Order
0 1 coghc001 Chris Coghlan 7
2 sancg001 Gaby Sanchez 3
3 ramih003 Hanley Ramirez 6
4 cantj001 Jorge Cantu 10
5 uggld001 Dan Uggla 4
6 rossc001 Cody Ross 8
7 stanm004 Mike Stanton 9
8 helmw001 Wes Helms 5
9 paulr001 Ronny Paulino 2
0 tankt001 Taylor Tankersley 1
1 1 pattc001 Corey Patterson 7
2 tejam001 Miguel Tejada 5
3 markn001 Nick Markakis 9
4 wiggt001 Ty Wigginton 3
5 scotl001 Luke Scott 10
6 jonea003 Adam Jones 8
7 wietm001 Matt Wieters 2
8 moors001 Scott Moore 4
9 lugoj001 Julio Lugo 6
0 berkj001 Jason Berken 1

Chadwick events table

Use cwevent to parse events files


In [52]:
fields = pd.read_csv('../data/fields.csv', index_col=0)

In [66]:
def cwevent(year):
    try:
        year_dir = '/'.join(glob.glob("{}/*seve/{}*.EV*".format(DATA_DIR, year))[0].split('/')[:-1])
    except:
        print('No data for', year)
        return
    !cd $year_dir; cwevent -y $year -q -f 0-96 $year*.EV* > tmp.csv
    df = pd.read_csv(year_dir+'/tmp.csv', header=None, names=fields.Header)
    !rm $year_dir/tmp.csv
    return df

In [67]:
events1920s = pd.concat([cwevent(y) for y in range(1921, 1930)])


/Users/fonnescj/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2885: DtypeWarning: Columns (7,65) have mixed types. Specify dtype option on import or set low_memory=False.
  exec(code_obj, self.user_global_ns, self.user_ns)
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2885: DtypeWarning: Columns (7,64,65,85) have mixed types. Specify dtype option on import or set low_memory=False.
  exec(code_obj, self.user_global_ns, self.user_ns)
No data for 1923
No data for 1924
No data for 1926
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2885: DtypeWarning: Columns (63,64,65,85) have mixed types. Specify dtype option on import or set low_memory=False.
  exec(code_obj, self.user_global_ns, self.user_ns)
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2885: DtypeWarning: Columns (64,65,85) have mixed types. Specify dtype option on import or set low_memory=False.
  exec(code_obj, self.user_global_ns, self.user_ns)
No data for 1928
No data for 1929

In [68]:
events1920s.shape


Out[68]:
(275829, 97)

Rosters

Parse roster files and create single table


In [19]:
roster_files = glob.glob("{}/*seve/*.ROS".format(DATA_DIR))

In [32]:
roster_files[-1].split('/')[-1][3:7]


Out[32]:
'2015'

In [39]:
def parse_roster_file(filename):
    
    df = pd.read_csv(filename, header=None, 
                     names=("Retrosheet_ID", "Last_Name", "First_Name",
                            "Bats", "Pitches", "Team", "Position"), 
                     na_values=['X'])
    df['Year'] = int(filename.split('/')[-1][3:7])
    return df

In [41]:
rosters = pd.concat([parse_roster_file(f) for f in roster_files])

In [42]:
rosters.shape


Out[42]:
(79421, 8)