In [1]:

    
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

Data import

Assumes data is in DATA_DIR and includes the directory and file structure of data as downloaded from retrosheet.org.



In [2]:

    
DATA_DIR = '../data'



In [3]:

    
import glob

event_files = glob.glob("{}/*seve/*.EV*".format(DATA_DIR))

Constrain to years of interest



In [4]:

    
years = list(range(2010, 2016))



In [5]:

    
year_files = [f for f in event_files if int(f.split('/')[-1][:4]) in years]

Parse game information, including plays and lineup changes.



In [6]:

    
from io import StringIO

def parse_event_file(filename):
    game_info_io, game_play_io, lineup_io = str_io_list = [StringIO() for _ in range(3)]
    with open(filename) as f:
        game_id = ''
        lineup_id = 0
        new_game = True
        for line in f:
            line = line.rstrip('\n')
            if line.startswith('id'):
                game_id = line.split(',')[-1]
            elif line.startswith('start'):
                if not new_game:
                    new_game = True
                    lineup_id = 0
                lineup_io.write(','.join([game_id, str(lineup_id)] 
                                         + line.split(',')[1:]) + '\n')
            elif line.startswith('info'):
                game_info_io.write(','.join([game_id] + line.split(',')[1:]) + '\n')
            elif line.startswith('play'):
                game_play_io.write(','.join([game_id, str(lineup_id)] 
                                            + line.split(',')[1:]) + '\n')
            elif line.startswith('sub'):
                new_game = False
                lineup_id += 1
                lineup_io.write(','.join([game_id, str(lineup_id)] 
                                         + line.split(',')[1:]) + '\n')
                
    
    # "rewind" to the beginning of the StringIO object
    for str_io in str_io_list:
        str_io.seek(0)
    
    return (pd.read_csv(game_info_io, header=None, 
                    names=['Game_ID','Var','Value']).pivot('Game_ID','Var','Value'),
            pd.read_csv(game_play_io, header=None, index_col=False,
                    names=['Game_ID', 'Lineup_ID', 'Inning', 'Home', 'Retrosheet_ID', 
                           'Count', 'Pitches', 'Play']),
           pd.read_csv(lineup_io, header=None, index_col=False,
                      names=['Game_ID', 'Lineup_ID', 'Retrosheet_ID', 'Name', 'Home', 
                             'Order', 'Position']))

Parse all files



In [7]:

    
parsed_files = [parse_event_file(f) for f in year_files]

Concatenate game info, plays and lineup data



In [8]:

    
games = pd.concat([df[0] for df in parsed_files], ignore_index=True)
games.shape









    Out[8]:





(14579, 27)



In [9]:

    
plays = pd.concat([df[1] for df in parsed_files], ignore_index=True)
plays.shape









    Out[9]:





(1304164, 8)



In [10]:

    
plays.loc[301:320]









    Out[10]:






  
    
      
      Game_ID
      Lineup_ID
      Inning
      Home
      Retrosheet_ID
      Count
      Pitches
      Play
    
  
  
    
      301
      ANA201004080
      2
      7
      0
      hudso001
      1
      C+3X
      43/G
    
    
      302
      ANA201004080
      2
      7
      0
      mauej001
      21
      BBCX
      53/G
    
    
      303
      ANA201004080
      2
      7
      1
      aybae001
      0
      NaN
      NP
    
    
      304
      ANA201004080
      3
      7
      1
      aybae001
      12
      .LFFFBT
      K
    
    
      305
      ANA201004080
      3
      7
      1
      abreb001
      22
      CFBFBFX
      8/F
    
    
      306
      ANA201004080
      3
      7
      1
      huntt001
      2
      CFFS
      K
    
    
      307
      ANA201004080
      3
      8
      0
      mornj001
      0
      NaN
      NP
    
    
      308
      ANA201004080
      4
      8
      0
      mornj001
      32
      .CBBBCFFB
      W
    
    
      309
      ANA201004080
      4
      8
      0
      cuddm001
      10
      BX
      S7/G.1-2
    
    
      310
      ANA201004080
      4
      8
      0
      thomj002
      0
      X
      HR/89/F.2-H;1-H
    
    
      311
      ANA201004080
      4
      8
      0
      kubej002
      2
      FCS
      K
    
    
      312
      ANA201004080
      4
      8
      0
      hardj003
      21
      BSBX
      S8/G
    
    
      313
      ANA201004080
      4
      8
      0
      harrb001
      22
      B*BFFFS
      K
    
    
      314
      ANA201004080
      4
      8
      0
      spand001
      11
      BCX
      43/G
    
    
      315
      ANA201004080
      4
      8
      1
      matsh001
      0
      NaN
      NP
    
    
      316
      ANA201004080
      5
      8
      1
      matsh001
      0
      NaN
      NP
    
    
      317
      ANA201004080
      6
      8
      1
      matsh001
      20
      ..BBX
      S57/G+
    
    
      318
      ANA201004080
      6
      8
      1
      morak001
      12
      BCFS
      K
    
    
      319
      ANA201004080
      6
      8
      1
      rivej001
      2
      FCS
      K
    
    
      320
      ANA201004080
      6
      8
      1
      kendh001
      32
      BFBCFB>S
      K



In [11]:

    
lineups = pd.concat([df[2] for df in parsed_files], ignore_index=True)
lineups.shape









    Out[11]:





(435719, 7)

Create hierarchical index for lineups



In [12]:

    
lineups_hi = lineups.set_index(['Game_ID', 'Lineup_ID', 'Home', 'Order'])
lineups_hi.head(25)









    Out[12]:






  
    
      
      
      
      
      Retrosheet_ID
      Name
      Position
    
    
      Game_ID
      Lineup_ID
      Home
      Order
      
      
      
    
  
  
    
      ANA201004050
      0
      0
      1
      spand001
      Denard Span
      8
    
    
      2
      hudso001
      Orlando Hudson
      4
    
    
      3
      mauej001
      Joe Mauer
      2
    
    
      4
      mornj001
      Justin Morneau
      3
    
    
      5
      cuddm001
      Michael Cuddyer
      9
    
    
      6
      kubej002
      Jason Kubel
      10
    
    
      7
      yound003
      Delmon Young
      7
    
    
      8
      hardj003
      J.J. Hardy
      6
    
    
      9
      puntn001
      Nick Punto
      5
    
    
      0
      bakes002
      Scott Baker
      1
    
    
      1
      1
      aybae001
      Erick Aybar
      6
    
    
      2
      abreb001
      Bobby Abreu
      9
    
    
      3
      huntt001
      Torii Hunter
      8
    
    
      4
      matsh001
      Hideki Matsui
      10
    
    
      5
      morak001
      Kendry Morales
      3
    
    
      6
      rivej001
      Juan Rivera
      7
    
    
      7
      kendh001
      Howard Kendrick
      4
    
    
      8
      woodb003
      Brandon Wood
      5
    
    
      9
      mathj001
      Jeff Mathis
      2
    
    
      0
      weavj003
      Jered Weaver
      1
    
    
      1
      0
      0
      craij001
      Jesse Crain
      1
    
    
      2
      1
      0
      jepsk001
      Kevin Jepsen
      1
    
    
      3
      0
      9
      thomj002
      Jim Thome
      11
    
    
      4
      0
      9
      harrb001
      Brendan Harris
      5
    
    
      5
      0
      0
      mijaj001
      Jose Mijares
      1

Function for constructing a given lineup from lineup changes



In [13]:

    
def get_lineup(game_id, lineup_id, data=lineups_hi):
    game_data = data.loc[game_id]
    current_lineup = game_data.loc[0].copy()
    try:
        for l in range(lineup_id):
            lineup_change = game_data.loc[l+1]
            current_lineup.loc[lineup_change.index] = lineup_change
    except IndexError:
        print('Invalid lineup number', lineup_id)
        return None
    return current_lineup

For example, pick arbitrary play and reconstruct the lineup at the time:



In [14]:

    
plays.loc[24776]









    Out[14]:





Game_ID          BAL201006220
Lineup_ID                   4
Inning                      8
Home                        0
Retrosheet_ID        stanm004
Count                      11
Pitches                  .CBX
Play                     S7/G
Name: 24776, dtype: object



In [15]:

    
get_lineup('BAL201006220', 4)









    Out[15]:






  
    
      
      
      Retrosheet_ID
      Name
      Position
    
    
      Home
      Order
      
      
      
    
  
  
    
      0
      1
      coghc001
      Chris Coghlan
      7
    
    
      2
      sancg001
      Gaby Sanchez
      3
    
    
      3
      ramih003
      Hanley Ramirez
      6
    
    
      4
      cantj001
      Jorge Cantu
      10
    
    
      5
      uggld001
      Dan Uggla
      4
    
    
      6
      rossc001
      Cody Ross
      8
    
    
      7
      stanm004
      Mike Stanton
      9
    
    
      8
      helmw001
      Wes Helms
      5
    
    
      9
      paulr001
      Ronny Paulino
      2
    
    
      0
      tankt001
      Taylor Tankersley
      1
    
    
      1
      1
      pattc001
      Corey Patterson
      7
    
    
      2
      tejam001
      Miguel Tejada
      5
    
    
      3
      markn001
      Nick Markakis
      9
    
    
      4
      wiggt001
      Ty Wigginton
      3
    
    
      5
      scotl001
      Luke Scott
      10
    
    
      6
      jonea003
      Adam Jones
      8
    
    
      7
      wietm001
      Matt Wieters
      2
    
    
      8
      moors001
      Scott Moore
      4
    
    
      9
      lugoj001
      Julio Lugo
      6
    
    
      0
      berkj001
      Jason Berken
      1

Chadwick events table

Use cwevent to parse events files



In [52]:

    
fields = pd.read_csv('../data/fields.csv', index_col=0)



In [66]:

    
def cwevent(year):
    try:
        year_dir = '/'.join(glob.glob("{}/*seve/{}*.EV*".format(DATA_DIR, year))[0].split('/')[:-1])
    except:
        print('No data for', year)
        return
    !cd $year_dir; cwevent -y $year -q -f 0-96 $year*.EV* > tmp.csv
    df = pd.read_csv(year_dir+'/tmp.csv', header=None, names=fields.Header)
    !rm $year_dir/tmp.csv
    return df



In [67]:

    
events1920s = pd.concat([cwevent(y) for y in range(1921, 1930)])









    



/Users/fonnescj/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2885: DtypeWarning: Columns (7,65) have mixed types. Specify dtype option on import or set low_memory=False.
  exec(code_obj, self.user_global_ns, self.user_ns)
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2885: DtypeWarning: Columns (7,64,65,85) have mixed types. Specify dtype option on import or set low_memory=False.
  exec(code_obj, self.user_global_ns, self.user_ns)






    



No data for 1923
No data for 1924
No data for 1926






    



/Users/fonnescj/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2885: DtypeWarning: Columns (63,64,65,85) have mixed types. Specify dtype option on import or set low_memory=False.
  exec(code_obj, self.user_global_ns, self.user_ns)
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2885: DtypeWarning: Columns (64,65,85) have mixed types. Specify dtype option on import or set low_memory=False.
  exec(code_obj, self.user_global_ns, self.user_ns)






    



No data for 1928
No data for 1929



In [68]:

    
events1920s.shape









    Out[68]:





(275829, 97)

Rosters

Parse roster files and create single table



In [19]:

    
roster_files = glob.glob("{}/*seve/*.ROS".format(DATA_DIR))



In [32]:

    
roster_files[-1].split('/')[-1][3:7]









    Out[32]:





'2015'



In [39]:

    
def parse_roster_file(filename):
    
    df = pd.read_csv(filename, header=None, 
                     names=("Retrosheet_ID", "Last_Name", "First_Name",
                            "Bats", "Pitches", "Team", "Position"), 
                     na_values=['X'])
    df['Year'] = int(filename.split('/')[-1][3:7])
    return df



In [41]:

    
rosters = pd.concat([parse_roster_file(f) for f in roster_files])



In [42]:

    
rosters.shape









    Out[42]:





(79421, 8)

	Game_ID	Lineup_ID	Inning	Home	Retrosheet_ID	Count	Pitches	Play
301	ANA201004080	2	7	0	hudso001	1	C+3X	43/G
302	ANA201004080	2	7	0	mauej001	21	BBCX	53/G
303	ANA201004080	2	7	1	aybae001	0	NaN	NP
304	ANA201004080	3	7	1	aybae001	12	.LFFFBT	K
305	ANA201004080	3	7	1	abreb001	22	CFBFBFX	8/F
306	ANA201004080	3	7	1	huntt001	2	CFFS	K
307	ANA201004080	3	8	0	mornj001	0	NaN	NP
308	ANA201004080	4	8	0	mornj001	32	.CBBBCFFB	W
309	ANA201004080	4	8	0	cuddm001	10	BX	S7/G.1-2
310	ANA201004080	4	8	0	thomj002	0	X	HR/89/F.2-H;1-H
311	ANA201004080	4	8	0	kubej002	2	FCS	K
312	ANA201004080	4	8	0	hardj003	21	BSBX	S8/G
313	ANA201004080	4	8	0	harrb001	22	B*BFFFS	K
314	ANA201004080	4	8	0	spand001	11	BCX	43/G
315	ANA201004080	4	8	1	matsh001	0	NaN	NP
316	ANA201004080	5	8	1	matsh001	0	NaN	NP
317	ANA201004080	6	8	1	matsh001	20	..BBX	S57/G+
318	ANA201004080	6	8	1	morak001	12	BCFS	K
319	ANA201004080	6	8	1	rivej001	2	FCS	K
320	ANA201004080	6	8	1	kendh001	32	BFBCFB>S	K

				Retrosheet_ID	Name	Position
Game_ID	Lineup_ID	Home	Order
ANA201004050	0	0	1	spand001	Denard Span	8
			2	hudso001	Orlando Hudson	4
			3	mauej001	Joe Mauer	2
			4	mornj001	Justin Morneau	3
			5	cuddm001	Michael Cuddyer	9
			6	kubej002	Jason Kubel	10
			7	yound003	Delmon Young	7
			8	hardj003	J.J. Hardy	6
			9	puntn001	Nick Punto	5
			0	bakes002	Scott Baker	1
		1	1	aybae001	Erick Aybar	6
			2	abreb001	Bobby Abreu	9
			3	huntt001	Torii Hunter	8
			4	matsh001	Hideki Matsui	10
			5	morak001	Kendry Morales	3
			6	rivej001	Juan Rivera	7
			7	kendh001	Howard Kendrick	4
			8	woodb003	Brandon Wood	5
			9	mathj001	Jeff Mathis	2
			0	weavj003	Jered Weaver	1
	1	0	0	craij001	Jesse Crain	1
	2	1	0	jepsk001	Kevin Jepsen	1
	3	0	9	thomj002	Jim Thome	11
	4	0	9	harrb001	Brendan Harris	5
	5	0	0	mijaj001	Jose Mijares	1

		Retrosheet_ID	Name	Position
Home	Order
0	1	coghc001	Chris Coghlan	7
	2	sancg001	Gaby Sanchez	3
	3	ramih003	Hanley Ramirez	6
	4	cantj001	Jorge Cantu	10
	5	uggld001	Dan Uggla	4
	6	rossc001	Cody Ross	8
	7	stanm004	Mike Stanton	9
	8	helmw001	Wes Helms	5
	9	paulr001	Ronny Paulino	2
	0	tankt001	Taylor Tankersley	1
1	1	pattc001	Corey Patterson	7
	2	tejam001	Miguel Tejada	5
	3	markn001	Nick Markakis	9
	4	wiggt001	Ty Wigginton	3
	5	scotl001	Luke Scott	10
	6	jonea003	Adam Jones	8
	7	wietm001	Matt Wieters	2
	8	moors001	Scott Moore	4
	9	lugoj001	Julio Lugo	6
	0	berkj001	Jason Berken	1