In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
This notebook assumes the Retrosheet data is stored under DATA_DIR
and keeps the directory and file structure of the archives as downloaded from retrosheet.org.
In [2]:
# Root directory of the downloaded data; the glob patterns in later cells look for
# "*seve" sub-directories beneath it.
DATA_DIR = '../data'
In [3]:
import glob

# glob.glob() returns matches in arbitrary, filesystem-dependent order. Sort them so
# that the tables concatenated from these files -- and any row labels looked up by
# position later in the notebook -- come out the same on every run and machine.
event_files = sorted(glob.glob("{}/*seve/*.EV*".format(DATA_DIR)))
Constrain to years of interest
In [4]:
# Seasons of interest: 2010 through 2015 (the upper bound of range() is exclusive).
years = list(range(2010, 2016))
In [5]:
import os

# Keep only the event files whose base name starts with one of the selected years.
# os.path.basename is used instead of splitting on '/' so the filter also works
# where glob returns paths with a different separator (e.g. Windows).
year_files = [
    path for path in event_files
    if int(os.path.basename(path)[:4]) in years
]
Parse game information, including plays and lineup changes.
In [6]:
from io import StringIO


def parse_event_file(filename):
    """Split one Retrosheet event file into game-info, play and lineup tables.

    The file is read line by line; the first comma-separated field of each
    line is its record type:

    * ``id``    -- starts a new game: the game id is remembered and the
      lineup counter is reset to 0.
    * ``info``  -- copied to the game-info table, prefixed with the game id.
    * ``start`` -- copied to the lineup table, prefixed with the game id and
      the current lineup counter.
    * ``sub``   -- increments the lineup counter, then is copied to the
      lineup table in the same way as ``start``.
    * ``play``  -- copied to the play table, prefixed with the game id and
      the current lineup counter, so every play records which lineup
      revision was in effect when it happened.

    Every other record type is ignored.

    Parameters
    ----------
    filename : str
        Path of the event file to read.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(game_info, plays, lineups)``:

        * ``game_info`` -- one row per game, indexed by ``Game_ID``, one
          column per ``info`` variable.
        * ``plays`` -- columns ``Game_ID, Lineup_ID, Inning, Home,
          Retrosheet_ID, Count, Pitches, Play``.
        * ``lineups`` -- columns ``Game_ID, Lineup_ID, Retrosheet_ID, Name,
          Home, Order, Position``.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.pivot`` if a game lists the same ``info``
        variable more than once.
    """
    game_info_io, game_play_io, lineup_io = buffers = [StringIO() for _ in range(3)]

    game_id = ''
    lineup_id = 0
    with open(filename) as f:
        for line in f:
            # `rest` is everything after the record type, commas (and any
            # quoting) left untouched for read_csv to interpret later.
            record_type, _, rest = line.rstrip('\n').partition(',')
            if record_type == 'id':
                game_id = rest.split(',')[-1]
                # A new game always begins with its starting lineup.
                lineup_id = 0
            elif record_type == 'info':
                game_info_io.write('{},{}\n'.format(game_id, rest))
            elif record_type == 'start':
                lineup_io.write('{},{},{}\n'.format(game_id, lineup_id, rest))
            elif record_type == 'play':
                game_play_io.write('{},{},{}\n'.format(game_id, lineup_id, rest))
            elif record_type == 'sub':
                # Each substitution creates the next lineup revision.
                lineup_id += 1
                lineup_io.write('{},{},{}\n'.format(game_id, lineup_id, rest))

    # "Rewind" the buffers so read_csv starts from the beginning.
    for buffer in buffers:
        buffer.seek(0)

    # pivot() arguments are passed by keyword: they are keyword-only in
    # pandas >= 2.0, where the positional form raises TypeError.
    game_info = pd.read_csv(
        game_info_io, header=None, names=['Game_ID', 'Var', 'Value'],
    ).pivot(index='Game_ID', columns='Var', values='Value')
    plays = pd.read_csv(
        game_play_io, header=None, index_col=False,
        names=['Game_ID', 'Lineup_ID', 'Inning', 'Home', 'Retrosheet_ID',
               'Count', 'Pitches', 'Play'],
    )
    lineups = pd.read_csv(
        lineup_io, header=None, index_col=False,
        names=['Game_ID', 'Lineup_ID', 'Retrosheet_ID', 'Name', 'Home',
               'Order', 'Position'],
    )
    return game_info, plays, lineups
Parse all files
In [7]:
# One (game_info, plays, lineups) tuple per selected event file, in file order.
parsed_files = list(map(parse_event_file, year_files))
Concatenate game info, plays and lineup data
In [8]:
# Each per-file game-info frame is indexed by Game_ID (set by the pivot in
# parse_event_file). Keep that index when stacking the frames: ignore_index=True
# would replace the game ids with 0..n-1 and leave no way to tie a row to its game.
games = pd.concat([game_info for game_info, _, _ in parsed_files])
games.shape
Out[8]:
In [9]:
# Stack the play tables of all files into one frame with a fresh 0..n-1 row index.
plays = pd.concat(
    [play_table for _, play_table, _ in parsed_files],
    ignore_index=True,
)
plays.shape
Out[9]:
In [10]:
# .loc slices by label and includes both endpoints: this shows the 20 rows labelled 301-320.
plays.loc[301:320]
Out[10]:
In [11]:
# Stack the lineup tables of all files into one frame with a fresh 0..n-1 row index.
lineups = pd.concat(
    [lineup_table for _, _, lineup_table in parsed_files],
    ignore_index=True,
)
lineups.shape
Out[11]:
Create hierarchical index for lineups
In [12]:
# Key every lineup row by game, lineup revision, the Home flag and batting-order slot,
# so that get_lineup can select first a game and then a revision with .loc.
lineups_hi = lineups.set_index(['Game_ID', 'Lineup_ID', 'Home', 'Order'])
lineups_hi.head(25)
Out[12]:
Function for constructing a given lineup from lineup changes
In [13]:
def get_lineup(game_id, lineup_id, data=None):
    """Rebuild the lineup in effect after ``lineup_id`` changes in one game.

    Starts from the rows with ``Lineup_ID == 0`` for the game and then
    overwrites, in order, the (Home, Order) slots named by the rows of
    lineup revisions ``1 .. lineup_id``.

    Parameters
    ----------
    game_id : str
        Value of the ``Game_ID`` index level to select.
    lineup_id : int
        Number of lineup revisions to apply; ``0`` returns the starting
        lineup unchanged.
    data : pandas.DataFrame, optional
        Lineup table indexed by ``(Game_ID, Lineup_ID, Home, Order)``.
        Defaults to the notebook-level ``lineups_hi``, looked up when the
        function is called rather than when it is defined, so rebuilding
        ``lineups_hi`` does not leave this function using a stale table.

    Returns
    -------
    pandas.DataFrame or None
        Lineup indexed by ``(Home, Order)``, or ``None`` (after printing a
        message) when one of the requested revisions does not exist for the
        game.

    Raises
    ------
    KeyError
        If ``game_id`` is not present in ``data`` or the game has no
        ``Lineup_ID == 0`` rows.
    """
    if data is None:
        data = lineups_hi

    game_data = data.loc[game_id]
    # Copy so the edits below never write back into the source table.
    current_lineup = game_data.loc[0].copy()

    for revision in range(1, lineup_id + 1):
        try:
            lineup_change = game_data.loc[revision]
        except (KeyError, IndexError):
            # A missing label makes .loc raise KeyError (not IndexError),
            # which is what an out-of-range lineup number produces here.
            print('Invalid lineup number', lineup_id)
            return None
        # Replace the affected (Home, Order) slot(s) with the new entries.
        current_lineup.loc[lineup_change.index] = lineup_change

    return current_lineup
For example, pick an arbitrary play and reconstruct the lineup at the time it occurred:
In [14]:
# Row labels of `plays` are positions in the concatenated table, so which play has
# label 24776 depends on the order in which the event files were loaded.
plays.loc[24776]
Out[14]:
In [15]:
# Lineup of game BAL201006220 after applying lineup revisions 1 through 4.
get_lineup('BAL201006220', 4)
Out[15]:
In [52]:
# Read the field list from under DATA_DIR instead of repeating the literal '../data',
# so that moving the data directory only requires changing DATA_DIR.
# The `Header` column of this table supplies the column names for cwevent output.
fields = pd.read_csv('{}/fields.csv'.format(DATA_DIR), index_col=0)
In [66]:
def cwevent(year):
    """Run the external ``cwevent`` tool over one season and load its output.

    Looks for ``<DATA_DIR>/*seve/<year>*.EV*``. The directory of the first
    (sorted) match is used as the working directory, and every matching event
    file in that directory is passed to ``cwevent`` with the same options as
    the original shell invocation (``-y <year> -q -f 0-96``). The tool's
    standard output is parsed as header-less CSV whose columns are named from
    ``fields.Header``.

    The output is captured in memory, so no temporary file is written into
    the data directory.

    Parameters
    ----------
    year : int
        Season to process.

    Returns
    -------
    pandas.DataFrame or None
        The parsed ``cwevent`` output, or ``None`` (after printing a message)
        when no event files exist for ``year``.

    Raises
    ------
    subprocess.CalledProcessError
        If ``cwevent`` exits with a non-zero status.
    OSError
        If the ``cwevent`` executable cannot be started.
    """
    import os
    import subprocess
    from io import BytesIO

    matches = sorted(glob.glob("{}/*seve/{}*.EV*".format(DATA_DIR, year)))
    if not matches:
        # Only "nothing matched" is treated as missing data; any other error
        # (e.g. an undefined DATA_DIR) is allowed to surface.
        print('No data for', year)
        return None

    year_dir = os.path.dirname(matches[0])
    # cwevent is run from inside the season directory, so pass bare file
    # names, restricted to that directory.
    event_names = sorted(
        os.path.basename(path) for path in matches
        if os.path.dirname(path) == year_dir
    )

    # Argument list (no shell): nothing in the paths or year is re-parsed.
    command = ['cwevent', '-y', str(year), '-q', '-f', '0-96'] + event_names
    raw_csv = subprocess.check_output(command, cwd=year_dir)

    # Hand read_csv the raw bytes so it applies its own default decoding.
    return pd.read_csv(BytesIO(raw_csv), header=None, names=fields.Header)
In [67]:
# cwevent() returns None for a season without files; pd.concat drops None entries.
# ignore_index=True gives the combined table one continuous row index instead of
# restarting at 0 for every season (the same way `plays` and `lineups` are built).
# NOTE(review): range(1921, 1930) covers 1921-1929, so 1920 is not included --
# confirm that this is intentional.
events1920s = pd.concat([cwevent(y) for y in range(1921, 1930)], ignore_index=True)
In [68]:
# (rows, columns) of the combined event table.
events1920s.shape
Out[68]:
In [19]:
# Sorted because glob.glob() returns matches in arbitrary order, and later cells
# both index into this list and concatenate the files in list order.
roster_files = sorted(glob.glob("{}/*seve/*.ROS".format(DATA_DIR)))
In [32]:
# Check which part of a roster file name is the year: characters 3-6 of the
# last path component (the same slice parse_roster_file uses for its Year column).
roster_files[-1].split('/')[-1][3:7]
Out[32]:
In [39]:
def parse_roster_file(filename):
    """Load one roster file into a DataFrame and tag it with its season.

    The file is read as header-less CSV with the columns ``Retrosheet_ID,
    Last_Name, First_Name, Bats, Pitches, Team, Position``; the value ``X``
    is treated as missing in every column. A ``Year`` column is added from
    characters 3-6 of the file's base name.

    Parameters
    ----------
    filename : str
        Path of the roster file. Its base name must carry a four-digit year
        at positions 3-6.

    Returns
    -------
    pandas.DataFrame
        One row per line of the roster file, with the seven columns above
        plus ``Year``.

    Raises
    ------
    ValueError
        If characters 3-6 of the base name are not an integer.
    """
    import os

    roster = pd.read_csv(
        filename,
        header=None,
        names=["Retrosheet_ID", "Last_Name", "First_Name",
               "Bats", "Pitches", "Team", "Position"],
        na_values=['X'],
    )
    # os.path.basename instead of split('/') so the year is also found when
    # the path uses a different separator (e.g. Windows).
    roster['Year'] = int(os.path.basename(filename)[3:7])
    return roster
In [41]:
# ignore_index=True gives the combined roster table one continuous row index;
# without it every file's rows would repeat the labels 0..n-1, so a label lookup
# would return one row per file (the same way `plays` and `lineups` are built).
rosters = pd.concat([parse_roster_file(f) for f in roster_files], ignore_index=True)
In [42]:
# (rows, columns) of the combined roster table.
rosters.shape
Out[42]: