In [1]:
import pandas as pd
import numpy as np

In [13]:
# Column labels are supplied by hand via `names`, listed in file order.
hit_columns = ['game', 'inning', 'pitcher', 'hitter', 'class', 'description', 'x', 'y']

# Read the tab-separated hit records and preview the first few rows.
hits = pd.read_csv('./../data/hits.tsv', sep='\t', names=hit_columns)
hits.head()


Out[13]:
game inning pitcher hitter class description x y
0 1 Top Ubaldo Jimenez Ian Kinsler O Groundout 117.47 147.59
1 1 Top Ubaldo Jimenez Torii Hunter O Groundout 107.43 153.61
2 1 Top Ubaldo Jimenez Miguel Cabrera O Groundout 100.40 164.66
3 1 Bottom Drew Smyly Manny Machado O Flyout 83.33 94.38
4 1 Bottom Drew Smyly Adam Jones H Home Run 26.10 99.40

5 rows × 8 columns


In [14]:
# Report (rows, columns) of the loaded frame as a baseline for the cleaning steps.
hits.shape


Out[14]:
(31527, 8)

In [15]:
# Positional indices of the rows that contain at least one missing value.
# The axis is passed by keyword because recent pandas no longer accepts it
# positionally, and `.values.nonzero()` stands in for `Series.nonzero()`,
# which was removed from pandas. The result is still a 1-tuple holding one
# integer array, exactly like the original expression produced.
nans = hits.isnull().any(axis=1).values.nonzero()

In [16]:
# Show the row positions flagged as containing nulls.
nans


Out[16]:
(array([], dtype=int64),)

In [17]:
# Drop every row that has at least one missing value. `dropna()` does this
# directly, so the cell no longer needs row positions computed elsewhere and
# does not depend on the index labels being unique (dropping by label would
# remove every row sharing a label).
hits = hits.dropna()
hits.shape


Out[17]:
(31527, 8)

In [18]:
# Remove rows that are exact duplicates across all columns (the default for
# drop_duplicates), then show the new shape to see how many rows were removed.
hits = hits.drop_duplicates()
hits.shape


Out[18]:
(31464, 8)

In [20]:
# d3 needs a stable key per hit; reuse each row's index label as its `id`.
# `assign` appends the column at the end and hands back a new frame.
hits = hits.assign(id=hits.index)
hits.head()


Out[20]:
game inning pitcher hitter class description x y id
0 1 Top Ubaldo Jimenez Ian Kinsler O Groundout 117.47 147.59 0
1 1 Top Ubaldo Jimenez Torii Hunter O Groundout 107.43 153.61 1
2 1 Top Ubaldo Jimenez Miguel Cabrera O Groundout 100.40 164.66 2
3 1 Bottom Drew Smyly Manny Machado O Flyout 83.33 94.38 3
4 1 Bottom Drew Smyly Adam Jones H Home Run 26.10 99.40 4

5 rows × 9 columns


In [21]:
# Write the cleaned hits (now including the `id` column) as a tab-separated
# file; the DataFrame index is left out because `id` already carries it.
hits.to_csv(
    "../data/newhits.tsv",
    index=False,
    sep="\t",
)

In [ ]: