In [30]:
from pandas import DataFrame, read_csv, cut
# Location of the raw per-player statistics, relative to the notebook.
RAW_DATA_PATH = 'data/baseball_data.csv'

# Load every player record, then show summary statistics for the numeric
# columns as the cell output.
df = read_csv(RAW_DATA_PATH)
df.describe()
Out[30]:
The dataset contains 1,157 player records. Interestingly, some players have a batting average of zero or no home runs, so I removed those rows.
In [31]:
import numpy as np

# Keep only players with a positive batting average and at least one home run.
# .copy() detaches the filtered rows from the original frame, so the column
# assignment below writes to a real DataFrame rather than to a slice (which
# would trigger pandas' SettingWithCopyWarning).
df = df[(df.avg > 0.0) & (df.HR > 0)].copy()

# Five equal-width, left-closed bins: [0.10, 0.15), [0.15, 0.20), ...,
# [0.30, 0.35). Averages outside [0.10, 0.35) get NaN as their category.
df['avg_category'] = cut(df.avg, bins=np.linspace(0.1, 0.35, 6), right=False)

# Summary statistics of the cleaned data (cell output).
df.describe()
Out[31]:
After removing those uninformative rows, 871 records remained. Next, I wanted to see how players' performance varies with handedness and avg_category, so I built a histogram of HR versus avg_category for each of the three handedness types.
In [32]:
# Columns left out of the per-group totals.
unused_columns = ['height', 'weight', 'avg']
# Each (avg_category, handedness) pair forms one group.
grouping_keys = ['avg_category', 'handedness']

# Discard the unused columns, then sum what remains within every group.
trimmed = df.drop(unused_columns, axis=1)
a = trimmed.groupby(by=grouping_keys).sum()
a
Out[32]:
In [33]:
# Persist the grouped totals. The (avg_category, handedness) group keys form
# the index of `a`, and to_csv writes the index by default, so they are saved
# as the leading columns of the file.
# NOTE(review): a later cell writes final_df to this same path, so the file's
# contents depend on which cell ran last.
a.to_csv('data/cleaned_baseball.csv')
In [34]:
# Read the grouped totals back from disk; this rebinds `df` to the
# aggregated table.
df = read_csv('data/cleaned_baseball.csv')

# Collapse the handedness split: one row of sums per avg_category, with
# avg_category kept as an ordinary column (as_index=False).
by_category = df.groupby('avg_category', as_index=False)
subtotal = by_category.sum()
subtotal
Out[34]:
In [35]:
# Attach each avg_category's subtotal row to every row of that category.
# how='inner' and suffixes=('_x', '_y') are the pandas defaults, written out
# because overlapping columns from `df` end in _x and those from `subtotal`
# end in _y.
final_df = df.merge(
    subtotal,
    on='avg_category',
    how='inner',
    suffixes=('_x', '_y'),
)
In [36]:
# HR_x is the left-hand (per-row) home-run total and HR_y the right-hand
# (per-avg_category subtotal) one from the merge that built final_df, so the
# quotient is each row's share of its category's home runs.
final_df['HR_ratio'] = final_df['HR_x'].div(final_df['HR_y'])
final_df
Out[36]:
In [37]:
# Overwrite the cleaned CSV with the merged table, including HR_ratio.
# to_csv writes the row index by default; because that index has no name, the
# next read_csv of this file exposes it as the column 'Unnamed: 0'.
final_df.to_csv('data/cleaned_baseball.csv')
In [39]:
# Reload the merged table written by the previous cell.
df = read_csv('data/cleaned_baseball.csv')

# print(...) with a single argument behaves the same on Python 2 and 3,
# unlike the Python-2-only `print df` statement.
print(df)

# 'Unnamed: 0' is the saved row index, which comes back as a column, so it
# must be dropped with axis=1. The default axis=0 looks for a row labelled
# 'Unnamed: 0' and raises because no such row exists.
submean = (
    df.drop(['Unnamed: 0'], axis=1)
      .groupby('avg_category', as_index=False)
      .mean()
)
In [20]:
# Show every row next to the mean values of its avg_category. The merge
# defaults are spelled out: an inner join, with overlapping columns suffixed
# _x (from df) and _y (from submean).
df.merge(
    submean,
    on='avg_category',
    how='inner',
    suffixes=('_x', '_y'),
)
Out[20]:
In [ ]: