In [1]:
# render matplotlib figures inline, directly below the cell that draws them
%matplotlib inline
In [2]:
# ref: http://seaborn.pydata.org/generated/seaborn.distplot.html#seaborn.distplot
# ref: http://seaborn.pydata.org/generated/seaborn.color_palette.html#seaborn.color_palette
# palette=deep, muted, bright, pastel, dark, colorblind
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# default size applied to every figure created below, as (width, height)
plt.rcParams['figure.figsize'] = (4, 2)
In [3]:
# palettes for seaborn plots
# List the names of seaborn's built-in palettes. A bare expression that is not
# the last line of a cell is evaluated and then discarded, so print it instead.
print(sorted(sns.palettes.SEABORN_PALETTES))
# use the 'deep' palette as the default colour cycle for seaborn plots
sns.set_palette(palette='deep')
In [4]:
# style sheets for pandas plots (pandas draws through matplotlib)
import pprint

# show every style sheet name matplotlib knows about ...
available_styles = plt.style.available
pprint.pprint(available_styles)

# ... and activate the one used for the rest of the notebook
plt.style.use('fivethirtyeight')
In [5]:
# source CSV, fetched over HTTPS each time this cell runs
MASTER_CSV_URL = 'https://github.com/chadwickbureau/baseballdatabank/raw/master/core/Master.csv'
df = pd.read_csv(MASTER_CSV_URL)
In [6]:
# preview the first rows of the loaded table
df.head()
Out[6]:
In [7]:
# summary statistics of the loaded table
df.describe()
Out[7]:
In [8]:
# number of batters by hitting hand
# count() tallies the non-null playerID values within each 'bats' group
batters = df.groupby('bats')['playerID'].count()
ax = batters.plot.barh()
# title and axis labels so the chart can be read without the code above it
ax.set(title='Number of batters by hitting hand',
       xlabel='number of players',
       ylabel='hitting hand')
print(batters)
In [9]:
# mean height, grouped by hitting hand
batters = df.groupby('bats')['height'].mean()
ax = batters.plot.barh()
# title and axis labels so the chart can be read without the code above it
ax.set(title='Mean height by hitting hand',
       xlabel='mean height',
       ylabel='hitting hand')
print(batters)
In [10]:
# mean weight, grouped by hitting hand
batters = df.groupby('bats')['weight'].mean()
ax = batters.plot.barh()
# title and axis labels so the chart can be read without the code above it
ax.set(title='Mean weight by hitting hand',
       xlabel='mean weight',
       ylabel='hitting hand')
print(batters)
In [11]:
# ref: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.dropna.html
# ref: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sample.html
# ref: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.values.html
import random  # kept from the original cell; nothing below calls it

# fixed seed so the sampling is consistent between runs
seed = 2
# number of players drawn for each group / measurement
sample_size = 250

# boolean masks selecting the left-hand and right-hand hitters
l_hitters = df['bats'] == 'L'
r_hitters = df['bats'] == 'R'


def _sample_values(mask, column):
    """Return `sample_size` randomly drawn non-null values of `column`.

    Only the rows of `df` selected by the boolean `mask` are considered; the
    draw is seeded with `seed`, and the result is the underlying value array.
    """
    non_null = df.loc[mask, column].dropna()
    return non_null.sample(n=sample_size, random_state=seed).values


# height arrays for left hand hitters and right hand hitters
l_height = _sample_values(l_hitters, 'height')
r_height = _sample_values(r_hitters, 'height')

# weight arrays for left hand hitters and right hand hitters
l_weight = _sample_values(l_hitters, 'weight')
r_weight = _sample_values(r_hitters, 'weight')
In [12]:
# is there any difference in height?
# visual inspection: overlay the two sampled height distributions on one Axes
fig, ax = plt.subplots()
# distplot returns the Axes it drew on, so its result is not assigned back to
# `fig`; doing so would replace the Figure handle with an Axes
sns.distplot(l_height, ax=ax, color='blue', kde_kws={"label": "Left Hitters"})
sns.distplot(r_height, ax=ax, color='green', kde_kws={"label": "Right Hitters"})
# title and labels so the figure stands alone
ax.set(title='Height: left vs right hitters (sampled)',
       xlabel='height',
       ylabel='density')
# draw the legend explicitly from the KDE labels; the trailing ';' keeps the
# Legend object from being echoed as cell output
ax.legend();
In [13]:
# t-Test
# ref: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html
# ref: https://en.wikipedia.org/wiki/Welch%27s_t-test
from scipy import stats

# Welch's t-Test on the two height samples (equal_var=False selects Welch's variant)
t, p = stats.ttest_ind(l_height, r_height, equal_var=False)

# the null hypothesis is rejected when the p-value is at or below 0.05
rejected = p <= 0.05
outcome = (
    '(Passed Test - at the 5% level) Null hypothesis rejected, therefore the mean heights are different'
    if rejected else
    '(Failed Test - at the 5% level) Null hypothesis not rejected, therefore we consider the mean heights to be equal'
)

# report p-value, t statistic and the verdict, one per line
report = "P-value\t= {:5.3}\nt-stat\t= {:5.3}\n{}"
print(report.format(p, t, outcome))
In [14]:
# is there any difference in weight?
# visual inspection: overlay the two sampled weight distributions on one Axes
fig, ax = plt.subplots()
# distplot returns the Axes it drew on, so its result is not assigned back to
# `fig`; doing so would replace the Figure handle with an Axes
sns.distplot(l_weight, ax=ax, color='blue', kde_kws={"label": "Left Hitters"})
sns.distplot(r_weight, ax=ax, color='green', kde_kws={"label": "Right Hitters"})
# title and labels so the figure stands alone
ax.set(title='Weight: left vs right hitters (sampled)',
       xlabel='weight',
       ylabel='density')
# draw the legend explicitly from the KDE labels; the trailing ';' keeps the
# Legend object from being echoed as cell output
ax.legend();
In [15]:
# Welch's t-Test on the two weight samples (equal_var=False selects Welch's variant)
t, p = stats.ttest_ind(l_weight, r_weight, equal_var=False)

# the null hypothesis is rejected when the p-value is at or below 0.05
rejected = p <= 0.05
outcome = (
    '(Passed Test - at the 5% level) Null hypothesis rejected, therefore the mean weights are different'
    if rejected else
    '(Failed Test - at the 5% level) Null hypothesis not rejected, therefore we consider the mean weights to be equal'
)

# report p-value, t statistic and the verdict, one per line
report = "P-value\t= {:5.3}\nt-stat\t= {:5.3}\n{}"
print(report.format(p, t, outcome))