In [77]:
%pylab inline
import pymc
import pandas as pd
pd.options.display.line_width = 2000 # fixes data frames display issue (bug)
# load the data and take a look
data = pd.read_table('laa_2011_April.txt')
data
Out[77]:
In [2]:
player_MLEs = pd.DataFrame({'Player': data['Player'], 'MLE': data['AVG']})
player_MLEs.set_index('Player', inplace=True)
player_MLEs
Out[2]:
For a Beta distribution we know that:
From: http://en.wikipedia.org/wiki/Beta_distribution
E[X]=$\frac{\alpha}{\alpha + \beta}$ Var[X]=$\frac{\alpha \beta}{(\alpha + \beta)^2 (\alpha +\beta +1)}$
In 2010 the mean batting average was 0.255 and the variance between players was 0.0011.
Using the values above and solving the two above equations for $\alpha$ and $\beta$:
$\alpha =43.78, \beta =127.92$
In [3]:
# load in the model
import PlayerModel
M = pymc.MCMC(PlayerModel)
N_samp = 10000 # simple model, so can do many samplings
N_burn = N_samp/10
M.sample(N_samp, burn=N_burn, thin=5)
In [5]:
from pymc import Matplot
# Plot the outputs, and look at traces for convergence
Matplot.plot(M)
""" note that the traces converge (and because of burning the first 1/10th of the trials
it is difficult to see the initial converging behavior. Also note that the auto-correlation
disappears rapidly """
In [4]:
# To be extra sure of convergence, look at the z-score plots of the different parameters
from pymc import geweke
from pymc.Matplot import geweke_plot
scores = geweke(M)
geweke_plot(scores)
In [70]:
# load in the full-season data
year_data = pd.read_table('laa_2011_full.txt')
# Pull posterior means and confidence intervals out of M.stats()
means = [round(M.stats()['mu'+str(i)]['mean'], 3) for i in xrange(len(data))]
CIs = [(round(x[0], 3), round(x[1], 3)) for x in [M.stats()['mu'+str(i)]['95% HPD interval'] for i in xrange(len(data))]]
inCI = [(x >= y[0]) and (x <= y[1]) for x,y in zip(year_data['AVG'], CIs)]
posterior_data = pd.DataFrame({'Player':data['Player'], '95% CI':CIs, 'Posterior Mean':means,
'April Mean (MLE)':data['AVG'], 'Full Season Mean':year_data['AVG'],
'Full Season in CI?':inCI})
posterior_data.set_index('Player', inplace=True)
print "{0:.0f} of the 13 players' ".format(sum(posterior_data['Full Season in CI?'])) + \
"full-season batting average falls within the 95% CI"
posterior_data
Out[70]:
In [124]:
# Plotting data
FSmean = list(posterior_data['Full Season Mean'])
MLE = list(posterior_data['April Mean (MLE)'])
# Setup
ind = np.arange(len(posterior_data)) # the x locations for the groups
width = 0.35 # the width of the bars
fig = plt.figure(figsize=(20, 6))
ax = fig.add_subplot(111)
rects1 = ax.bar(ind, FSmean, width, color='b')
rects2 = ax.bar(ind+width, MLE, width, color='g')
# labels
ax.set_ylabel('Batting Average')
ax.set_title('Inferenced Batting Averages')
ax.set_xticks(ind+width)
ax.set_xticklabels( tuple(data['Player']) )
ax.legend( (rects1[0], rects2[0]), ('Full Season Mean', 'April-Based MLE') )
def autolabel(rects):
for rect in rects:
height = rect.get_height()
ax.text(rect.get_x()+rect.get_width()/2.0, 1.03*height, str(height),
ha='center', va='bottom')
autolabel(rects1)
autolabel(rects2)
Plot Full Season Batting Average vs Posterior Mean (with confidence interval shown):
In [126]:
print "Error bars represent 95% confidence interval."
# Plotting data
FSmean = list(posterior_data['Full Season Mean'])
PosteriorMean = list(posterior_data['Posterior Mean'])
CI_low = [abs(x[0]-y) for x,y in zip(posterior_data['95% CI'], posterior_data['Posterior Mean'])]
CI_high = [abs(x[1]-y) for x,y in zip(posterior_data['95% CI'], posterior_data['Posterior Mean'])]
# setup
ind = np.arange(len(posterior_data)) # the x locations for the groups
width = 0.35 # the width of the bars
fig = plt.figure(figsize=(20, 6))
ax = fig.add_subplot(111)
rects1 = ax.bar(ind, FSmean, width, color='b')
rects2 = ax.bar(ind+width, PosteriorMean, width, color='cyan', yerr=[CI_low, CI_high])
# labels
ax.set_ylabel('Batting Average')
ax.set_title('Inferenced Batting Averages')
ax.set_xticks(ind+width)
ax.set_xticklabels( tuple(data['Player']) )
ax.legend( (rects1[0], rects2[0]), ('Full Season Mean', 'Posterior Mean') )
def autolabel(rects):
for rect in rects:
height = rect.get_height()
ax.text(rect.get_x()+rect.get_width()/2.0, 1.03*height, str(height),
ha='center', va='bottom')
autolabel(rects1)
autolabel(rects2)