Homework 11

Sean Lubner



In [77]:

    
%pylab inline
import pymc
import pandas as pd
pd.options.display.line_width = 2000 # fixes data frames display issue (bug)

# load the data and take a look
data = pd.read_table('laa_2011_April.txt')
data









    



Populating the interactive namespace from numpy and matplotlib






    



WARNING: pylab import has clobbered these variables: ['test']
`%pylab --no-import-all` prevents importing * from pylab and numpy






    Out[77]:






  
    
      
      RK
      Player
      Team
      Pos
      G
      AB
      R
      H
      2B
      3B
      HR
      RBI
      BB
      SO
      SB
      CS
      AVG
      OBP
      SLG
      OPS
    
  
  
    
      0 
        1
          Aybar, E
       LAA
       SS
       12
        47
        3
       17
       3
       0
       0
        3
        0
        4
       4
       0
       0.362
       0.362
       0.426
       0.787
    
    
      1 
        2
        Izturis, M
       LAA
       3B
       20
        81
        8
       25
       7
       0
       2
        8
        6
       14
       2
       2
       0.309
       0.356
       0.469
       0.825
    
    
      2 
        3
       Kendrick, H
       LAA
       2B
       26
       104
       19
       32
       6
       1
       6
       12
       11
       22
       1
       0
       0.308
       0.385
       0.558
       0.942
    
    
      3 
        4
       Callaspo, A
       LAA
       3B
       24
        89
       13
       27
       3
       0
       2
        8
       12
       10
       0
       0
       0.303
       0.386
       0.404
       0.791
    
    
      4 
        5
        Bourjos, P
       LAA
       CF
       26
        90
       10
       27
       5
       4
       2
        9
        5
       28
       2
       4
       0.300
       0.333
       0.511
       0.844
    
    
      5 
        6
         Conger, H
       LAA
        C
       14
        44
        6
       12
       2
       0
       2
        9
        3
        9
       0
       0
       0.273
       0.333
       0.455
       0.788
    
    
      6 
        7
          Abreu, B
       LAA
       RF
       25
        89
       12
       23
       6
       0
       1
        8
       22
       21
       2
       1
       0.258
       0.405
       0.360
       0.765
    
    
      7 
        8
       Amarista, A
       LAA
       2B
        3
         8
        0
        2
       1
       0
       0
        3
        1
        3
       0
       0
       0.250
       0.300
       0.375
       0.675
    
    
      8 
        8
         Trumbo, M
       LAA
       3B
       23
        84
       11
       21
       5
       0
       4
       13
        2
       21
       2
       0
       0.250
       0.276
       0.452
       0.728
    
    
      9 
       10
         Hunter, T
       LAA
       CF
       26
       103
        8
       22
       2
       0
       3
       11
       10
       24
       1
       0
       0.214
       0.287
       0.320
       0.607
    
    
      10
       11
          Wells, V
       LAA
       CF
       25
       105
       10
       18
       2
       1
       1
        6
        5
       21
       0
       1
       0.171
       0.207
       0.238
       0.445
    
    
      11
       12
         Mathis, J
       LAA
        C
       13
        46
        3
        7
       3
       0
       0
        4
        1
       16
       0
       0
       0.152
       0.167
       0.217
       0.384
    
    
      12
       14
         Wilson, B
       LAA
        C
        5
         8
        0
        1
       0
       0
       0
        0
        1
        2
       0
       0
       0.125
       0.222
       0.125
       0.347

Problem (a)

MLE is the average success rate of each player = $\frac{H}{BA}$ = average:



In [2]:

    
player_MLEs = pd.DataFrame({'Player': data['Player'], 'MLE': data['AVG']})
player_MLEs.set_index('Player', inplace=True)
player_MLEs









    Out[2]:






  
    
      
      MLE
    
    
      Player
      
    
  
  
    
      Aybar, E
       0.362
    
    
      Izturis, M
       0.309
    
    
      Kendrick, H
       0.308
    
    
      Callaspo, A
       0.303
    
    
      Bourjos, P
       0.300
    
    
      Conger, H
       0.273
    
    
      Abreu, B
       0.258
    
    
      Amarista, A
       0.250
    
    
      Trumbo, M
       0.250
    
    
      Hunter, T
       0.214
    
    
      Wells, V
       0.171
    
    
      Mathis, J
       0.152
    
    
      Wilson, B
       0.125

Problem (b)

For a Beta distribution we know that:

From: http://en.wikipedia.org/wiki/Beta_distribution

E[X]=$\frac{\alpha}{\alpha + \beta}$ Var[X]=$\frac{\alpha \beta}{(\alpha + \beta)^2 (\alpha +\beta +1)}$

In 2010 the mean batting average was 0.255 and the variance between players was 0.0011.

Using the values above and solving the two above equations for $\alpha$ and $\beta$:

$\alpha =43.78, \beta =127.92$



In [3]:

    
# load in the model
import PlayerModel

M = pymc.MCMC(PlayerModel)
N_samp = 10000 # simple model, so can do many samplings
N_burn = N_samp/10
M.sample(N_samp, burn=N_burn, thin=5)









    



 [-----------------100%-----------------] 10000 of 10000 complete in 41.0 sec

Problem (c)

Check for convergence (it does converge)



In [5]:

    
from pymc import Matplot

# Plot the outputs, and look at traces for convergence
Matplot.plot(M)

""" note that the traces converge (and because of burning the first 1/10th of the trials
    it is difficult to see the initial converging behavior.  Also note that the auto-correlation
    disappears rapidly """









    



Plotting mu4
Plotting mu0
Plotting mu2
Plotting mu7
Plotting mu8
Plotting mu5
Plotting mu1
Plotting mu6
Plotting mu3
Plotting mu11
Plotting mu9
Plotting mu12
Plotting mu10



In [4]:

    
# To be extra sure of convergence, look at the z-score plots of the different parameters
from pymc import geweke
from pymc.Matplot import geweke_plot
scores = geweke(M)
geweke_plot(scores)

Problem (d)

Compute the posterior means and 95% confidence intervals



In [70]:

    
# load in the full-season data
year_data = pd.read_table('laa_2011_full.txt')

# Pull posterior means and confidence intervals out of M.stats()
means = [round(M.stats()['mu'+str(i)]['mean'], 3) for i in xrange(len(data))]
CIs = [(round(x[0], 3), round(x[1], 3)) for x in [M.stats()['mu'+str(i)]['95% HPD interval'] for i in xrange(len(data))]]
inCI = [(x >= y[0]) and (x <= y[1]) for x,y in zip(year_data['AVG'], CIs)]

posterior_data = pd.DataFrame({'Player':data['Player'], '95% CI':CIs, 'Posterior Mean':means, 
                               'April Mean (MLE)':data['AVG'], 'Full Season Mean':year_data['AVG'],
                               'Full Season in CI?':inCI})
posterior_data.set_index('Player', inplace=True)

print "{0:.0f} of the 13 players' ".format(sum(posterior_data['Full Season in CI?'])) + \
        "full-season batting average falls within the 95% CI"
posterior_data









    



11 of the 13 players' full-season batting average falls within the 95% CI






    Out[70]:






  
    
      
      95% CI
      April Mean (MLE)
      Full Season Mean
      Full Season in CI?
      Posterior Mean
    
    
      Player
      
      
      
      
      
    
  
  
    
      Aybar, E
       (0.219, 0.335)
       0.362
       0.288
        True
       0.278
    
    
      Izturis, M
        (0.224, 0.33)
       0.309
       0.285
        True
       0.274
    
    
      Kendrick, H
       (0.222, 0.327)
       0.308
       0.279
        True
       0.275
    
    
      Callaspo, A
       (0.216, 0.324)
       0.303
       0.276
        True
       0.273
    
    
      Bourjos, P
        (0.222, 0.33)
       0.300
       0.271
        True
       0.270
    
    
      Conger, H
        (0.204, 0.32)
       0.273
       0.262
        True
       0.260
    
    
      Abreu, B
       (0.201, 0.308)
       0.258
       0.254
        True
       0.256
    
    
      Amarista, A
       (0.196, 0.321)
       0.250
       0.253
        True
       0.256
    
    
      Trumbo, M
       (0.198, 0.306)
       0.250
       0.218
        True
       0.253
    
    
      Hunter, T
       (0.191, 0.291)
       0.214
       0.209
        True
       0.238
    
    
      Wells, V
        (0.177, 0.27)
       0.171
       0.189
        True
       0.223
    
    
      Mathis, J
       (0.175, 0.289)
       0.152
       0.174
       False
       0.232
    
    
      Wilson, B
       (0.186, 0.312)
       0.125
       0.154
       False
       0.251

Problem (e)

Plot Full Season Batting Average vs MLE



In [124]:

    
# Plotting data
FSmean = list(posterior_data['Full Season Mean'])
MLE = list(posterior_data['April Mean (MLE)'])

# Setup
ind = np.arange(len(posterior_data))  # the x locations for the groups
width = 0.35       # the width of the bars
fig = plt.figure(figsize=(20, 6))
ax = fig.add_subplot(111)

rects1 = ax.bar(ind, FSmean, width, color='b')
rects2 = ax.bar(ind+width, MLE, width, color='g')

# labels
ax.set_ylabel('Batting Average')
ax.set_title('Inferenced Batting Averages')
ax.set_xticks(ind+width)
ax.set_xticklabels( tuple(data['Player']) )

ax.legend( (rects1[0], rects2[0]), ('Full Season Mean', 'April-Based MLE') )

def autolabel(rects):
    for rect in rects:
        height = rect.get_height()
        ax.text(rect.get_x()+rect.get_width()/2.0, 1.03*height, str(height),
                ha='center', va='bottom')

autolabel(rects1)
autolabel(rects2)

Plot Full Season Batting Average vs Posterior Mean (with confidence interval shown):



In [126]:

    
print "Error bars represent 95% confidence interval."

# Plotting data
FSmean = list(posterior_data['Full Season Mean'])
PosteriorMean = list(posterior_data['Posterior Mean'])
CI_low = [abs(x[0]-y) for x,y in zip(posterior_data['95% CI'], posterior_data['Posterior Mean'])]
CI_high = [abs(x[1]-y) for x,y in zip(posterior_data['95% CI'], posterior_data['Posterior Mean'])]

# setup
ind = np.arange(len(posterior_data))  # the x locations for the groups
width = 0.35       # the width of the bars
fig = plt.figure(figsize=(20, 6))
ax = fig.add_subplot(111)

rects1 = ax.bar(ind, FSmean, width, color='b')
rects2 = ax.bar(ind+width, PosteriorMean, width, color='cyan', yerr=[CI_low, CI_high])

# labels
ax.set_ylabel('Batting Average')
ax.set_title('Inferenced Batting Averages')
ax.set_xticks(ind+width)
ax.set_xticklabels( tuple(data['Player']) )

ax.legend( (rects1[0], rects2[0]), ('Full Season Mean', 'Posterior Mean') )

def autolabel(rects):
    for rect in rects:
        height = rect.get_height()
        ax.text(rect.get_x()+rect.get_width()/2.0, 1.03*height, str(height),
                ha='center', va='bottom')

autolabel(rects1)
autolabel(rects2)









    



Error bars represent 95% confidence interval.

	RK	Player	Team	Pos	G	AB	R	H	2B	3B	HR	RBI	BB	SO	SB	CS	AVG	OBP	SLG	OPS
0	1	Aybar, E	LAA	SS	12	47	3	17	3	0	0	3	0	4	4	0	0.362	0.362	0.426	0.787
1	2	Izturis, M	LAA	3B	20	81	8	25	7	0	2	8	6	14	2	2	0.309	0.356	0.469	0.825
2	3	Kendrick, H	LAA	2B	26	104	19	32	6	1	6	12	11	22	1	0	0.308	0.385	0.558	0.942
3	4	Callaspo, A	LAA	3B	24	89	13	27	3	0	2	8	12	10	0	0	0.303	0.386	0.404	0.791
4	5	Bourjos, P	LAA	CF	26	90	10	27	5	4	2	9	5	28	2	4	0.300	0.333	0.511	0.844
5	6	Conger, H	LAA	C	14	44	6	12	2	0	2	9	3	9	0	0	0.273	0.333	0.455	0.788
6	7	Abreu, B	LAA	RF	25	89	12	23	6	0	1	8	22	21	2	1	0.258	0.405	0.360	0.765
7	8	Amarista, A	LAA	2B	3	8	0	2	1	0	0	3	1	3	0	0	0.250	0.300	0.375	0.675
8	8	Trumbo, M	LAA	3B	23	84	11	21	5	0	4	13	2	21	2	0	0.250	0.276	0.452	0.728
9	10	Hunter, T	LAA	CF	26	103	8	22	2	0	3	11	10	24	1	0	0.214	0.287	0.320	0.607
10	11	Wells, V	LAA	CF	25	105	10	18	2	1	1	6	5	21	0	1	0.171	0.207	0.238	0.445
11	12	Mathis, J	LAA	C	13	46	3	7	3	0	0	4	1	16	0	0	0.152	0.167	0.217	0.384
12	14	Wilson, B	LAA	C	5	8	0	1	0	0	0	0	1	2	0	0	0.125	0.222	0.125	0.347

	MLE
Player
Aybar, E	0.362
Izturis, M	0.309
Kendrick, H	0.308
Callaspo, A	0.303
Bourjos, P	0.300
Conger, H	0.273
Abreu, B	0.258
Amarista, A	0.250
Trumbo, M	0.250
Hunter, T	0.214
Wells, V	0.171
Mathis, J	0.152
Wilson, B	0.125

	95% CI	April Mean (MLE)	Full Season Mean	Full Season in CI?	Posterior Mean
Player
Aybar, E	(0.219, 0.335)	0.362	0.288	True	0.278
Izturis, M	(0.224, 0.33)	0.309	0.285	True	0.274
Kendrick, H	(0.222, 0.327)	0.308	0.279	True	0.275
Callaspo, A	(0.216, 0.324)	0.303	0.276	True	0.273
Bourjos, P	(0.222, 0.33)	0.300	0.271	True	0.270
Conger, H	(0.204, 0.32)	0.273	0.262	True	0.260
Abreu, B	(0.201, 0.308)	0.258	0.254	True	0.256
Amarista, A	(0.196, 0.321)	0.250	0.253	True	0.256
Trumbo, M	(0.198, 0.306)	0.250	0.218	True	0.253
Hunter, T	(0.191, 0.291)	0.214	0.209	True	0.238
Wells, V	(0.177, 0.27)	0.171	0.189	True	0.223
Mathis, J	(0.175, 0.289)	0.152	0.174	False	0.232
Wilson, B	(0.186, 0.312)	0.125	0.154	False	0.251