2.3: Classical confidence intervals


In [1]:
from __future__ import print_function, division
%matplotlib inline

import matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# use matplotlib style sheet
plt.style.use('ggplot')

CI for continuous data, Pg 18


In [2]:
# import the t-distribution from scipy.stats
from scipy.stats import t

In [3]:
y = np.array([35,34,38,35,37])
y


Out[3]:
array([35, 34, 38, 35, 37])

In [4]:
n = len(y)
n


Out[4]:
5

In [5]:
estimate = np.mean(y)
estimate


Out[5]:
35.799999999999997

NumPy uses a denominator of N in its standard deviation calculation by default, instead of N-1. To use N-1, the unbiased estimator, and to agree with the R output, we have to give np.std() the argument ddof=1:
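
A quick illustration of the two conventions, reusing the y defined above (a sketch; the only point is the N versus N-1 denominator):

np.std(y)           # denominator N: NumPy's default (population standard deviation)
np.std(y, ddof=1)   # denominator N-1: unbiased sample standard deviation, matches R's sd()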


In [6]:
se = np.std(y, ddof=1)/np.sqrt(n)
se


Out[6]:
0.73484692283495334

In [7]:
int50 = estimate + t.ppf([0.25, 0.75], n-1)*se
int50


Out[7]:
array([ 35.25570103,  36.34429897])

In [8]:
int95 = estimate + t.ppf([0.025, 0.975], n-1)*se
int95


Out[8]:
array([ 33.75973786,  37.84026214])
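
As a cross-check, SciPy can produce the same interval in a single call with t.interval (the first argument is passed positionally here because recent SciPy versions renamed it from alpha to confidence):

t.interval(0.95, n-1, loc=estimate, scale=se)  # should agree with int95 above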

CI for proportions, Pg 18


In [9]:
from scipy.stats import norm

In [10]:
y = 700
y


Out[10]:
700

In [11]:
n = 1000
n


Out[11]:
1000

In [12]:
estimate = y/n
estimate


Out[12]:
0.7

In [13]:
se = np.sqrt(estimate*(1-estimate)/n)
se


Out[13]:
0.014491376746189439

In [14]:
int95 = estimate + norm.ppf([0.025, 0.975])*se
int95


Out[14]:
array([ 0.67159742,  0.72840258])
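
If statsmodels happens to be installed (it is not used elsewhere in this notebook), the same normal-approximation interval is available as a one-liner:

from statsmodels.stats.proportion import proportion_confint
proportion_confint(y, n, alpha=0.05, method='normal')  # should match int95 above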

CI for discrete data, Pg 18


In [15]:
y = np.repeat([0, 1, 2, 3, 4], [600, 300, 50, 30, 20])
y


Out[15]:
array([0, 0, 0, ..., 4, 4, 4])

In [16]:
n = len(y)
n


Out[16]:
1000

In [17]:
estimate = np.mean(y)
estimate


Out[17]:
0.56999999999999995

See the note above about the different defaults for the standard deviation calculation in NumPy and R.


In [18]:
se = np.std(y, ddof=1)/np.sqrt(n)
se


Out[18]:
0.027674281668470926

In [19]:
int50 = estimate + t.ppf([0.25, 0.75], n-1)*se
int50


Out[19]:
array([ 0.55132718,  0.58867282])

In [20]:
int95 = estimate + t.ppf([0.025, 0.975], n-1)*se
int95


Out[20]:
array([ 0.51569361,  0.62430639])

Plot Figure 2.3, Pg 19

The polls.dat file has an unusual format. The values that we would like to have in a single row are split across four lines:

  • year month
  • percentage support
  • percentage against
  • percentage no opinion

The data seems to be a subset of the Gallup data, available here: http://www.gallup.com/poll/1606/Death-Penalty.aspx

We can see the unusual layout using the bash command head (Linux/macOS only; a portable Python alternative is shown after the output):


In [21]:
%%bash
head ../../ARM_Data/death.polls/polls.dat


2002 10
 70
 25
 05
2002 5
 72
 25
 03
2001 10
 68
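
For readers without bash, a minimal pure-Python equivalent of head (same file path as above):

with open("../../ARM_Data/death.polls/polls.dat") as f:
    for _ in range(10):
        print(f.readline(), end='')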

Using knowledge of the file layout, we can read in the file and pre-process it into appropriate rows/columns to pass into a pandas DataFrame:


In [22]:
# Data is available in death.polls directory of ARM_Data
data = []
temp = []
ncols = 5
with open("../../ARM_Data/death.polls/polls.dat") as f:
    for line in f.readlines():
        for d in line.strip().split(' '):
            temp.append(float(d))
        if (len(temp) == ncols):
            data.append(temp)
            temp = []

polls = pd.DataFrame(data, columns=[u'year', u'month', u'perc for', 
                     u'perc against', u'perc no opinion'])
polls.head()


Out[22]:
   year  month  perc for  perc against  perc no opinion
0  2002     10        70            25                5
1  2002      5        72            25                3
2  2001     10        68            26                6
3  2001      5        65            27                8
4  2001      2        67            25                8

In [23]:
# --Note: this gives the (percent) support among those that have an opinion
# --The percentages with no opinion are ignored
# --This results in differences between our plot (below) and the Gallup plot (link above)
polls[u'support'] = polls[u'perc for']/(polls[u'perc for']+polls[u'perc against'])
polls.head()


Out[23]:
   year  month  perc for  perc against  perc no opinion   support
0  2002     10        70            25                5  0.736842
1  2002      5        72            25                3  0.742268
2  2001     10        68            26                6  0.723404
3  2001      5        65            27                8  0.706522
4  2001      2        67            25                8  0.728261

In [24]:
polls[u'year_float'] = polls[u'year'] + (polls[u'month']-6)/12
polls.head()


Out[24]:
   year  month  perc for  perc against  perc no opinion   support   year_float
0  2002     10        70            25                5  0.736842  2002.333333
1  2002      5        72            25                3  0.742268  2001.916667
2  2001     10        68            26                6  0.723404  2001.333333
3  2001      5        65            27                8  0.706522  2000.916667
4  2001      2        67            25                8  0.728261  2000.666667

In [25]:
# add an error column -- the interval is symmetric, so one column suffices
# assumes a sample size of N=1000 for each poll
# uses +/- 1 standard error, i.e. a 68% confidence interval
polls[u'support_error'] = np.sqrt(polls[u'support']*(1-polls[u'support'])/1000)
polls.head()


Out[25]:
   year  month  perc for  perc against  perc no opinion   support   year_float  support_error
0  2002     10        70            25                5  0.736842  2002.333333       0.013925
1  2002      5        72            25                3  0.742268  2001.916667       0.013831
2  2001     10        68            26                6  0.723404  2001.333333       0.014145
3  2001      5        65            27                8  0.706522  2000.916667       0.014400
4  2001      2        67            25                8  0.728261  2000.666667       0.014068
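
The 68% figure mentioned in the comment above is the normal probability mass within one standard error of the mean; a quick check, using the norm imported earlier:

norm.cdf(1) - norm.cdf(-1)  # approximately 0.6827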

In [26]:
fig, ax = plt.subplots(figsize=(8, 6))
plt.errorbar(polls[u'year_float'], 100*polls[u'support'],
             yerr=100*polls[u'support_error'], fmt='ko',
             ms=4, capsize=0)
plt.ylabel(u'Percentage support for the death penalty')
plt.xlabel(u'Year')

# you can adjust the y-limits with a command like the one below;
# I will leave the default behavior
#plt.ylim(np.min(100*polls[u'support'])-2, np.max(100*polls[u'support'])+2)


Out[26]:
<matplotlib.text.Text at 0x7f2e91e95490>

[Figure 2.3: percentage support for the death penalty by year, with +/- 1 standard-error bars]

Weighted averages, Pg 19

The example R code for this part is incomplete, so I will make up N, p, and se, loosely related to the text on page 19.


In [27]:
N = np.array([66030000, 81083600, 60788845])
p = np.array([0.55, 0.61, 0.38])
se = np.array([0.02, 0.03, 0.03])

In [28]:
w_avg = np.sum(N*p)/np.sum(N)
w_avg


Out[28]:
0.52369397146820473

In [29]:
se_w_avg = np.sqrt(np.sum((N*se/np.sum(N))**2))
se_w_avg


Out[29]:
0.015943244310739957

In [30]:
# this uses +/- 2 standard errors, giving an approximate 95% interval
int_95 = w_avg + np.array([-2,2])*se_w_avg
int_95


Out[30]:
array([ 0.49180748,  0.55558046])
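
The three cells above can be wrapped in a small helper; this is a hypothetical convenience function for reuse, not code from the book:

def weighted_avg_ci(N, p, se, z=2):
    """Weighted average of estimates p with weights N, +/- z standard errors."""
    w = N/np.sum(N)                        # population weights
    w_avg = np.sum(w*p)                    # weighted average of the estimates
    se_w = np.sqrt(np.sum((w*se)**2))      # standard error of the weighted average
    return w_avg + np.array([-z, z])*se_w

weighted_avg_ci(N, p, se)  # reproduces int_95 above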

CI using simulations, Pg 20


In [31]:
# import the normal from scipy.stats
# repeated here to make clear that it is needed for this section
from scipy.stats import norm

# also need this for estimating CI from samples
from scipy.stats.mstats import mquantiles

In [32]:
n_men = 500
n_men


Out[32]:
500

In [33]:
p_hat_men = 0.75
p_hat_men


Out[33]:
0.75

In [34]:
se_men = np.sqrt(p_hat_men*(1.-p_hat_men)/n_men)
se_men


Out[34]:
0.019364916731037084

In [35]:
n_women = 500
n_women


Out[35]:
500

In [36]:
p_hat_women = 0.65
p_hat_women


Out[36]:
0.65

In [37]:
se_women = np.sqrt(p_hat_women*(1.-p_hat_women)/n_women)
se_women


Out[37]:
0.02133072900770154

In [38]:
n_sims = 10000
n_sims


Out[38]:
10000

In [39]:
p_men = norm.rvs(size=n_sims, loc=p_hat_men, scale=se_men)
p_men[:10] # show first ten


Out[39]:
array([ 0.71606181,  0.76107201,  0.75180118,  0.745077  ,  0.76690729,
        0.75554288,  0.71039678,  0.72620403,  0.76487515,  0.75743541])

In [40]:
p_women = norm.rvs(size=n_sims, loc=p_hat_women, scale=se_women)
p_women[:10] # show first ten


Out[40]:
array([ 0.64767734,  0.66734972,  0.59585411,  0.67186777,  0.63478534,
        0.63914566,  0.64321054,  0.69318585,  0.66756578,  0.62953845])

In [41]:
ratio = p_men/p_women
ratio[:10] # show first ten


Out[41]:
array([ 1.10558417,  1.14043953,  1.26172022,  1.10896375,  1.20813642,
        1.18211376,  1.10445451,  1.04763251,  1.14576746,  1.20315988])

In [42]:
# the values of alphap and betap replicate the R default behavior
# see http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mstats.mquantiles.html
int95 = mquantiles(ratio, prob=[0.025,0.975], alphap=1., betap=1.)
int95


Out[42]:
array([ 1.06112134,  1.25035609])
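
Plain NumPy gives the same answer: np.percentile's default linear interpolation corresponds to R's default (type 7) quantile definition, which is exactly what alphap=1, betap=1 select above:

np.percentile(ratio, [2.5, 97.5])  # should match int95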