In [1]:
from __future__ import print_function, division
%matplotlib inline
import matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# use matplotlib style sheet
plt.style.use('ggplot')
In [2]:
# import the t-distribution from scipy.stats
from scipy.stats import t
In [3]:
y = np.array([35, 34, 38, 35, 37])
y
Out[3]:
In [4]:
n = len(y)
n
Out[4]:
In [5]:
estimate = np.mean(y)
estimate
Out[5]:
NumPy uses a denominator of N in the standard deviation calculation by default, instead of N-1. To use N-1 (the unbiased estimator), and to agree with the R output, we have to give np.std() the argument ddof=1:
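As a quick illustration of the difference (a minimal sketch with made-up data, not part of the original analysis):
# the two denominators give different results:
z = np.array([1., 2., 3., 4.])
np.std(z)          # population SD, denominator N   -> ~1.118
np.std(z, ddof=1)  # sample SD, denominator N-1     -> ~1.291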
In [6]:
se = np.std(y, ddof=1)/np.sqrt(n)
se
Out[6]:
In [7]:
int50 = estimate + t.ppf([0.25, 0.75], n-1)*se
int50
Out[7]:
In [8]:
int95 = estimate + t.ppf([0.025, 0.975], n-1)*se
int95
Out[8]:
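As an aside, scipy can compute the same interval in a single call; this is an equivalent formulation of the calculation above, not the original code:
# equivalent 95% interval via scipy's t.interval (same math as above)
t.interval(0.95, n-1, loc=estimate, scale=se)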
In [9]:
from scipy.stats import norm
In [10]:
y = 700
y
Out[10]:
In [11]:
n = 1000
n
Out[11]:
In [12]:
estimate = y/n
estimate
Out[12]:
In [13]:
se = np.sqrt(estimate*(1-estimate)/n)
se
Out[13]:
In [14]:
int95 = estimate + norm.ppf([0.025, 0.975])*se
int95
Out[14]:
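Equivalently (an aside, assuming the same normal approximation), scipy's norm.interval returns both endpoints at once:
# same 95% interval in one call
norm.interval(0.95, loc=estimate, scale=se)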
In [15]:
y = np.repeat([0, 1, 2, 3, 4], [600, 300, 50, 30, 20])
y
Out[15]:
In [16]:
n = len(y)
n
Out[16]:
In [17]:
estimate = np.mean(y)
estimate
Out[17]:
See the note above about the different defaults for the standard deviation denominator in NumPy and R.
In [18]:
se = np.std(y, ddof=1)/np.sqrt(n)
se
Out[18]:
In [19]:
int50 = estimate + t.ppf([0.25, 0.75], n-1)*se
int50
Out[19]:
In [20]:
int95 = estimate + t.ppf([0.025, 0.975], n-1)*se
int95
Out[20]:
The polls.dat file has an unusual format: the data that we would like to have in a single row is split across 4 rows.
The data appear to be a subset of the Gallup data, available here: http://www.gallup.com/poll/1606/Death-Penalty.aspx
We can see the unusual layout using the bash command head (Linux/macOS only, sorry...):
In [21]:
%%bash
head ../../ARM_Data/death.polls/polls.dat
Using knowledge of the file layout, we can read in the file and pre-process it into appropriate rows/columns for passing to a pandas DataFrame:
In [22]:
# Data is available in the death.polls directory of ARM_Data
data = []
temp = []
ncols = 5
with open("../../ARM_Data/death.polls/polls.dat") as f:
    for line in f.readlines():
        for d in line.strip().split(' '):
            temp.append(float(d))
            # once we have collected a full record of 5 values, store it
            if (len(temp) == ncols):
                data.append(temp)
                temp = []
polls = pd.DataFrame(data, columns=[u'year', u'month', u'perc for',
                                    u'perc against', u'perc no opinion'])
polls.head()
Out[22]:
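An aside: since the file is just whitespace-separated numbers, a more compact (if less explicit) way to build the same table is to split the whole file and reshape. This is a sketch under that assumption, not the original code:
# read all numbers at once, then regroup into rows of 5 values
with open("../../ARM_Data/death.polls/polls.dat") as f:
    values = np.array(f.read().split(), dtype=float)
polls_alt = pd.DataFrame(values.reshape(-1, ncols),
                         columns=[u'year', u'month', u'perc for',
                                  u'perc against', u'perc no opinion'])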
In [23]:
# --Note: this gives the (percent) support among those that have an opinion
# --The respondents with no opinion are ignored
# --This results in differences between our plot (below) and the Gallup plot (link above)
polls[u'support'] = polls[u'perc for']/(polls[u'perc for']+polls[u'perc against'])
polls.head()
Out[23]:
In [24]:
# convert (year, month) to a fractional year for plotting (June maps to the middle of the year)
polls[u'year_float'] = polls[u'year'] + (polls[u'month']-6)/12
polls.head()
Out[24]:
In [25]:
# add an error column -- the interval is symmetric, so only one column is needed
# assumes sample size N=1000
# uses +/- 1 standard error, resulting in a 68% confidence interval
polls[u'support_error'] = np.sqrt(polls[u'support']*(1-polls[u'support'])/1000)
polls.head()
Out[25]:
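The correspondence between ±1 standard error and 68% confidence comes from the normal distribution; as a quick check (an aside, not in the original):
# probability mass within +/- 1 standard error under the normal -> ~0.683
norm.cdf(1) - norm.cdf(-1)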
In [26]:
fig, ax = plt.subplots(figsize=(8, 6))
plt.errorbar(polls[u'year_float'], 100*polls[u'support'],
             yerr=100*polls[u'support_error'], fmt='ko',
             ms=4, capsize=0)
plt.ylabel(u'Percentage support for the death penalty')
plt.xlabel(u'Year')
# you can adjust the y-limits with a command like the one below;
# I will leave the default behavior
#plt.ylim(np.min(100*polls[u'support'])-2, np.max(100*polls[u'support'])+2)
Out[26]:
In [27]:
N = np.array([66030000, 81083600, 60788845])
p = np.array([0.55, 0.61, 0.38])
se = np.array([0.02, 0.03, 0.03])
In [28]:
w_avg = np.sum(N*p)/np.sum(N)
w_avg
Out[28]:
In [29]:
se_w_avg = np.sqrt(np.sum((N*se/np.sum(N))**2))
se_w_avg
Out[29]:
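The standard error above follows from the variance of a linear combination: with weights w_i = N_i / sum(N), the variance of sum(w_i * p_i) is sum(w_i^2 * se_i^2). Restated with explicit weights (same math, just an aside):
# same calculation with explicit weights
w = N/np.sum(N)
np.sqrt(np.sum((w*se)**2))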
In [30]:
# this uses +/- 2 standard errors for an approximate 95% interval
int_95 = w_avg + np.array([-2,2])*se_w_avg
int_95
Out[30]:
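An aside: the exact normal multiplier for 95% is about 1.96 rather than 2, so norm.ppf gives a slightly narrower interval:
# exact 95% normal interval (multiplier ~1.96 instead of 2)
w_avg + norm.ppf([0.025, 0.975])*se_w_avg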
In [31]:
# import the normal from scipy.stats
# repeated to make sure that it is clear that it is needed for this section
from scipy.stats import norm
# also need this for estimating CI from samples
from scipy.stats.mstats import mquantiles
In [32]:
n_men = 500
n_men
Out[32]:
In [33]:
p_hat_men = 0.75
p_hat_men
Out[33]:
In [34]:
se_men = np.sqrt(p_hat_men*(1.-p_hat_men)/n_men)
se_men
Out[34]:
In [35]:
n_women = 500
n_women
Out[35]:
In [36]:
p_hat_women = 0.65
p_hat_women
Out[36]:
In [37]:
se_women = np.sqrt(p_hat_women*(1.-p_hat_women)/n_women)
se_women
Out[37]:
In [38]:
n_sims = 10000
n_sims
Out[38]:
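An aside before drawing the simulations: norm.rvs uses NumPy's global random state, so the draws below will differ from run to run. Seeding NumPy first makes them reproducible (the seed value here is an arbitrary choice):
# optional: fix the random state so the simulation draws are reproducible
np.random.seed(42)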
In [39]:
p_men = norm.rvs(size=n_sims, loc=p_hat_men, scale=se_men)
p_men[:10] # show first ten
Out[39]:
In [40]:
p_women = norm.rvs(size=n_sims, loc=p_hat_women, scale=se_women)
p_women[:10] # show first ten
Out[40]:
In [41]:
ratio = p_men/p_women
ratio[:10] # show first ten
Out[41]:
In [42]:
# the values of alphap and betap replicate the R default behavior
# see http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mstats.mquantiles.html
int95 = mquantiles(ratio, prob=[0.025,0.975], alphap=1., betap=1.)
int95
Out[42]:
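An aside: NumPy's default percentile interpolation also matches R's default (type 7) quantile definition, so np.percentile should give essentially the same interval:
# equivalent interval using numpy (note: percentiles, not probabilities)
np.percentile(ratio, [2.5, 97.5])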