This notebook seeks to convey a core concept of inferential thinking: given a set of observations from a small sample of a population, attempt to draw robust conclusions about the whole (unobservable) population.
Here we create a hypothetical population through simulation. It is based on the historical discussion in the Data 8 lecture about estimating the size of foreign bomber fleets from observations of their tail markings.
In [2]:
# HIDDEN
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
plots.style.use('fivethirtyeight')
# datascience version number of last run of this notebook
version.__version__
Out[2]:
In [3]:
# The magic number - size of the population that (in the real world)
# we don't know and want to estimate - is fixed inside createPopulation
def createPopulation():
    """Build the simulated population as a one-column table of serial numbers.

    Returns:
        Table: a table with a single "Ser No" column holding the integers
        from ``np.arange(1, 37*55)``, with ``serNo`` registered as the
        column's formatter (five-digit, zero-padded).
    """
    def serNo(x):
        """Format a serial number as a zero-padded, five-digit string."""
        return "{:05d}".format(x)

    # np.arange excludes its stop value, so the serial numbers run 1 .. 37*55 - 1.
    # NOTE(review): if the population was meant to have exactly 37*55 members,
    # the stop should be 37*55 + 1 - confirm before changing.
    serial_numbers = np.arange(1, 37*55)

    # Table().with_columns replaces the deprecated two-argument
    # Table(columns, labels) constructor.
    p = Table().with_columns("Ser No", serial_numbers)
    p.set_format("Ser No", serNo)
    return p
In [4]:
# Create a simulation of the population as a table - an ordered collection of named columns.
# createPopulation() returns the table; the bare name on the last line displays it.
population = createPopulation()
population
Out[4]:
In [5]:
# computational thinking - simulate observing a sample of the population
# sample_size is the number of rows requested from population.sample() for each sample
sample_size = 10
In [18]:
# Draw one sample of sample_size rows from the population.
# with_replacement=True: the same serial number may appear more than once in a sample.
population.sample(sample_size,with_replacement=True)
Out[18]:
In [19]:
# Simulate observing multiple samples
# nsamples is the number of independent samples (one table column each) built below
nsamples = 30
In [20]:
# use iteration to create a table of samples:
# one column per simulated sample, labelled "sample-0", "sample-1", ...
samples = Table()
for k in range(nsamples):
    drawn = population.sample(sample_size, with_replacement=True)
    # keep only the serial numbers of the drawn rows as this sample's column
    samples["sample-" + str(k)] = drawn["Ser No"]
samples
Out[20]:
In [21]:
# gracefully transition between tables and arrays:
# indexing the table by a column label returns that column's values
samples['sample-0']
Out[21]:
In [23]:
# define a function to formally capture an idea about how to do the estimation
def estimateA(smpl):
    """Estimate the population size as the largest value seen in the sample.

    Args:
        smpl: array-like of observed serial numbers.

    Returns:
        The maximum element of ``smpl``.
    """
    observed = np.asarray(smpl)
    return observed.max()
In [24]:
# try estimateA on one of the simulated samples
estimateA(samples['sample-2'])
Out[24]:
In [26]:
# you might come up with lots of other estimators
def estimateB(smpl):
    """Estimate the population size as twice the mean of the sample.

    Args:
        smpl: array-like of observed serial numbers.

    Returns:
        Two times the arithmetic mean of ``smpl``.
    """
    sample_mean = np.mean(smpl)
    return 2 * sample_mean
In [27]:
# verify it works: exercise the newly defined estimateB on one sample
# (this cell previously called estimateA, so estimateB was never checked here)
estimateB(samples["sample-0"])
Out[27]:
In [29]:
# illustrate list comprehension to explore data:
# each s is used as a column label of samples, so this applies estimateB to every sample column
[estimateB(samples[s]) for s in samples]
Out[29]:
In [30]:
# Build a table of estimates: estimateA applied to every sample column.
# Table().with_columns replaces the deprecated two-argument Table(columns, labels) constructor.
estA = Table().with_columns('ests', [estimateA(samples[s]) for s in samples])
estA
Out[30]:
In [31]:
# Look at the behavior of this estimator as a histogram:
# 20 bins, with range running from 1 up to the largest estimate in estA
estA.hist(range=(1,np.max(estA['ests'])),bins=20)
In [32]:
# Computational thinking: estimator as a higher order function
# passed in to a function that creates a table of estimates
def estimate(estimator):
    """Apply an estimator to every column of the global ``samples`` table.

    Args:
        estimator: callable that takes one sample column and returns one estimate.

    Returns:
        Table: a single-column table labelled 'ests' holding one estimate
        per column of ``samples``, in column order.
    """
    # samples is read from notebook scope; it must be defined before calling this.
    ests = [estimator(samples[s]) for s in samples]
    # Table().with_columns replaces the deprecated two-argument
    # Table(columns, labels) constructor.
    return Table().with_columns('ests', ests)
In [33]:
# table of estimateB results, built by passing the estimator function to estimate()
estB = estimate(estimateB)
In [34]:
# histogram of the estimateB results: 20 bins, range from 1 up to the largest estimate in estB
estB.hist(range=(1,np.max(estB['ests'])),bins=20)
In [35]:
# Put both estimators' results side by side, one column per estimator.
# Table().with_columns replaces the deprecated two-argument Table(columns, labels) constructor.
comp = Table().with_columns('estA', estA['ests'], 'estB', estB['ests'])
In [36]:
# display the comparison table (columns 'estA' and 'estB')
comp
Out[36]:
In [37]:
# overlay the histograms of both estimators on shared bin edges: every 50 from 1000 to 2450
# (np.arange excludes the stop value 2500; estimates outside these edges fall outside the bins)
comp.hist(overlay=True, bins=np.arange(1000,2500,50))
In [38]:
# How do these estimates compare with the true size of the population?
population.num_rows
Out[38]:
In [39]:
# Produce a table containing the data associated with a histogram,
# using the same bin edges as the overlaid histogram (every 50 from 1000 to 2450)
ebins = comp.bin(bins=np.arange(1000,2500,50))
ebins.show()
In [ ]: