In [6]:
import thinkstats2, thinkplot
import random, math
import numpy as np
Classical Hypothesis Testing - Given a sample and an apparent effect, what is the probability of seeing such an effect by chance?
Choose a test statistic, e.g. the difference in mean pregnancy length between two groups.
Define the null hypothesis - a model of the system based on the assumption that the apparent effect is not real.
Compute a p-value, the probability of seeing the apparent effect if the null hypothesis is true.
Interpret the result. If the p-value is low, the effect is statistically significant, i.e. unlikely to have occurred by chance and likely to appear in the larger population. This doesn't necessarily mean that it is practically significant.
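As a minimal standalone sketch of these four steps (using the same coin counts as the example below, 140 heads and 110 tails; not part of the book's framework, and relying on the numpy import above):

actual = abs(140 - 110)                      # 1. test statistic: |heads - tails|

def run_fair_coin(n):
    # 2. null hypothesis: the coin is fair
    flips = np.random.random(n) < 0.5
    h = flips.sum()
    return abs(h - (n - h))

iters = 1000
test_stats = [run_fair_coin(250) for _ in range(iters)]
# 3. p-value: fraction of simulated runs at least as extreme as the observed effect
p_value = sum(1 for t in test_stats if t >= actual) / float(iters)
# 4. interpret: a small p_value means the imbalance is unlikely under a fair coin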
In [7]:
class HypothesisTest(object):
    """Abstract base class: subclasses define the test statistic and the null model."""

    def __init__(self, data):
        self.data = data
        self.MakeModel()
        self.actual = self.TestStatistic(data)

    def PValue(self, iters=1000):
        # fraction of simulated test statistics at least as extreme as the observed one
        self.test_stats = [self.TestStatistic(self.RunModel())
                           for _ in range(iters)]
        count = sum(1 for x in self.test_stats if x >= self.actual)
        return float(count) / iters

    def TestStatistic(self, data):
        raise UnimplementedMethodException()

    def MakeModel(self):
        pass

    def RunModel(self):
        raise UnimplementedMethodException()
In [8]:
class CoinTest(HypothesisTest):
    """Tests whether a coin is fair, given observed counts of heads and tails."""

    def TestStatistic(self, data):
        heads, tails = data
        test_stat = abs(heads - tails)
        return test_stat

    def RunModel(self):
        # null model: simulate the same number of flips with a fair coin
        heads, tails = self.data
        n = heads + tails
        sample = [random.choice('HT') for _ in range(n)]
        hist = thinkstats2.Hist(sample)
        data = hist['H'], hist['T']
        return data

ct = CoinTest((140, 110))
pvalue = ct.PValue()
In [9]:
pvalue
Out[9]:
By convention, a p-value less than 5% is considered statistically significant. But the p-value depends on the choice of test statistic and on the model of the null hypothesis, so p-values are not precise: less than 1% is unlikely to be due to chance, greater than 10% is plausibly due to chance, and in between is a grey area.
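One way to read that rule of thumb, as a small sketch (the cutoffs are conventions, not part of the framework):

def interpret_pvalue(p):
    if p < 0.01:
        return 'unlikely to be due to chance'
    if p < 0.05:
        return 'statistically significant by convention'
    if p < 0.10:
        return 'grey area'
    return 'plausibly due to chance'

interpret_pvalue(pvalue)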
In [10]:
class DiffMeansPermute(HypothesisTest):
    """Tests a difference in means by permutation."""

    def TestStatistic(self, data):
        group1, group2 = data
        test_stat = abs(group1.mean() - group2.mean())
        return test_stat

    def MakeModel(self):
        group1, group2 = self.data
        self.n, self.m = len(group1), len(group2)
        self.pool = np.hstack((group1, group2))

    def RunModel(self):
        # null model: shuffle the pooled values and split them into two groups
        np.random.shuffle(self.pool)
        data = self.pool[:self.n], self.pool[self.n:]
        return data
In [11]:
import first
live, firsts, others = first.MakeFrames()
data = firsts.prglngth.values, others.prglngth.values
ht = DiffMeansPermute(data)
pvalue = ht.PValue()
In [12]:
pvalue
Out[12]:
HypothesisTest in the book's hypothesis.py also provides PlotCdf, which plots the distribution of the test statistic under the null hypothesis (the class defined above does not implement it):
ht.PlotCdf()
thinkplot.Show(xlabel='test statistic',
               ylabel='CDF')
In [13]:
firsts_wgt = firsts.birthwgt_lb + (firsts.birthwgt_oz / 16)
others_wgt = others.birthwgt_lb + (others.birthwgt_oz / 16)
data2 = firsts_wgt, others_wgt
ht2 = DiffMeansPermute(data2)
pvalue2 = ht2.PValue()
pvalue2
Out[13]:
In [14]:
(firsts_wgt.mean() - others_wgt.mean()) * 16
Out[14]:
In [15]:
class DiffMeansOneSided(DiffMeansPermute):
    def TestStatistic(self, data):
        group1, group2 = data
        test_stat = group1.mean() - group2.mean()
        return test_stat

# rev_data = others.prglngth.values, firsts.prglngth.values
OS_ht = DiffMeansOneSided(data)
OS_pValue = OS_ht.PValue()
OS_pValue
Out[15]:
In [16]:
class DiffStdPermute(DiffMeansPermute):
    def TestStatistic(self, data):
        group1, group2 = data
        test_stat = group1.std() - group2.std()
        return test_stat

STD_ht = DiffStdPermute(data)
STD_pValue = STD_ht.PValue()
STD_pValue
Out[16]:
In [17]:
class CorrelationPermute(HypothesisTest):
    """Tests a correlation by permutation."""

    def TestStatistic(self, data):
        xs, ys = data
        test_stat = abs(thinkstats2.Corr(xs, ys))
        return test_stat

    def RunModel(self):
        # null model: permuting xs breaks any correlation with ys
        xs, ys = self.data
        xs = np.random.permutation(xs)
        return xs, ys
In [19]:
live = live.dropna(subset=['agepreg', 'totalwgt_lb'])
data = live.agepreg.values, live.totalwgt_lb.values
ht = CorrelationPermute(data)
pvalue = ht.PValue()
pvalue
Out[19]:
In [28]:
class DiceTest(HypothesisTest):
    def TestStatistic(self, data):
        observed = data
        n = sum(observed)
        expected = np.ones(6) * n / 6.0
        test_stat = sum(abs(observed - expected))
        return test_stat

    def RunModel(self):
        n = sum(self.data)
        values = [1, 2, 3, 4, 5, 6]
        rolls = np.random.choice(values, n, replace=True)
        hist = thinkstats2.Hist(rolls)
        freqs = hist.Freqs(values)
        return freqs

dice_freqs = [8, 9, 19, 5, 8, 11]
dt = DiceTest(dice_freqs)
pvalue = dt.PValue()
pvalue
Out[28]:
Question: why is the "expected" value for each die side not simply 10? Instead it is the sum of the frequencies divided by 6.
Because n is the total number of dice rolls; the expected count per side is 10 only if you roll the die 60 times.
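Checking the arithmetic for the frequencies above (they happen to sum to 60, so the expected count per side works out to exactly 10 in this particular case):

n = sum([8, 9, 19, 5, 8, 11])   # 60 rolls in total
expected_per_side = n / 6.0     # 10.0 here, but n / 6 in general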
In [29]:
class DiceChiTest(DiceTest):
    def TestStatistic(self, data):
        observed = data
        n = sum(observed)
        expected = np.ones(6) * n / 6
        test_stat = sum((observed - expected)**2 / expected)
        return test_stat

chi_dt = DiceChiTest(dice_freqs)
pvalue = chi_dt.PValue()
pvalue
Out[29]:
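As an optional cross-check (assuming scipy is available, which this notebook does not import), the same statistic can be compared against the analytic chi-squared distribution; the simulated p-value should be in the same ballpark:

from scipy import stats

observed = np.array(dice_freqs)
expected = np.ones(6) * observed.sum() / 6.0
chi2, analytic_p = stats.chisquare(observed, expected)
# chi2 should match chi_dt.actual; analytic_p should be close to the simulated pvalue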
In [41]:
class PregLengthTest(HypothesisTest):
    def MakeModel(self):
        # null hypothesis: both samples are drawn from the same distribution
        firsts, others = self.data
        self.n = len(firsts)
        self.pool = np.hstack((firsts, others))
        pmf = thinkstats2.Pmf(self.pool)
        self.values = range(35, 44)
        self.expected_probs = np.array(pmf.Probs(self.values))

    def RunModel(self):
        np.random.shuffle(self.pool)
        data = self.pool[:self.n], self.pool[self.n:]
        return data

    def TestStatistic(self, data):
        firsts, others = data
        stat = self.ChiSquared(firsts) + self.ChiSquared(others)
        return stat

    def ChiSquared(self, lengths):
        hist = thinkstats2.Hist(lengths)
        observed = np.array(hist.Freqs(self.values))
        # expected_probs must be multiplied by the sample size to get expected counts
        expected = self.expected_probs * len(lengths)
        stat = sum((observed - expected)**2 / expected)
        return stat
In [42]:
data = firsts.prglngth.values, others.prglngth.values
ht = PregLengthTest(data)
p_value = ht.PValue()
p_value
Out[42]:
The false positive rate for a p-value threshold of 5% is 1 in 20, because the p-value is the probability of drawing a test statistic from the null distribution that is at least as extreme as the observed effect. With a 5% threshold, any test statistic that exceeds the 95th percentile of the null distribution counts as positive, which happens 5% of the time even when there is no real effect.
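A rough sketch of how one could check that rate directly (FalsePosRate is not part of the notebook's code): test two groups that are, by construction, drawn from the same pooled population, and count how often the p-value falls below 5%.

def FalsePosRate(data, num_runs=100):
    group1, group2 = data
    pool = np.hstack((group1, group2))
    count = 0.0
    for _ in range(num_runs):
        # both "groups" come from the same pool, so any positive is a false positive
        sample1 = np.random.choice(pool, len(group1), replace=True)
        sample2 = np.random.choice(pool, len(group2), replace=True)
        ht = DiffMeansPermute((sample1, sample2))
        if ht.PValue(iters=101) < 0.05:
            count += 1
    return count / num_runs

# FalsePosRate(data) should come out near 0.05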
The false negative rate depends on the actual effect size. It can be estimated by using the observed samples as a model of the population and running hypothesis tests on simulated data...
In [46]:
def FalseNegRate(data, num_runs=100):
    group1, group2 = data
    count = 0.0
    for i in range(num_runs):
        # Resample takes a sequence and draws a sample of the same
        # length with replacement
        sample1 = thinkstats2.Resample(group1)
        sample2 = thinkstats2.Resample(group2)
        ht = DiffMeansPermute((sample1, sample2))
        pvalue = ht.PValue(iters=101)
        if pvalue > 0.05:
            count += 1
    return count / num_runs
neg_rate = FalseNegRate(data)
neg_rate
Out[46]:
Power of a test - if the actual difference in mean pregnancy length is 0.078 weeks, we expect an experiment with this sample size to yield a positive test only 30% of the time. In general, a power of 80% is considered acceptable; anything less suggests the difference is too small to detect reliably with this sample size.
Partitioning: use one subset of the data for exploration and the other for testing.
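A sketch of what partitioning could look like here (this notebook does not actually do it): shuffle the row indices of the live births and split them into an exploration set and a held-out test set.

indices = np.random.permutation(len(live))
half = len(live) // 2
explore = live.iloc[indices[:half]]   # use for poking around and choosing tests
holdout = live.iloc[indices[half:]]   # use once, to confirm the chosen effects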
In [28]:
def SampleSize(df, size):
    sampled_df = thinkstats2.SampleRows(df, size, replace=True)
    return sampled_df
In [70]:
import estimation

def testError(df1, df2, hTest, sampleFrac):
    # run the given hypothesis test 25 times on resampled subsets and
    # return the mean p-value
    pvs = []
    for i in range(25):
        data1 = SampleSize(df1, len(df1) / sampleFrac)
        data2 = SampleSize(df2, len(df2) / sampleFrac)
        data = data1.values, data2.values
        ht = hTest(data)
        pValue = ht.PValue()
        pvs.append(pValue)
    return sum(pvs) / len(pvs)
# live, firsts, others = first.MakeFrames()
# data = firsts.prglngth.values, others.prglngth.values
# ht = DiffMeansPermute(data)
# pvalue = ht.PValue()
In [88]:
means = []
for i in np.arange(50, 1, -5):
    mean = testError(data2[0], data2[1], DiffMeansPermute, i)
    print i, mean
    means.append(mean)
The first positive p-test I got was at a sample size of 4418 / 25. This experiment clearly showed that the power of the experiment, i.e. the rate at which a positive is detected when a real effect exists, increases greatly with sample size. Or at least I hypothesize that it does; if I ran a correlation test (sketched below), I think it would come up positive, and if I ran a p-test on that supposed correlation, I think it would also stand up to random variation.
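A sketch of the correlation test mentioned above, reusing CorrelationPermute (this assumes the means list from the previous cell lines up with the fractions in np.arange(50, 1, -5)):

fracs = np.arange(50, 1, -5)
sizes = np.array([len(data2[0]) / f for f in fracs], dtype=float)
corr_ht = CorrelationPermute((sizes, np.array(means)))
corr_ht.PValue()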
In [112]:
class DiffMeansResample(DiffMeansPermute):
    def RunModel(self):
        # null model: draw both groups with replacement from the pooled values
        cdf = thinkstats2.Cdf(self.pool)
        group1 = np.random.choice(self.pool, self.n, replace=True)
        group2 = np.random.choice(self.pool, self.m, replace=True)
        data = group1, group2
        return data
In [113]:
data = live.totalwgt_lb.values, live.prglngth.values
ht = DiffMeansResample(data)
pvalue = ht.PValue()
print 'weight vs preglength', pvalue
data = firsts.totalwgt_lb.values, others.totalwgt_lb.values
ht = DiffMeansResample(data)
pvalue = ht.PValue()
print 'first v others weight', pvalue
data = firsts.prglngth.values, others.prglngth.values
ht = DiffMeansResample(data)
pvalue = ht.PValue()
print 'first v others, preglength', pvalue
In [ ]: