Критерий | Одновыборочный | Двухвыборочный | Двухвыборочный (связанные выборки) |
---|---|---|---|
Знаков | $\times$ | | $\times$ |
Ранговый | $\times$ | $\times$ | $\times$ |
Перестановочный | $\times$ | $\times$ | $\times$ |
Имеются данные о продажной стоимости недвижимости в Сиэтле для 50 сделок в 2001 году и 50 в 2002. Изменились ли в среднем цены?
In [1]:
import numpy as np
import pandas as pd
import itertools
from scipy import stats
from statsmodels.stats.descriptivestats import sign_test
from statsmodels.stats.weightstats import zconfint
from statsmodels.stats.weightstats import *
In [2]:
# IPython magic: pulls the numpy/matplotlib "pylab" names into the notebook
# namespace and renders figures inline.
# NOTE(review): the `pylab.*` calls below presumably rely on this cell -- confirm.
%pylab inline
In [3]:
# Load the deals table: tab-separated text whose first row holds the column names.
seattle_data = pd.read_csv('seattle.txt', sep = '\t', header = 0)
In [4]:
# Dimensions of the loaded table as (rows, columns).
seattle_data.shape
Out[4]:
In [5]:
# Preview the first few rows of the table.
seattle_data.head()
Out[5]:
In [6]:
# Split the sale prices into two samples by the year of the deal.
price2001 = seattle_data.loc[seattle_data['Year'] == 2001, 'Price']
price2002 = seattle_data.loc[seattle_data['Year'] == 2002, 'Price']
In [7]:
# Side-by-side histograms of the sale prices, one panel per year.
pylab.figure(figsize=(12, 4))
for panel_no, (year_prices, bar_color, year_label) in enumerate(
        [(price2001, 'r', '2001'), (price2002, 'b', '2002')], 1):
    pylab.subplot(1, 2, panel_no)
    pylab.grid()
    pylab.hist(year_prices, color=bar_color)
    pylab.xlabel(year_label)
pylab.show()
$H_0\colon$ медианы стоимости недвижимости в 2001 и 2002 годах совпадают
$H_1\colon$ медианы стоимости недвижимости в 2001 и 2002 годах не совпадают
In [8]:
# zconfint returns a (lower, upper) pair, which fills the two %f slots.
# Parenthesized print works as a statement on Python 2 and a call on Python 3.
print('95%% confidence interval for the mean: [%f, %f]' % zconfint(price2001))
In [9]:
# zconfint returns a (lower, upper) pair, which fills the two %f slots.
# Parenthesized print works as a statement on Python 2 and a call on Python 3.
print('95%% confidence interval for the mean: [%f, %f]' % zconfint(price2002))
$H_0\colon P(X > Y) = \frac1{2}$
$H_1\colon P(X > Y) \neq \frac1{2}$
In [10]:
# The alternative stated above is two-sided, so request it explicitly: the
# default of `alternative` differs between SciPy releases (older ones fall back
# to a deprecated mode that reports half of the two-sided p-value).
stats.mannwhitneyu(price2001, price2002, alternative='two-sided')
Out[10]:
$H_0\colon F_{X_1}(x) = F_{X_2}(x)$
$H_1\colon F_{X_1}(x) = F_{X_2}(x + \Delta), \Delta\neq 0$
In [11]:
def permutation_t_stat_ind(sample1, sample2):
    """Return the test statistic: mean of `sample1` minus mean of `sample2`."""
    first_mean = np.mean(sample1)
    second_mean = np.mean(sample2)
    return first_mean - second_mean
In [12]:
def get_random_combinations(n1, n2, max_combinations):
    """Draw random splits of the positions 0..n1+n2-1 into two groups.

    Parameters
    ----------
    n1, n2 : int
        Sizes of the first and second group.
    max_combinations : int
        Upper bound on the number of splits returned.

    Returns
    -------
    list of (tuple, tuple)
        Each item is (first n1 positions, remaining n2 positions) of one
        shuffled ordering. The unshuffled ordering is always included.
        Orderings are de-duplicated through a set, so fewer than
        `max_combinations` items may come back, and the order of the
        returned list is arbitrary.
    """
    # A real list is required: np.random.shuffle works in place, and a
    # Python 3 `range` object does not support item assignment.
    index = list(range(n1 + n2))
    indices = {tuple(index)}
    for _ in range(max_combinations - 1):
        np.random.shuffle(index)
        indices.add(tuple(index))
    return [(ordering[:n1], ordering[n1:]) for ordering in indices]
In [13]:
def permutation_zero_dist_ind(sample1, sample2, max_combinations = None):
    """Build the permutation null distribution of the difference in means.

    The two samples are pooled, the pooled values are re-split into groups
    of the original sizes, and for every split the statistic
    mean(first group) - mean(second group) is recorded.

    Parameters
    ----------
    sample1, sample2 : array-like
        The two independent samples.
    max_combinations : int or None
        If truthy, splits are drawn at random via `get_random_combinations`
        (at most this many). Otherwise every way of choosing
        len(sample1) positions out of the pooled sample is enumerated,
        which is only feasible for small samples.

    Returns
    -------
    list of float
        One statistic value per split.
    """
    joined_sample = np.hstack((sample1, sample2))
    n1 = len(sample1)
    n = len(joined_sample)

    if max_combinations:
        indices = get_random_combinations(n1, len(sample2), max_combinations)
    else:
        positions = range(n)
        indices = []
        for first in itertools.combinations(positions, n1):
            # Set membership keeps building the complement linear in n.
            chosen = set(first)
            indices.append((first, [i for i in positions if i not in chosen]))

    # list(...) because the index groups may be tuples, and numpy would treat
    # a tuple as a multi-dimensional index rather than a list of positions.
    return [joined_sample[list(first)].mean() - joined_sample[list(second)].mean()
            for first, second in indices]
In [14]:
# Histogram of the statistic under the null hypothesis, built from randomly
# drawn splits of the pooled sample (capped at 1000).
pylab.hist(permutation_zero_dist_ind(price2001, price2002, max_combinations = 1000))
pylab.show()
In [15]:
def permutation_test(sample, mean, max_permutations = None, alternative = 'two-sided'):
    """Permutation test p-value for two independent samples.

    Parameters
    ----------
    sample : array-like
        First sample.
    mean : array-like
        Second sample. Despite its name, it is passed to the helpers as the
        second sample, not as a scalar mean.
    max_permutations : int or None
        Forwarded to `permutation_zero_dist_ind` as its cap on the number
        of splits.
    alternative : {'two-sided', 'less', 'greater'}
        Which tail(s) of the null distribution count as extreme.

    Returns
    -------
    float
        Share of null-distribution values at least as extreme as the
        observed statistic.

    Raises
    ------
    ValueError
        If `alternative` is not one of the three supported values.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    observed = permutation_t_stat_ind(sample, mean)
    null_values = permutation_zero_dist_ind(sample, mean, max_permutations)

    if alternative == 'two-sided':
        extreme = [abs(value) >= abs(observed) for value in null_values]
    elif alternative == 'less':
        extreme = [value <= observed for value in null_values]
    else:
        extreme = [value >= observed for value in null_values]

    # float(...) keeps true division on Python 2 as well.
    return float(sum(extreme)) / len(null_values)
In [16]:
# p-value estimated from at most 10000 random splits.
# Parenthesized print works as a statement on Python 2 and a call on Python 3.
print("p-value: %f" % permutation_test(price2001, price2002, max_permutations = 10000))
In [17]:
# Same test with a larger cap (50000 random splits).
# Parenthesized print works as a statement on Python 2 and a call on Python 3.
print("p-value: %f" % permutation_test(price2001, price2002, max_permutations = 50000))