hypothesis : 수도권과 비수도권의 도서관 당 보유한 자료수의 distribution은 다르다
null hypothesis : 수도권과 비수도권의 도서관 당 보유한 자료수의 distribution은 같다
In [1]:
import json
with open('data1.json', 'r') as g:
data1 = json.load(g)
with open('data2.json', 'r') as g:
data2 = json.load(g)
In [2]:
cd code
In [3]:
from __future__ import print_function, division
import nsfg
import nsfg2
import first
import thinkstats2
import thinkplot
import copy
import random
import numpy as np
import matplotlib.pyplot as pyplot
In [4]:
class DiffMeansPermute(thinkstats2.HypothesisTest):
"""Tests a difference in means by permutation."""
def TestStatistic(self, data):
"""Computes the test statistic.
data: data in whatever form is relevant
"""
group1, group2 = data
test_stat = abs(group1.mean() - group2.mean())
return test_stat
def MakeModel(self):
"""Build a model of the null hypothesis.
"""
group1, group2 = self.data
self.n, self.m = len(group1), len(group2)
self.pool = np.hstack((group1, group2))
def RunModel(self):
"""Run the model of the null hypothesis.
returns: simulated data
"""
np.random.shuffle(self.pool)
data = self.pool[:self.n], self.pool[self.n:]
return data
In [5]:
class DiffMeansOneSided(DiffMeansPermute):
"""Tests a one-sided difference in means by permutation."""
def TestStatistic(self, data):
"""Computes the test statistic.
data: data in whatever form is relevant
"""
group1, group2 = data
test_stat = group1.mean() - group2.mean()
return test_stat
In [6]:
class DiffStdPermute(DiffMeansPermute):
"""Tests a one-sided difference in standard deviation by permutation."""
def TestStatistic(self, data):
"""Computes the test statistic.
data: data in whatever form is relevant
"""
group1, group2 = data
test_stat = group1.std() - group2.std()
return test_stat
In [7]:
def PrintTest(p_value, ht):
"""Prints results from a hypothesis test.
p_value: float
ht: HypothesisTest
"""
print('p-value =', p_value)
print('actual =', ht.actual)
print('ts max =', ht.MaxTestStat())
In [8]:
%matplotlib inline
In [9]:
def RunTests(data, iters=1000):
"""Runs several tests on the given data.
data: pair of sequences
iters: number of iterations to run
"""
# test the difference in means
ht = DiffMeansPermute(data)
p_value = ht.PValue(iters=iters)
print('\nmeans permute two-sided')
PrintTest(p_value, ht)
ht.PlotCdf()
thinkplot.Save(root='hypothesis1',
title='Permutation test',
xlabel='difference in means (books per library)',
ylabel='CDF',
legend=False)
# test the difference in means one-sided
ht = DiffMeansOneSided(data)
p_value = ht.PValue(iters=iters)
print('\nmeans permute one-sided')
PrintTest(p_value, ht)
# test the difference in std
ht = DiffStdPermute(data)
p_value = ht.PValue(iters=iters)
print('\nstd permute one-sided')
PrintTest(p_value, ht)
In [10]:
data_1 = np.array(data1)
data_2 = np.array(data2)
data = data_1, data_2
In [11]:
book = DiffMeansPermute(data)
RunTests(data)
two-sided에서의 pvalue가 0.05보다 크므로 null hypothesis 가 참이라고 할 수 있다.
(one-sided의 값들이 0.05에 가까워서 two-sided의 값으로 결론을 냈습니다.)
그러므로 hypothesis는 거짓이라고 할 수 있다
결론 : 수도권과 비수도권의 도서관 당 보유한 자료수의 distribution은 같다