In [1]:
import matplotlib.font_manager
myfnt = matplotlib.font_manager.findSystemFonts(fontpaths=None, fontext='ttf')
print myfnt[:10] # 100개만 출력하기
matplotlib.rc('font', family='HYSANB') # 한글 폰트 설정
In [2]:
import thinkstats2
import thinkplot
import nsfg
df = nsfg.ReadFemPreg()
#preg = nsfg.ReadFemPreg()
In [3]:
hist = thinkstats2.Hist(df.birthwgt_lb, label = 'birthwgt_lb')
thinkplot.Hist(hist)
ko_xlab = unicode('값','utf-8')
ko_ylab = unicode('빈도','utf-8')
thinkplot.Show(xlabel=ko_xlab, ylabel=ko_ylab)
In [21]:
hist = thinkstats2.Hist(df.birthwgt_oz, label = 'birthwgt_oz')
thinkplot.Hist(hist)
ko_xlab = unicode('값','utf-8')
ko_ylab = unicode('빈도','utf-8')
thinkplot.Show(xlabel=ko_xlab, ylabel=ko_ylab)
In [22]:
hist = thinkstats2.Hist(df.agepreg, label = 'agepreg')
thinkplot.Hist(hist)
ko_xlab = unicode('나이','utf-8')
ko_ylab = unicode('빈도','utf-8')
thinkplot.Show(xlabel=ko_xlab, ylabel=ko_ylab)
In [30]:
preg = nsfg.ReadFemPreg()
live = preg[preg.outcome == 1]
hist = thinkstats2.Hist(live.prglngth, label = 'prglngth')
thinkplot.Hist(hist)
ko_xlab = unicode('파운드','utf-8')
ko_ylab = unicode('빈도','utf-8')
thinkplot.Show(xlabel=ko_xlab, ylabel=ko_ylab)
In [52]:
first = live[live.birthord == 1]
others = live[live.birthord != 1]
first_hist = thinkstats2.Hist(first.prglngth)
others_hist = thinkstats2.Hist(others.prglngth)
width = 0.45
thinkplot.PrePlot(2)
thinkplot.Hist(first_hist, align='right', width=width, label=unicode('첫째','utf-8'))
thinkplot.Hist(others_hist, align='left', width=width, label=unicode('첫째아님','utf-8'))
ko_xlab = unicode('임신주차','utf-8')
ko_ylab = unicode('빈도','utf-8')
thinkplot.Show(xlabel=ko_xlab, ylabel=ko_ylab)
In [10]:
import thinkstats2
pmf = thinkstats2.Pmf([1,2,2,3,5])
pmf
pmf.Prob(2)
pmf.Normalize()
pmf.Total()
Out[10]:
In [22]:
width = 0.45
preg = nsfg.ReadFemPreg() # 전체 임신 정보 가져오기
live = preg[preg.outcome ==1] # 정상출산인 경우만 가져오기
first = live[live.birthord == 1] # 첫째 아기 정보 가져오기
others = live[live.birthord != 1] # 첫째 아닌 아기 정보 가져오기
### PMF 생성하기
first_pmf = thinkstats2.Pmf(first.prglngth)
others_pmf = thinkstats2.Pmf(others.prglngth)
### 그래프로 도식화하기
thinkplot.PrePlot(2, cols=2)
thinkplot.Hist(first_pmf, align='right', width=width, label=unicode('첫째','utf-8'))
thinkplot.Hist(others_pmf, align='left', width=width, label=unicode('첫째아님','utf-8'))
thinkplot.Config(xlabel=unicode('임신주차','utf-8'),
ylabel=unicode('확률','utf-8'),
axis=[27, 46, 0, 0.6],
legend=)
thinkplot.PrePlot(2)
thinkplot.SubPlot(2)
thinkplot.Pmfs([first_pmf, others_pmf])
thinkplot.Show(xlabel=unicode('임신주차','utf-8'),
axis=[27,46,0,0.6])
In [ ]:
weeks = range(35, 46)
diffs = []
for week in weeks:
p1 = first_pmf.Prob(week)
p2 = others_pmf.Prob(week)
diff = 100 * (p1 -p2)
diffs.append(diff)
thinkplot.Bar(weeks, diffs)
thinkplot.Show(xlabel=unicode('임신주차','utf-8'), ylabel=unicode('백분율점','utf-8'))
In [ ]:
import thinkstats2
d = { 7: 8, 12: 8, 17: 14, 22: 4, 27: 6, 32: 12, 37: 8, 42: 3, 47: 2 }
pmf = thinkstats2.Pmf(d, label=unicode('실제값','utf-8'))
print('mean', pmf.Mean())
def BiasPmf(pmf, label):
new_pmf = pmf.Copy(label=label)
for x, p in pmf.Items():
new_pmf.Mult(x, x)
new_pmf.Normalize()
return new_pmf
biased_pmf = BiasPmf(pmf, label=unicode('관측값','utf-8'))
In [ ]:
thinkplot.PrePlot(2)
thinkplot.Pmfs([pmf, biased_pmf])
thinkplot.Show(xlabel=unicode('학급크기','utf-8'), ylabel='PMF')
In [13]:
preg = nsfg.ReadFemPreg() # 전체 임신 정보 가져오기
live = preg[preg.outcome ==1] # 정상출산인 경우만 가져오기
first = live[live.birthord == 1] # 첫째 아기 정보 가져오기
others = live[live.birthord != 1] # 첫째 아닌 아기 정보 가져오기
### PMF 생성하기
first_pmf = thinkstats2.Hist(first.birthwgt_lb)
others_pmf = thinkstats2.Hist(others.birthwgt_lb)
width = 0.45
thinkplot.PrePlot(2)
thinkplot.Hist(first_pmf, align='right', width=width, label=unicode('첫째','utf-8'))
thinkplot.Hist(others_pmf, align='left', width=width, label=unicode('첫째아님','utf-8'))
ko_xlab = unicode('임신주차','utf-8')
ko_ylab = unicode('빈도','utf-8')
thinkplot.Show(xlabel=ko_xlab, ylabel=ko_ylab)
In [62]:
data = [1,2,2,3,5]
thinkplot.Config(xlabel=unicode('x','utf-8'),
ylabel=unicode('CDF','utf-8'),
axis=[0, 6, 0, 1])
cdf = thinkstats2.Cdf(data, label='prglngth')
thinkplot.Cdf(cdf)
thinkplot.Show(xlabel='x', ylabel='CDF')
In [7]:
cdf = thinkstats2.Cdf(live.prglngth, label=unicode('임신기간','utf-8'))
thinkplot.Cdf(cdf)
thinkplot.Show(xlabel=unicode('임신주차','utf-8'), ylabel='CDF')
In [12]:
first_cdf = thinkstats2.Cdf(first.totalwgt_lb, label=unicode('첫째','utf-8'))
others_cdf = thinkstats2.Cdf(others.totalwgt_lb, label=unicode('첫째아님','utf-8'))
thinkplot.PrePlot(2)
thinkplot.Cdfs([first_cdf,others_cdf])
thinkplot.Show(xlabel=unicode('출생체중(파운드)','utf-8'), ylabel='CDF')
In [21]:
import numpy as np
weights = live.totalwgt_lb
cdf = thinkstats2.Cdf(weights, label='totalwgt_lb')
sample = np.random.choice(weights, 100, replace=True)
ranks = [cdf.PercentileRank(x) for x in sample]
rank_cdf = thinkstats2.Cdf(ranks, label=unicode('백분위 순위','utf-8'))
thinkplot.Cdf(rank_cdf)
thinkplot.Show(xlabel=unicode('백분위 순위','utf-8'), ylabel='CDF')
In [36]:
#%run analytic.py
thinkplot.PrePlot(3)
xs, ps = thinkstats2.RenderExpoCdf(0.5, 0, 3.0, 50)
label = r'$\lambda=%g$' % 0.5
thinkplot.Plot(xs, ps, label=label)
xs, ps = thinkstats2.RenderExpoCdf(2.0, 0, 3.0, 50)
label = r'$\lambda=%g$' % 2.0
thinkplot.Plot(xs, ps, label=label)
xs, ps = thinkstats2.RenderExpoCdf(1.0, 0, 3.0, 50)
label = r'$\lambda=%g$' % 1.0
thinkplot.Plot(xs, ps, label=label)
thinkplot.Show()
In [48]:
df = ReadBabyBoom()
diffs = df.minutes.diff()
cdf = thinkstats2.Cdf(diffs, label='actual')
thinkplot.PrePlot(cols=2)
thinkplot.Config(xlabel=unicode('분(min.)','utf-8'),
ylabel='CDF',
legend=False)
thinkplot.Cdf(cdf)
thinkplot.SubPlot(2)
thinkplot.Config(xlabel=unicode('분(min.)','utf-8'),
ylabel='CCDF',
legend=False,
yscale='log')
thinkplot.Cdf(cdf, complement=True)
thinkplot.Show()
In [7]:
thinkplot.PrePlot(3)
mus = [1.0, 2.0, 3.0]
sigmas = [0.5, 0.4, 0.3]
xs, ps = thinkstats2.RenderNormalCdf(mu=mus[0], sigma=sigmas[0], low=-1.0, high=4.0)
label = r'$\mu=%g$, $\sigma=%g$' % (mus[0], sigmas[0])
thinkplot.Plot(xs, ps, label=label)
xs, ps = thinkstats2.RenderNormalCdf(mu=mus[1], sigma=sigmas[1], low=-1.0, high=4.0)
label = r'$\mu=%g$, $\sigma=%g$' % (mus[1], sigmas[1])
thinkplot.Plot(xs, ps, label=label)
xs, ps = thinkstats2.RenderNormalCdf(mu=mus[2], sigma=sigmas[2], low=-1.0, high=4.0)
label = r'$\mu=%g$, $\sigma=%g$' % (mus[2], sigmas[2])
thinkplot.Plot(xs, ps, label=label)
thinkplot.Show(xlabel='x', ylabel='CDF', title=unicode('가우스 CDF','utf-8'))
In [64]:
thinkplot.PrePlot(1)
# estimate parameters: trimming outliers yields a better fit
preg = nsfg.ReadFemPreg()
weights = preg.totalwgt_lb.dropna()
mu, var = thinkstats2.TrimmedMeanVar(weights, p=0.01)
print('Mean, Var', mu, var)
sigma = math.sqrt(var)
print('Sigma', sigma)
xs, ps = thinkstats2.RenderNormalCdf(mu, sigma, low=0, high=12.5)
thinkplot.Plot(xs, ps, label=unicode('모형','utf-8'), color='0.8')
# plot the data
cdf = thinkstats2.Cdf(weights, label='data')
thinkplot.Cdf(cdf, label=unicode('데이터','utf-8'))
thinkplot.Show(xlabel=unicode('출생체중(파운드)','utf-8'), ylabel='CDF', title=unicode('출생체중','utf-8'))
In [41]:
n=1000
mus = [0, 1, 5]
sigmas = [1, 1, 2]
sample = np.random.normal(mus[0], sigmas[0], n)
xs, ys = thinkstats2.NormalProbability(sample)
label = '$\mu=%d$, $\sigma=%d$' % (mus[0], sigmas[0])
thinkplot.Plot(xs, ys, label=label)
sample = np.random.normal(mus[1], sigmas[1], n)
xs, ys = thinkstats2.NormalProbability(sample)
label = '$\mu=%d$, $\sigma=%d$' % (mus[1], sigmas[1])
thinkplot.Plot(xs, ys, label=label)
sample = np.random.normal(mus[2], sigmas[2], n)
xs, ys = thinkstats2.NormalProbability(sample)
label = '$\mu=%d$, $\sigma=%d$' % (mus[2], sigmas[2])
thinkplot.Plot(xs, ys, label=label)
thinkplot.Show(legend=True)
In [70]:
preg = nsfg.ReadFemPreg()
full_term = preg[preg.prglngth >= 37]
weights = preg.totalwgt_lb.dropna()
term_weights = full_term.totalwgt_lb.dropna()
mean, var = thinkstats2.TrimmedMeanVar(weights, p=0.01)
std = math.sqrt(var)
xs = [-4, 4]
fxs, fys = thinkstats2.FitLine(xs, mean, std)
thinkplot.Plot(fxs, fys, linewidth=4, color='0.8')
thinkplot.PrePlot(2)
xs, ys = thinkstats2.NormalProbability(weights)
thinkplot.Plot(xs, ys, label=unicode('모두생존(all live)','utf-8'))
xs, ys = thinkstats2.NormalProbability(term_weights)
thinkplot.Plot(xs, ys, label=unicode('만삭(full-term)','utf-8'))
thinkplot.Show(xlabel=unicode('평균으로부터 표준편차','utf-8'), ylabel=unicode('출생체중(파운드)','utf-8'), title=unicode('정규확률그림','utf-8'))
In [13]:
df = ReadBrfss(nrows=1000)
weights = df.wtkg2.dropna()
log_weights = np.log10(weights)
# plot weights on linear and log scales
thinkplot.PrePlot(cols=2)
MakeNormalModel(weights)
thinkplot.Config(xlabel=unicode('성인체중 (kg)','utf-8'), ylabel='CDF')
thinkplot.SubPlot(2)
MakeNormalModel(log_weights)
thinkplot.Config(xlabel=unicode('성인체중 (log10 kg)','utf-8'))
thinkplot.Show()
In [10]:
# make normal probability plots on linear and log scales
thinkplot.PrePlot(cols=2)
MakeNormalPlot(weights)
thinkplot.Config(xlabel='z', ylabel=unicode('체중 (kg)','utf-8'))
thinkplot.SubPlot(2)
MakeNormalPlot(log_weights)
thinkplot.Config(xlabel='z', ylabel=unicode('체중 (log10 kg)','utf-8'))
thinkplot.Show()
In [16]:
thinkplot.PrePlot(3)
xmin = 0.5
alpha = [2.0, 1.0, 0.5]
xs, ps = thinkstats2.RenderParetoCdf(xmin, alpha[0], 0, 10.0, n=100)
thinkplot.Plot(xs, ps, label=r'$\alpha=%g$' % alpha[0])
xs, ps = thinkstats2.RenderParetoCdf(xmin, alpha[1], 0, 10.0, n=100)
thinkplot.Plot(xs, ps, label=r'$\alpha=%g$' % alpha[1])
xs, ps = thinkstats2.RenderParetoCdf(xmin, alpha[2], 0, 10.0, n=100)
thinkplot.Plot(xs, ps, label=r'$\alpha=%g$' % alpha[2])
thinkplot.Show(title=unicode('파레토 CDF','utf-8'), xlabel='x', ylabel='CDF')
In [21]:
# % run populations.py
pops = ReadData()
log_pops = np.log10(pops)
cdf = thinkstats2.Cdf(pops, label='data')
cdf_log = thinkstats2.Cdf(log_pops, label='data')
# pareto plot
xs, ys = thinkstats2.RenderParetoCdf(xmin=5000, alpha=1.4, low=0, high=1e7)
thinkplot.Plot(np.log10(xs), 1-ys, label='model', color='0.8')
thinkplot.Cdf(cdf_log, complement=True)
thinkplot.Config(xlabel=unicode('log10 인구','utf-8'),
ylabel='CCDF',
yscale='log')
thinkplot.Show()
In [ ]:
thinkplot.PrePlot(cols=2)
mu, sigma = log_pops.mean(), log_pops.std()
xs, ps = thinkstats2.RenderNormalCdf(mu, sigma, low=0, high=8)
#thinkplot.Plot(xs, ps, label=unicode('모형','utf-8'), color='0.8')
thinkplot.Plot(xs, ps, label='model', color='0.8')
thinkplot.Cdf(cdf_log)
thinkplot.Config(xlabel=unicode('log10 인구','utf-8'),
ylabel='CDF')
thinkplot.SubPlot(2)
thinkstats2.NormalProbabilityPlot(log_pops, label='data')
thinkplot.Config(xlabel='z',
ylabel=unicode('log10 인구','utf-8'),
xlim=[-5, 5])
thinkplot.Show()
In [15]:
# % run density.py
import math
import random
n=1000
mean, var = 163, 52.8
std = math.sqrt(var)
# make a PDF and compute a density, FWIW
pdf = thinkstats2.NormalPdf(mean, std)
print(pdf.Density(mean + std))
# make a PMF and plot it
thinkplot.PrePlot(2)
thinkplot.Pdf(pdf, label=unicode('정규분포','utf-8'))
# make a sample, make an estimated PDF, and plot it
sample = [random.gauss(mean, std) for _ in range(n)]
sample_pdf = thinkstats2.EstimatedPdf(sample)
thinkplot.Pdf(sample_pdf, label=unicode('표본 KDE','utf-8'))
thinkplot.Show(xlabel=unicode('신장(cm)','utf-8'), ylabel=unicode('밀도(density)','utf-8'))
In [27]:
def VertLine(x, y):
thinkplot.Plot([x, x], [0, y], color='0.6', linewidth=1)
live, firsts, others = first.MakeFrames()
data = live.totalwgt_lb.dropna()
print(unicode('출생체중','utf-8'))
mean, median = Summarize(data)
y = 0.35
VertLine(mean, y)
thinkplot.Text(mean-0.5, 0.1*y, unicode('평균','utf-8'), horizontalalignment='right')
VertLine(median, y)
thinkplot.Text(median+2, 0.1*y, unicode('중위수','utf-8'), horizontalalignment='right')
pdf = thinkstats2.EstimatedPdf(data)
thinkplot.Pdf(pdf, label=unicode('출생체중','utf-8'))
thinkplot.Show(xlabel=unicode('파운드','utf-8'), ylabel='PDF')
In [35]:
df = brfss.ReadBrfss(nrows=None)
data = df.wtkg2.dropna()
mean, median = Summarize(data)
y = 0.02499
VertLine(mean, y)
thinkplot.Text(mean+15, 0.1*y, unicode('평균','utf-8'), horizontalalignment='right')
VertLine(median, y)
thinkplot.Text(median-1.5, 0.1*y, unicode('중위수','utf-8'), horizontalalignment='right')
pdf = thinkstats2.EstimatedPdf(data)
thinkplot.Pdf(pdf, label=unicode('성인체중','utf-8'))
thinkplot.Show(xlabel=unicode('킬로그램','utf-8'), ylabel='PDF', xlim=[0,200])
In [69]:
#% run scatter.py
df = brfss.ReadBrfss(nrows=None)
df = df.dropna(subset=['htm3', 'wtkg2'])
def GetHeightWeight(df, hjitter=0.0, wjitter=0.0):
"""Get sequences of height and weight.
df: DataFrame with htm3 and wtkg2
hjitter: float magnitude of random noise added to heights
wjitter: float magnitude of random noise added to weights
returns: tuple of sequences (heights, weights)
"""
heights = df.htm3
if hjitter:
heights = thinkstats2.Jitter(heights, hjitter)
weights = df.wtkg2
if wjitter:
weights = thinkstats2.Jitter(weights, wjitter)
return heights, weights
sample = thinkstats2.SampleRows(df, 5000)
# simple scatter plot
thinkplot.PrePlot(cols=2)
heights, weights = GetHeightWeight(sample)
ScatterPlot(heights, weights)
thinkplot.Config(xlabel=unicode('신장(cm)','utf-8'), ylabel=unicode('체중(kg)','utf-8'))
# scatter plot with jitter
thinkplot.SubPlot(2)
heights, weights = GetHeightWeight(sample, hjitter=1.3, wjitter=0.5)
ScatterPlot(heights, weights)
thinkplot.Show(xlabel=unicode('신장(cm)','utf-8'), ylabel=unicode('체중(kg)','utf-8'))
In [51]:
thinkplot.PrePlot(cols=2)
ScatterPlot(heights, weights, alpha=0.005)
thinkplot.Config(xlabel=unicode('신장(cm)','utf-8'), ylabel=unicode('체중(kg)','utf-8'))
# hexbin plot
thinkplot.SubPlot(2)
heights, weights = GetHeightWeight(df, hjitter=1.3, wjitter=0.5)
HexBin(heights, weights)
thinkplot.Show(xlabel=unicode('신장(cm)','utf-8'), ylabel=unicode('체중(kg)','utf-8'))
In [68]:
def BinnedPercentiles(df):
"""Bin the data by height and plot percentiles of weight for eachbin.
df: DataFrame
"""
cdf = thinkstats2.Cdf(df.htm3)
print('Fraction between 140 and 200 cm', cdf[200] - cdf[140])
bins = np.arange(135, 210, 5)
indices = np.digitize(df.htm3, bins)
groups = df.groupby(indices)
heights = [group.htm3.mean() for i, group in groups][1:-1]
cdfs = [thinkstats2.Cdf(group.wtkg2) for i, group in groups][1:-1]
thinkplot.PrePlot(3)
for percent in [75, 50, 25]:
weights = [cdf.Percentile(percent) for cdf in cdfs]
label = '%dth' % percent
thinkplot.Config(label=label, xlabel=unicode('신장(cm)','utf-8'), ylabel=unicode('체중(kg)','utf-8'))
thinkplot.Plot(heights, weights)
thinkplot.Show()
BinnedPercentiles(df)
In [73]:
# %run estimation.py
def SimulateSample(mu=90, sigma=7.5, n=9, m=1000):
"""Plots the sampling distribution of the sample mean.
mu: hypothetical population mean
sigma: hypothetical population standard deviation
n: sample size
m: number of iterations
"""
def VertLine(x, y=1):
thinkplot.Plot([x, x], [0, y], color='0.8', linewidth=3)
means = []
for _ in range(m):
xs = np.random.normal(mu, sigma, n)
xbar = np.mean(xs)
means.append(xbar)
stderr = RMSE(means, mu)
print('standard error', stderr)
cdf = thinkstats2.Cdf(means)
ci = cdf.Percentile(5), cdf.Percentile(95)
print('confidence interval', ci)
VertLine(ci[0])
VertLine(ci[1])
# plot the CDF
thinkplot.Cdf(cdf)
thinkplot.Show(xlabel=unicode('표본평균','utf-8'),
ylabel='CDF',
title=unicode('표집분포','utf-8'))
SimulateSample(mu=90, sigma=7.5, n=9, m=1000)
In [78]:
#% run hypothesis.py
def RunTests(data, iters=1000):
"""Runs several tests on the given data.
data: pair of sequences
iters: number of iterations to run
"""
# test the difference in means
ht = DiffMeansPermute(data)
p_value = ht.PValue(iters=iters)
print('\nmeans permute two-sided')
PrintTest(p_value, ht)
ht.PlotCdf()
thinkplot.Show(title=unicode('순열검정','utf-8'),
xlabel=unicode('평균 차이(주)','utf-8'),
ylabel='CDF',
legend=False,
xlim=[0,0.25])
# test the difference in means one-sided
ht = DiffMeansOneSided(data)
p_value = ht.PValue(iters=iters)
print('\nmeans permute one-sided')
PrintTest(p_value, ht)
# test the difference in std
ht = DiffStdPermute(data)
p_value = ht.PValue(iters=iters)
print('\nstd permute one-sided')
PrintTest(p_value, ht)
live, firsts, others = first.MakeFrames()
data = firsts.prglngth.values, others.prglngth.values
RunTests(data)
In [91]:
import first
live, _, _ = first.MakeFrames()
live = live.dropna(subset=['agepreg', 'totalwgt_lb'])
def PlotFit(live):
"""Plots a scatter plot and fitted curve.
live: DataFrame
"""
ages = live.agepreg
weights = live.totalwgt_lb
inter, slope = thinkstats2.LeastSquares(ages, weights)
fit_xs, fit_ys = thinkstats2.FitLine(ages, inter, slope)
thinkplot.Scatter(ages, weights, color='gray', alpha=0.1)
thinkplot.Plot(fit_xs, fit_ys, color='white', linewidth=3)
thinkplot.Plot(fit_xs, fit_ys, color='blue', linewidth=2)
thinkplot.Show(xlabel=unicode('연령(년)','utf-8'),
ylabel=unicode('출생체중(파운드)','utf-8'),
axis=[10, 45, 0, 15],
legend=False)
PlotFit(live)
In [93]:
def PlotResiduals(live):
"""Plots percentiles of the residuals.
live: DataFrame
"""
ages = live.agepreg
weights = live.totalwgt_lb
inter, slope = thinkstats2.LeastSquares(ages, weights)
live['residual'] = thinkstats2.Residuals(ages, weights, inter, slope)
bins = np.arange(10, 48, 3)
indices = np.digitize(live.agepreg, bins)
groups = live.groupby(indices)
ages = [group.agepreg.mean() for _, group in groups][1:-1]
cdfs = [thinkstats2.Cdf(group.residual) for _, group in groups][1:-1]
thinkplot.PrePlot(3)
for percent in [75, 50, 25]:
weights = [cdf.Percentile(percent) for cdf in cdfs]
label = '%dth' % percent
thinkplot.Plot(ages, weights, label=label)
thinkplot.Show(xlabel=unicode('연령(년)','utf-8'),
ylabel=unicode('잔차(파운드)','utf-8'),
xlim=[10, 45],
legend=True)
PlotResiduals(live)
In [87]:
#% run linear.py
def PlotSamplingDistributions(live):
"""Plots confidence intervals for the fitted curve and sampling dists.
live: DataFrame
"""
ages = live.agepreg
weights = live.totalwgt_lb
inter, slope = thinkstats2.LeastSquares(ages, weights)
res = thinkstats2.Residuals(ages, weights, inter, slope)
r2 = thinkstats2.CoefDetermination(weights, res)
print('rho', thinkstats2.Corr(ages, weights))
print('R2', r2)
print('R', math.sqrt(r2))
print('Std(ys)', thinkstats2.Std(weights))
print('Std(res)', thinkstats2.Std(res))
# plot the confidence intervals
inters, slopes = SamplingDistributions(live, iters=1001)
PlotConfidenceIntervals(ages, inters, slopes, percent=90,
alpha=0.3, label='90% CI')
thinkplot.Text(42, 7.53, '90%')
PlotConfidenceIntervals(ages, inters, slopes, percent=50,
alpha=0.5, label='50% CI')
thinkplot.Text(42, 7.59, '50%')
thinkplot.Show(xlabel=unicode('연령(년)','utf-8'),
ylabel=unicode('출생체중(파운드)','utf-8'),
legend=False)
PlotSamplingDistributions(live)
In [97]:
def PlotSamplingDistributions(live):
"""Plots confidence intervals for the fitted curve and sampling dists.
live: DataFrame
"""
ages = live.agepreg
weights = live.totalwgt_lb
inter, slope = thinkstats2.LeastSquares(ages, weights)
res = thinkstats2.Residuals(ages, weights, inter, slope)
r2 = thinkstats2.CoefDetermination(weights, res)
print('rho', thinkstats2.Corr(ages, weights))
print('R2', r2)
print('R', math.sqrt(r2))
print('Std(ys)', thinkstats2.Std(weights))
print('Std(res)', thinkstats2.Std(res))
inters, slopes = SamplingDistributions(live, iters=1001)
# plot the sampling distribution of slope under null hypothesis
# and alternate hypothesis
sampling_cdf = thinkstats2.Cdf(slopes)
print('p-value, sampling distribution', sampling_cdf[0])
ht = SlopeTest((ages, weights))
pvalue = ht.PValue()
print('p-value, slope test', pvalue)
print('inter', inter, thinkstats2.Mean(inters))
Summarize(inters, inter)
print('slope', slope, thinkstats2.Mean(slopes))
Summarize(slopes, slope)
thinkplot.PrePlot(2)
thinkplot.Plot([0, 0], [0, 1], color='0.8')
ht.PlotCdf(label=unicode('귀무가설','utf-8'))
thinkplot.Cdf(sampling_cdf, label=unicode('표집분포','utf-8'))
thinkplot.Show(xlabel=unicode('기울기(파운드/년)','utf-8'),
ylabel='CDF',
xlim=[-0.03, 0.03],
loc='upper left')
PlotSamplingDistributions(live)
In [103]:
# % run timeseries.py
def ReadData():
"""Reads data about cannabis transactions.
http://zmjones.com/static/data/mj-clean.csv (더이상 제공하고 있지 않음)
https://raw.githubusercontent.com/AllenDowney/ThinkStats2/master/code/mj-clean.csv
returns: DataFrame
"""
transactions = pandas.read_csv('mj-clean.csv', parse_dates=[5])
return transactions
transactions = ReadData()
dailies = GroupByQualityAndDay(transactions)
def PlotDailies(dailies):
"""Makes a plot with daily prices for different qualities.
dailies: map from name to DataFrame
"""
kor_name = ['고급', '중급', '저급']
thinkplot.PrePlot(rows=3)
for i, (name, daily) in enumerate(dailies.items()):
thinkplot.SubPlot(i+1)
title = unicode('그램당 가격($)','utf-8') if i == 0 else ''
thinkplot.Config(ylim=[0, 20], title=title)
thinkplot.Scatter(daily.ppg, s=10, label= unicode(kor_name[i],'utf-8'))
if i == 2:
pyplot.xticks(rotation=30)
else:
thinkplot.Config(xticks=[])
thinkplot.Show(
formats=FORMATS)
PlotDailies(dailies)
In [106]:
name = 'high'
daily = dailies[name]
def PlotLinearModel(daily, name):
"""Plots a linear fit to a sequence of prices, and the residuals.
daily: DataFrame of daily prices
name: string
"""
model, results = RunLinearModel(daily)
PlotFittedValues(model, results, label=unicode('고급','utf-8'))
thinkplot.Show(
title=unicode('적합된 값','utf-8') ,
xlabel=unicode('연도','utf-8'),
xlim=[-0.1, 3.8],
ylabel=unicode('그램당 가격($)','utf-8'),
formats=FORMATS)
PlotLinearModel(daily, name)
In [109]:
def PlotRollingMean(daily, name):
"""Plots rolling mean and EWMA.
daily: DataFrame of daily prices
"""
dates = pandas.date_range(daily.index.min(), daily.index.max())
reindexed = daily.reindex(dates)
thinkplot.PrePlot(cols=2)
thinkplot.Scatter(reindexed.ppg, s=15, alpha=0.1, label=unicode('고급','utf-8'))
roll_mean = pandas.rolling_mean(reindexed.ppg, 30)
thinkplot.Plot(roll_mean, label=unicode('이동평균','utf-8'))
pyplot.xticks(rotation=30)
thinkplot.Config(ylabel=unicode('그램당 가격($)','utf-8'))
thinkplot.SubPlot(2)
thinkplot.Scatter(reindexed.ppg, s=15, alpha=0.1, label=unicode('고급','utf-8'))
ewma = pandas.ewma(reindexed.ppg, span=30)
thinkplot.Plot(ewma, label='EWMA')
pyplot.xticks(rotation=30)
thinkplot.Show(
formats=FORMATS)
PlotRollingMean(daily, name)
In [111]:
def PlotFilled(daily, name):
"""Plots the EWMA and filled data.
daily: DataFrame of daily prices
"""
filled = FillMissing(daily, span=30)
thinkplot.Scatter(filled.ppg, s=15, alpha=0.3, label=unicode('고급','utf-8'))
thinkplot.Plot(filled.ewma, label='EWMA', alpha=0.4)
pyplot.xticks(rotation=30)
thinkplot.Show(
ylabel=unicode('그램당 가격($)','utf-8'),
formats=FORMATS)
PlotFilled(daily, name)
In [113]:
def MakeAcfPlot(dailies):
"""Makes a figure showing autocorrelation functions.
dailies: map from category name to DataFrame of daily prices
"""
axis = [0, 41, -0.2, 0.2]
thinkplot.PrePlot(cols=2)
PlotAutoCorrelation(dailies, add_weekly=False)
thinkplot.Config(axis=axis,
loc='lower right',
ylabel=unicode('상관','utf-8'),
xlabel=unicode('시차(일별)','utf-8'))
thinkplot.SubPlot(2)
PlotAutoCorrelation(dailies, add_weekly=True)
thinkplot.Show(
axis=axis,
loc='lower right',
xlabel=unicode('시차(일별)','utf-8'),
formats=FORMATS)
MakeAcfPlot(dailies)
In [115]:
years = np.linspace(0, 5, 101)
thinkplot.Scatter(daily.years, daily.ppg, alpha=0.1, label=unicode('고급','utf-8'))
PlotPredictions(daily, years)
xlim = years[0]-0.1, years[-1]+0.1
thinkplot.Show(
title=unicode('예측','utf-8'),
xlabel=unicode('년도','utf-8'),
xlim=xlim,
ylabel=unicode('그램당 가격($)','utf-8'),
formats=FORMATS)
In [117]:
name = 'medium'
daily = dailies[name]
thinkplot.Scatter(daily.years, daily.ppg, alpha=0.1, label=unicode('중급','utf-8'))
PlotIntervals(daily, years)
PlotPredictions(daily, years)
xlim = years[0]-0.1, years[-1]+0.1
thinkplot.Show(
title=unicode('예측','utf-8'),
xlabel=unicode('년도','utf-8'),
xlim=xlim,
ylabel=unicode('그램당 가격($)','utf-8'),
formats=FORMATS)
In [122]:
# %run survival.py
def PlotPregnancyData(preg):
"""Plots survival and hazard curves based on pregnancy lengths.
preg:
"""
complete = preg.query('outcome in [1, 3, 4]').prglngth
print('Number of complete pregnancies', len(complete))
ongoing = preg[preg.outcome == 6].prglngth
print('Number of ongoing pregnancies', len(ongoing))
PlotSurvival(complete)
thinkplot.Show(
xlabel=unicode('시간(주)','utf-8'),
formats=FORMATS,
legend=True,
loc='upper left')
PlotPregnancyData(preg)
In [124]:
def ReadFemResp2002():
"""Reads respondent data from NSFG Cycle 6.
returns: DataFrame
"""
usecols = ['cmmarrhx', 'cmdivorcx', 'cmbirth', 'cmintvw',
'evrmarry', 'finalwgt']
resp = ReadFemResp(usecols=usecols)
CleanData(resp)
return resp
resp6 = ReadFemResp2002()
def PlotMarriageData(resp):
"""Plots hazard and survival functions.
resp: DataFrame of respondents
"""
hf, sf = EstimateSurvival(resp)
thinkplot.PrePlot(rows=2)
thinkplot.Plot(hf)
thinkplot.Config(legend=False)
thinkplot.SubPlot(2)
thinkplot.Plot(sf)
thinkplot.Show(
xlabel=unicode('연령(년)','utf-8'),
ylabel=unicode('미혼 확률','utf-8'),
ylim=[0, 1],
legend=False,
formats=FORMATS)
PlotMarriageData(resp6)
In [128]:
def ResampleSurvival(resp, iters=101):
"""Resamples respondents and estimates the survival function.
resp: DataFrame of respondents
iters: number of resamples
"""
_, sf = EstimateSurvival(resp)
thinkplot.Plot(sf)
low, high = resp.agemarry.min(), resp.agemarry.max()
ts = np.arange(low, high, 1/12.0)
ss_seq = []
for _ in range(iters):
sample = thinkstats2.ResampleRowsWeighted(resp)
_, sf = EstimateSurvival(sample)
ss_seq.append(sf.Probs(ts))
low, high = thinkstats2.PercentileRows(ss_seq, [5, 95])
thinkplot.FillBetween(ts, low, high, color='gray', label='90% CI')
thinkplot.Show(
xlabel=unicode('연령(년)','utf-8'),
ylabel=unicode('미혼 확률','utf-8'),
xlim=[12, 46],
ylim=[0, 1],
formats=FORMATS)
ResampleSurvival(resp6, iters=101)
In [10]:
def ReadFemResp2002():
"""Reads respondent data from NSFG Cycle 6.
returns: DataFrame
"""
usecols = ['cmmarrhx', 'cmdivorcx', 'cmbirth', 'cmintvw',
'evrmarry', 'finalwgt']
resp = ReadFemResp(usecols=usecols)
CleanData(resp)
return resp
# read Cycles 5 and 7
resp5 = ReadFemResp1995()
resp6 = ReadFemResp2002()
resp7 = ReadFemResp2010()
# plot resampled survival functions by decade
resps = [resp5, resp6, resp7]
PlotResampledByDecade(resps)
thinkplot.Show(
xlabel=unicode('연령(년)','utf-8'),
ylabel=unicode('미혼 확률','utf-8'),
xlim=[13, 45],
ylim=[0, 1],
formats=FORMATS)
In [9]:
thinkstats2.RandomSeed(17)
# plot resampled survival functions by decade, with predictions
PlotResampledByDecade(resps, predict_flag=True, omit=[5])
thinkplot.Show(
xlabel=unicode('연령(년)','utf-8'),
ylabel=unicode('미혼 확률','utf-8'),
xlim=[13, 45],
ylim=[0, 1],
formats=FORMATS,
legend=True)
In [14]:
preg = nsfg.ReadFemPreg()
sf1 = PlotPregnancyData(preg)
sf2 = PlotMarriageData(resp6)
def PlotRemainingLifetime(sf1, sf2):
"""Plots remaining lifetimes for pregnancy and age at first marriage.
sf1: SurvivalFunction for pregnancy length
sf2: SurvivalFunction for age at first marriage
"""
thinkplot.PrePlot(cols=2)
rem_life1 = sf1.RemainingLifetime()
thinkplot.Plot(rem_life1)
thinkplot.Config(title=unicode('임신기간','utf-8'),
xlabel=unicode('주(weeks)','utf-8'),
ylabel=unicode('평균 잔여 주(weeks)','utf-8'))
thinkplot.SubPlot(2)
func = lambda pmf: pmf.Percentile(50)
rem_life2 = sf2.RemainingLifetime(filler=np.inf, func=func)
thinkplot.Plot(rem_life2)
thinkplot.Config(title=unicode('초혼 연령','utf-8'),
ylim=[0, 15],
xlim=[11, 31],
xlabel=unicode('연령(년)','utf-8'),
ylabel=unicode('중위수 잔존 년','utf-8'))
thinkplot.Show(
formats=FORMATS)
PlotRemainingLifetime(sf1, sf2)
In [18]:
#%run normal.py
def MakeCltPlots():
"""Makes plot showing distributions of sums converging to normal.
"""
thinkplot.PrePlot(num=3, rows=2, cols=3)
samples = MakeExpoSamples()
NormalPlotSamples(samples, plot=1, ylabel=unicode('지수분포값의 합','utf-8'))
thinkplot.PrePlot(num=3)
samples = MakeLognormalSamples()
NormalPlotSamples(samples, plot=4, ylabel=unicode('로그정규분포값의 합','utf-8'))
thinkplot.Show(legend=False)
MakeCltPlots()
In [19]:
def MakeCltPlots():
"""Makes plot showing distributions of sums converging to normal.
"""
thinkplot.PrePlot(num=3, rows=2, cols=3)
samples = MakeParetoSamples()
NormalPlotSamples(samples, plot=1, ylabel=unicode('파레토분포값의 합','utf-8'))
thinkplot.PrePlot(num=3)
samples = MakeCorrelatedSamples()
NormalPlotSamples(samples, plot=4, ylabel=unicode('상관된 지수분포값의 합','utf-8'))
thinkplot.Show(legend=False)
MakeCltPlots()
In [25]:
import first
def TestCorrelation(live):
"""Tests correlation analytically.
live: DataFrame for live births
"""
n, r, cdf = ResampleCorrelations(live)
model = StudentCdf(n)
thinkplot.Plot(model.xs, model.ps, color='gray',
alpha=0.3, label=unicode('스튜던트 t','utf-8'))
thinkplot.Cdf(cdf, label=unicode('표본','utf-8'))
thinkplot.Show(
xlabel=unicode('상관','utf-8'),
ylabel='CDF',
loc='upper left')
live, firsts, others = first.MakeFrames()
TestCorrelation(live)
In [28]:
def TestChiSquared():
"""Performs a chi-squared test analytically.
"""
data = [8, 9, 19, 5, 8, 11]
dt = hypothesis.DiceChiTest(data)
p_value = dt.PValue(iters=1000)
n, chi2, cdf = len(data), dt.actual, dt.test_cdf
model = ChiSquaredCdf(n)
thinkplot.Plot(model.xs, model.ps, color='gray',
alpha=0.3, label=unicode('카이제곱','utf-8'))
thinkplot.Cdf(cdf, label=unicode('표본','utf-8'))
thinkplot.Show(
xlabel=unicode('카이제곱 통계량','utf-8'),
ylabel='CDF',
loc='lower right')
TestChiSquared()
In [30]:
import first
live, firsts, others = first.MakeFrames()
def MakeFigures(live, firsts, others):
"""Creates several figures for the book.
live: DataFrame
firsts: DataFrame
others: DataFrame
"""
first_wgt = firsts.totalwgt_lb
first_wgt_dropna = first_wgt.dropna()
print('Firsts', len(first_wgt), len(first_wgt_dropna))
#assert len(first_wgt_dropna) == 4381
other_wgt = others.totalwgt_lb
other_wgt_dropna = other_wgt.dropna()
print('Others', len(other_wgt), len(other_wgt_dropna))
#assert len(other_wgt_dropna) == 4706
first_pmf = thinkstats2.Pmf(first_wgt_dropna, label='first')
other_pmf = thinkstats2.Pmf(other_wgt_dropna, label='other')
width = 0.4 / 16
# plot PMFs of birth weights for first babies and others
thinkplot.PrePlot(2)
thinkplot.Hist(first_pmf, align='right', width=width)
thinkplot.Hist(other_pmf, align='left', width=width)
thinkplot.Show(
title='Birth weight',
xlabel='weight (pounds)',
ylabel='PMF')
MakeFigures(live, firsts, others):
In [14]:
import first
live, firsts, others = first.MakeFrames()
first_hist = thinkstats2.Pmf(firsts.totalwgt_lb)
others_hist = thinkstats2.Pmf(others.totalwgt_lb)
width = 0.05
thinkplot.PrePlot(2)
thinkplot.Hist(first_hist, align='right', width=width, label=unicode('첫째','utf-8'))
thinkplot.Hist(others_hist, align='left', width=width, label=unicode('첫째아님','utf-8'))
ko_xlab = unicode('체중(파운드)','utf-8')
ko_ylab = unicode('확률','utf-8')
thinkplot.Show(xlabel=ko_xlab, ylabel=ko_ylab, title=unicode('출생체중 PMF','utf-8'))
In [ ]: