Read the BRFSS data and extract heights and log weights.
In [101]:
import brfss
import numpy as np
df = brfss.ReadBrfss(nrows=None)
df = df.dropna(subset=['htm3', 'wtkg2'])
heights, weights = df.htm3, df.wtkg2
weights = np.log10(weights)
Estimate intercept and slope.
In [102]:
import thinkstats2
inter, slope = thinkstats2.LeastSquares(heights, weights)
inter, slope
Out[102]:
Make a scatter plot of the data and show the fitted line.
In [103]:
import thinkplot
thinkplot.Scatter(heights, weights, alpha=0.01)
fxs, fys = thinkstats2.FitLine(heights, inter, slope)
thinkplot.Plot(fxs, fys)
Make the same plot but apply the inverse transform to show weights on a linear (not log) scale.
In [104]:
thinkplot.Scatter(heights, 10**weights, alpha=0.01)
fxs, fys = thinkstats2.FitLine(heights, inter, slope)
thinkplot.Plot(fxs, 10**fys)
Plot percentiles of the residuals.
The lines are flat over most of the range, indicating that the relationship is linear.
The lines are mostly parallel, indicating that the variance of the residuals is the same over the range.
In [105]:
res = thinkstats2.Residuals(heights, weights, inter, slope)
df['residual'] = res
bins = np.arange(130, 210, 5)
indices = np.digitize(df.htm3, bins)
groups = df.groupby(indices)
means = [group.htm3.mean() for i, group in groups][1:-1]
cdfs = [thinkstats2.Cdf(group.residual) for i, group in groups][1:-1]
thinkplot.PrePlot(3)
for percent in [75, 50, 25]:
ys = [cdf.Percentile(percent) for cdf in cdfs]
label = '%dth' % percent
thinkplot.Plot(means, ys, label=label)
Compute correlation.
In [106]:
rho = thinkstats2.Corr(heights, weights)
rho
Out[106]:
Compute coefficient of determination.
In [107]:
r2 = thinkstats2.CoefDetermination(weights, res)
r2
Out[107]:
Confirm that $R^2 = \rho^2$.
In [124]:
rho**2 - r2
Out[124]:
Compute Std(ys), which is the RMSE of predictions that don't use height.
In [109]:
std_ys = thinkstats2.Std(weights)
std_ys
Out[109]:
Compute Std(res), the RMSE of predictions that do use height.
In [110]:
std_res = thinkstats2.Std(res)
std_res
Out[110]:
How much does height information reduce RMSE? About 15%.
In [111]:
1 - std_res / std_ys
Out[111]:
Use resampling to compute sampling distributions for inter and slope.
In [112]:
t = []
for _ in range(100):
sample = thinkstats2.ResampleRows(df)
estimates = thinkstats2.LeastSquares(sample.htm3, np.log10(sample.wtkg2))
t.append(estimates)
inters, slopes = zip(*t)
Plot the sampling distribution of slope.
In [113]:
cdf = thinkstats2.Cdf(slopes)
thinkplot.Cdf(cdf)
Out[113]:
Compute the p-value of the slope.
In [116]:
pvalue = cdf[0]
pvalue
Out[116]:
Compute the 90% confidence interval of slope.
In [117]:
ci = cdf.Percentile(5), cdf.Percentile(95)
ci
Out[117]:
Compute the mean of the sampling distribution.
In [118]:
mean = thinkstats2.Mean(slopes)
mean
Out[118]:
Compute the standard deviation of the sampling distribution, which is the standard error.
In [119]:
stderr = thinkstats2.Std(slopes)
stderr
Out[119]:
Resample using sampling weights.
In [120]:
def ResampleRowsWeighted(df):
"""Resamples a DataFrame using probabilities proportional to finalwt.
live: DataFrame
returns: DataFrame
"""
weights = df.finalwt
cdf = thinkstats2.Pmf(weights.iteritems()).MakeCdf()
indices = cdf.Sample(len(weights))
sample = df.loc[indices]
return sample
Summarize a sampling distribution.
In [121]:
def Summarize(estimates):
mean = thinkstats2.Mean(estimates)
stderr = thinkstats2.Std(estimates)
cdf = thinkstats2.Cdf(estimates)
ci = cdf.Percentile(5), cdf.Percentile(95)
print('mean', mean)
print('stderr', stderr)
print('ci', ci)
Resample rows without weights, and summarize results.
In [123]:
estimates_unweighted = [thinkstats2.ResampleRows(df).htm3.mean() for _ in range(100)]
Summarize(estimates_unweighted)
Resample rows with weights. The estimated mean height is almost 2 cm taller if we take into account the sampling weights, and this difference is much bigger than the sampling error.
In [122]:
estimates_weighted = [ResampleRowsWeighted(df).htm3.mean() for _ in range(100)]
Summarize(estimates_weighted)
In [ ]: