notebook.community

Edit and run



In [2]:

    
%matplotlib inline
from __future__ import print_function, division

import math
import numpy as np

import first
import thinkplot
import thinkstats2



In [5]:

    
live, firsts, others = first.MakeFrames()
live = live.dropna(subset=['agepreg', 'totalwgt_lb'])
ages = live.agepreg
weights = live.totalwgt_lb

inter, slope = thinkstats2.LeastSquares(ages, weights)
fit_xs, fit_ys = thinkstats2.FitLine(ages, inter, slope)



In [6]:

    
thinkplot.Scatter(ages, weights)
thinkplot.Plot(fit_xs, fit_ys)
thinkplot.Show()









    



/Users/nathankiner/anaconda/lib/python2.7/site-packages/matplotlib/axes/_axes.py:475: UserWarning: No labelled objects found. Use label='...' kwarg on individual plots.
  warnings.warn("No labelled objects found. "
/Users/nathankiner/anaconda/lib/python2.7/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):






    












    





<matplotlib.figure.Figure at 0x107941790>



In [7]:

    
def SamplingDistributions(live, iters=101):
    """Estimates sampling distributions by resampling rows.

    live: DataFrame
    iters: number of times to run simulations

    returns: pair of sequences (inters, slopes)
    """
    t = []
    for _ in range(iters):
        sample = thinkstats2.ResampleRows(live)
        ages = sample.agepreg
        weights = sample.totalwgt_lb
        estimates = thinkstats2.LeastSquares(ages, weights)
        t.append(estimates)

    inters, slopes = zip(*t)
    return inters, slopes



In [9]:

    
def Summarize(estimates, actual=None):
    """Prints standard error and 90% confidence interval.

    estimates: sequence of estimates
    actual: float actual value
    """
    mean = thinkstats2.Mean(estimates)
    stderr = thinkstats2.Std(estimates, mu=actual)
    cdf = thinkstats2.Cdf(estimates)
    ci = cdf.ConfidenceInterval(90)
    print('mean, SE, CI', mean, stderr, ci)



In [10]:

    
inters, slopes = SamplingDistributions(live)



In [12]:

    
Summarize(inters)









    



mean, SE, CI 6.82515624551 0.067539983475 (6.7263898699440032, 6.9402087775962231)



In [13]:

    
Summarize(slopes)









    



mean, SE, CI 0.0175760127125 0.0027386846078 (0.012977429207479583, 0.021492935452254554)



In [14]:

    
def PlotConfidenceIntervals(xs, inters, slopes, 
                            res=None, percent=90, **options):
    """Plots the 90% confidence intervals for weights based on ages.

    xs: sequence
    inters: estimated intercepts
    slopes: estimated slopes
    res: residuals
    percent: what percentile range to show
    """
    fys_seq = []
    for inter, slope in zip(inters, slopes):
        fxs, fys = thinkstats2.FitLine(xs, inter, slope)
        if res is not None:
            fys += np.random.permutation(res)
        fys_seq.append(fys)

    p = (100 - percent) / 2
    percents = p, 100 - p
    low, high = thinkstats2.PercentileRows(fys_seq, percents)
    thinkplot.FillBetween(fxs, low, high, **options)



In [26]:

    
# PlotConfidenceIntervals(ages, inters, slopes, percent=10)
PlotConfidenceIntervals(ages, inters, slopes, percent=50, label='50th Percent')
PlotConfidenceIntervals(ages, inters, slopes, percent=90, label='90th Percent')
thinkplot.Scatter(ages, weights, label='data')
thinkplot.Show(xlabel='ages', ylabel='weights', axis=[15, 40, 5, 10])









    












    





<matplotlib.figure.Figure at 0x10aa134d0>



In [31]:

    
class SlopeTest(thinkstats2.HypothesisTest):
    
    def TestStatistic(self, data):
        ages, weights = data
        _, slope = thinkstats2.LeastSquares(ages, weights)
        return slope
    
    def MakeModel(self):
        _, weights = self.data
        self.ybar = weights.mean()
        self.res = weights - self.ybar
        
    def RunModel(self):
        ages, _ = self.data
        weights = self.ybar + np.random.permutation(self.res)
        return ages, weights



In [32]:

    
ht = SlopeTest((live.agepreg, live.totalwgt_lb))
ht.PValue()









    Out[32]:





0.0



In [33]:

    
import brfss
df = brfss.ReadBrfss()



In [40]:

    
df = df.dropna(subset=['wtkg2', 'htm3'])
df.describe()









    Out[40]:






  
    
      
      age
      sex
      wtyrago
      finalwt
      wtkg2
      htm3
    
  
  
    
      count
      393518.000000
      395832.000000
      388137.000000
      395832.000000
      395832.000000
      395832.000000
    
    
      mean
      54.891207
      1.612730
      79.766921
      562.527274
      79.044413
      168.956188
    
    
      std
      16.742237
      0.487127
      20.574880
      1076.206594
      19.547890
      10.390752
    
    
      min
      18.000000
      1.000000
      22.727273
      1.695143
      20.000000
      61.000000
    
    
      25%
      43.000000
      1.000000
      64.545455
      97.241254
      64.550000
      163.000000
    
    
      50%
      55.000000
      2.000000
      77.272727
      234.914579
      77.270000
      168.000000
    
    
      75%
      67.000000
      2.000000
      90.909091
      593.585278
      90.910000
      178.000000
    
    
      max
      99.000000
      2.000000
      342.272727
      60995.111700
      300.000000
      236.000000



In [41]:

    
heights = df.htm3
log_weights = np.log(df.wtkg2)
inter, slope = thinkstats2.LeastSquares(heights, log_weights)
fit_xs, fit_ys = thinkstats2.FitLine(heights, inter, slope)



In [43]:

    
thinkplot.Scatter(heights, log_weights)
thinkplot.Plot(fit_xs, fit_ys)
thinkplot.Show()









    












    





<matplotlib.figure.Figure at 0x10aa4c690>



In [39]:

    
fit_xs, fit_ys









    Out[39]:





(array([ 61.,  61.,  61., ...,  nan,  nan,  nan]),
 array([ nan,  nan,  nan, ...,  nan,  nan,  nan]))



In [44]:

    
weights = df.wtkg2
thinkplot.Scatter(heights, weights)
thinkplot.Plot(fit_xs, np.exp(fit_ys))
thinkplot.Show(yscale='log')









    












    





<matplotlib.figure.Figure at 0x10a28f910>



In [45]:

    
def SamplingDistributions(live, iters=101):
    """Estimates sampling distributions by resampling rows.

    live: DataFrame
    iters: number of times to run simulations

    returns: pair of sequences (inters, slopes)
    """
    t = []
    for _ in range(iters):
        sample = thinkstats2.ResampleRows(live)
        heights = sample.htm3
        log_weights = np.log(sample.wtkg2)
        estimates = thinkstats2.LeastSquares(heights, log_weights)
        t.append(estimates)

    inters, slopes = zip(*t)
    return inters, slopes



In [46]:

    
i, s = SamplingDistributions(df)



In [47]:

    
def PlotExpConfidenceIntervals(xs, inters, slopes, 
                            res=None, percent=90, **options):
    """Plots the 90% confidence intervals for weights based on ages.

    xs: sequence
    inters: estimated intercepts
    slopes: estimated slopes
    res: residuals
    percent: what percentile range to show
    """
    fys_seq = []
    for inter, slope in zip(inters, slopes):
        fxs, fys = thinkstats2.FitLine(xs, inter, slope)
        if res is not None:
            fys += np.random.permutation(res)
        fys_seq.append(fys)

    p = (100 - percent) / 2
    percents = p, 100 - p
    low, high = thinkstats2.PercentileRows(fys_seq, percents)
    thinkplot.FillBetween(fxs, np.exp(low), np.exp(high), **options)



In [48]:

    
res = thinkstats2.Residuals(heights, log_weights, inter, slope)
r_2 = thinkstats2.CoefDetermination(log_weights, res)
r_2









    Out[48]:





0.28273494311901137



In [50]:

    
PlotExpConfidenceIntervals(heights, i, s)
thinkplot.Scatter(heights, weights)
thinkplot.Show(yscale='log', axis=[100, 220, 50, 150])









    












    





<matplotlib.figure.Figure at 0x10f63fed0>



In [52]:

    
estimates = [thinkstats2.ResampleRows(df).htm3.mean() for _ in range(100)]
Summarize(estimates)









    



mean, SE, CI 168.95427689 0.0147272666203 (168.9313572424665, 168.97788455708482)



In [55]:

    
estimates = [thinkstats2.ResampleRowsWeighted(df, 'finalwt').htm3.mean() for _ in range(100)]
Summarize(estimates)









    



mean, SE, CI 170.494223812 0.0174021186728 (170.46406556316822, 170.5239167121405)



In [56]:

    
s_cdf = thinkstats2.Cdf(slopes)
thinkplot.Cdf(s_cdf)









    Out[56]:





{'xscale': 'linear', 'yscale': 'linear'}



In [54]:

    
def ResampleRowsWeighted(df, attr='finalwgt'):
    """Resamples a DataFrame using probabilities proportional to finalwgt.

    df: DataFrame
    attr: string column name to use as weights

    returns: DataFrame
    """
    weights = df[attr]
    cdf = thinkstats2.Pmf(weights).MakeCdf()
    indices = cdf.Sample(len(weights))
    sample = df.loc[indices]
    return sample



In [ ]:

	age	sex	wtyrago	finalwt	wtkg2	htm3
count	393518.000000	395832.000000	388137.000000	395832.000000	395832.000000	395832.000000
mean	54.891207	1.612730	79.766921	562.527274	79.044413	168.956188
std	16.742237	0.487127	20.574880	1076.206594	19.547890	10.390752
min	18.000000	1.000000	22.727273	1.695143	20.000000	61.000000
25%	43.000000	1.000000	64.545455	97.241254	64.550000	163.000000
50%	55.000000	2.000000	77.272727	234.914579	77.270000	168.000000
75%	67.000000	2.000000	90.909091	593.585278	90.910000	178.000000
max	99.000000	2.000000	342.272727	60995.111700	300.000000	236.000000