Ported to Python (by Ilan Fridman Rojas) from the original R implementation by Rasmus Bååth: http://www.sumsar.net/blog/2015/07/easy-bayesian-bootstrap-in-r/
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
In [2]:
data = pd.read_csv('american_presidents.csv', header=0, index_col=None)
data
Out[2]:
In [3]:
data.describe()
Out[3]:
In [4]:
data.plot(x='order',y='height_cm', color='blue')
Out[4]:
In [5]:
data.plot(y='height_cm', kind='hist', color='blue')
Out[5]:
In [6]:
import random
import numpy.random as npr
The standard bootstrap method
In [7]:
def bootstrap(data, num_samples, statistic, alpha):
    """Returns the results from num_samples bootstrap samples for an input test statistic and its 100*(1-alpha)% confidence level interval."""
    # Generate the indices for the required number of resamplings with replacement
    idx = npr.randint(0, len(data), (num_samples, len(data)))
    # Generate the multiple resampled data sets from the original one
    samples = data[idx]
    # Apply the 'statistic' function to each resampled data set and sort the resulting statistics in increasing order
    stats = np.sort(statistic(samples, 1))
    stat = stats.mean()
    # The values of the sorted statistics at the percentiles set by alpha are, by definition, the boundaries
    # of the confidence interval for that value of alpha, e.g. alpha=0.05 ==> 95% CI
    low_ci = stats[int((alpha / 2.0) * num_samples)]
    high_ci = stats[int((1 - alpha / 2.0) * num_samples)]
    # sd = np.std(stats)
    # To include Bessel's correction for an unbiased standard deviation:
    sd = np.std(stats, ddof=1)
    # or manually:
    # sd = np.sqrt(num_samples / (num_samples - 1.0)) * np.std(stats)
    return stat, sd, low_ci, high_ci
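This function is not exercised further below, but as a quick illustrative check (assuming the `data` DataFrame loaded above) it can be applied to the heights column:
In [ ]:
heights = data['height_cm'].values
stat, sd, low_ci, high_ci = bootstrap(heights, 10000, np.mean, 0.05)
print("Mean: {0:.4g} (sd {1:.4g}), 95% CI: [{2:.4g}, {3:.4g}]".format(stat, sd, low_ci, high_ci))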
The Bayesian bootstrap (with a Dirichlet prior)
(See: http://sumsar.net/blog/2015/04/the-non-parametric-bootstrap-as-a-bayesian-model/ and http://projecteuclid.org/euclid.aos/1176345338 )
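Concretely: where the classical bootstrap assigns each of the $n$ data points an integer resampling count, the Bayesian bootstrap draws a weight vector $(w_1, \ldots, w_n) \sim \mathrm{Dirichlet}(1, \ldots, 1)$ and computes the weighted statistic. For the mean, one posterior draw is $\sum_{i=1}^{n} w_i x_i$, and repeating this many times builds up the posterior distribution of the statistic.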
In [8]:
def bayes_bstrp(data, statistic, nbstrp, samplesize):
    """Implements the Bayesian bootstrap method."""

    def Dirichlet_sample(m, n):
        """Returns a matrix of values drawn from a Dirichlet distribution with parameters = 1.
        'm' rows of values, with 'n' Dirichlet draws in each one."""
        Dirichlet_params = np.ones(m*n)  # Set all Dirichlet distribution parameters to 1 (a flat prior)
        # https://en.wikipedia.org/wiki/Dirichlet_distribution#Gamma_distribution
        Dirichlet_weights = np.asarray([random.gammavariate(a, 1) for a in Dirichlet_params])  # Draw from Gamma distribution
        Dirichlet_weights = Dirichlet_weights.reshape(m, n)  # Fold them (row by row) into a matrix
        row_sums = Dirichlet_weights.sum(axis=1)
        Dirichlet_weights = Dirichlet_weights / row_sums[:, np.newaxis]  # Normalise each row to sum to 1
        return Dirichlet_weights

    Dirich_wgts_matrix = Dirichlet_sample(nbstrp, data.shape[0])  # Generate sample of Dirichlet weights

    # If the statistic can be computed directly from the weights (such as the mean), do so since it is faster
    if statistic == np.mean or statistic == np.average:
        results = np.asarray([np.average(data, weights=Dirich_wgts_matrix[i]) for i in range(nbstrp)])
        return results
    # Otherwise resort to sampling according to the Dirichlet weights and computing the statistic
    else:
        results = np.zeros(nbstrp)
        for i in range(nbstrp):  # Sample from the data according to the Dirichlet weights
            weighted_sample = np.random.choice(data, samplesize, replace=True, p=Dirich_wgts_matrix[i])
            results[i] = statistic(weighted_sample)  # Compute the statistic for each sample
        return results
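As an aside, NumPy implements this draw directly: the Gamma-draw-and-normalise step inside `Dirichlet_sample` is equivalent to the built-in sampler `npr.dirichlet`, as in the sketch below (not used in the rest of the notebook):
In [ ]:
# Equivalent built-in sampler: 10000 rows of flat-Dirichlet weights, one weight per data point
weights = npr.dirichlet(np.ones(data.shape[0]), 10000)
print(weights.sum(axis=1))  # Each row sums to 1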
Test both the weighted-statistic method and the weighted-sampling method
In [9]:
height_data = data['height_cm'].values
posterior_mean = bayes_bstrp(height_data, np.mean, nbstrp=10000, samplesize=1000)
print(posterior_mean)
posterior_median = bayes_bstrp(height_data, np.median, nbstrp=10000, samplesize=1000)
print(posterior_median)
Define a function to compute confidence intervals and use it
In [10]:
def CI(sample, alpha=0.05):
    """Returns the 100*(1-alpha)% confidence level interval for a test statistic computed on a bootstrap sample."""
    sample.sort()
    num_samples = sample.shape[0]
    low_ci = sample[int((alpha / 2.0) * num_samples)]
    high_ci = sample[int((1 - alpha / 2.0) * num_samples)]
    return [low_ci, high_ci]
In [11]:
meanCI = CI(posterior_mean, alpha=0.05)
print "The mean of the posterior is:\t{0:.4g}".format(posterior_mean.mean())
print "With confidence interval:\t[{0:.4g}, {1:.4g}]".format(meanCI[0],meanCI[1])
#print posterior_median.mean(), CI(posterior_median)
In [12]:
fig, ax = plt.subplots(2, 1, sharex=True)
ax[0].hist(height_data, color='blue')
ax[0].set_xlabel('Heights of American Presidents (in cm)')
ax[0].set_ylabel('Frequency')
ax[1].hist(posterior_mean, color='blue')
ax[1].set_xlabel('Bayesian Bootstrap posterior of the mean (95% CI in red)')
ax[1].set_ylabel('Frequency')
ax[1].plot([meanCI[0], meanCI[1]], [0, 0], 'r', linewidth=8)
plt.show()
In [13]:
from scipy import stats
x = data['order'].values
y = data['height_cm'].values
slope, intercept, r_value, p_value, std_err = stats.linregress(x,y)
print(slope)
print(intercept)
In [14]:
def bayes_bstrp1(data, statistic, nbstrp, samplesize):
    """Implements the Bayesian bootstrap method.
    Input can be a 1D NumPy array, or, for test statistics of two variables,
    a Pandas DataFrame with two columns: x, y."""

    def Dirichlet_sample(m, n):
        """Returns a matrix of values drawn from a Dirichlet distribution with parameters = 1.
        'm' rows of values, with 'n' Dirichlet draws in each one."""
        Dirichlet_params = np.ones(m*n)  # Set all Dirichlet distribution parameters to 1 (a flat prior)
        # https://en.wikipedia.org/wiki/Dirichlet_distribution#Gamma_distribution
        Dirichlet_weights = np.asarray([random.gammavariate(a, 1) for a in Dirichlet_params])  # Draw from Gamma distribution
        Dirichlet_weights = Dirichlet_weights.reshape(m, n)  # Fold them (row by row) into a matrix
        row_sums = Dirichlet_weights.sum(axis=1)
        Dirichlet_weights = Dirichlet_weights / row_sums[:, np.newaxis]  # Normalise each row to sum to 1
        return Dirichlet_weights

    Dirich_wgts_matrix = Dirichlet_sample(nbstrp, data.shape[0])  # Generate sample of Dirichlet weights

    if data.ndim == 1:
        # If the statistic can be computed directly from the weights (such as the mean), do so since it is faster
        if statistic == np.mean or statistic == np.average:
            results = np.asarray([np.average(data, weights=Dirich_wgts_matrix[i]) for i in range(nbstrp)])
            return results
        # Otherwise resort to sampling according to the Dirichlet weights and computing the statistic
        else:
            results = np.zeros(nbstrp)
            for i in range(nbstrp):  # Sample from the data according to the Dirichlet weights
                weighted_sample = np.random.choice(data, samplesize, replace=True, p=Dirich_wgts_matrix[i])
                results[i] = statistic(weighted_sample)  # Compute the statistic for each sample
            return results
    elif data.ndim >= 2:
        # If the statistic can be computed directly from the weights (such as the mean), do so since it is faster
        if statistic == np.mean or statistic == np.average:
            results = np.asarray([np.average(data[data.columns[1]].values, weights=Dirich_wgts_matrix[i])
                                  for i in range(nbstrp)])
            return results
        # Otherwise resort to sampling according to the Dirichlet weights and computing the statistic
        else:
            index_sample = np.zeros((nbstrp, samplesize))
            results = []
            for i in range(nbstrp):  # Sample from the data according to the Dirichlet weights
                # Instead of sampling data points directly, we sample over their index (i.e. by row number),
                # which is exactly equivalent but preserves the x,y pairings during the sampling
                index_sample[i, :] = np.random.choice(np.arange(data.shape[0]), samplesize, replace=True,
                                                      p=Dirich_wgts_matrix[i])
                # We index into the DataFrame this way because Pandas does not support slicing like this
                # http://stackoverflow.com/questions/23686561/slice-a-pandas-dataframe-by-an-array-of-indices-and-column-names
                results.append(statistic(data.values[index_sample[i].astype(int), 0],
                                         data.values[index_sample[i].astype(int), 1]))
            return np.array(results)
In [15]:
posterior_mean1 = bayes_bstrp1(height_data, np.mean, nbstrp=10000, samplesize=1000)
print(posterior_mean1)
posterior_median1 = bayes_bstrp1(height_data, np.median, nbstrp=10000, samplesize=1000)
print(posterior_median1)
In [16]:
# Copy the columns containing x and y (in that order) into a new Pandas DataFrame, to be used for Bayesian bootstrap
test_df = data[['order','height_cm']]
linregres_posterior = bayes_bstrp1(test_df, stats.linregress, nbstrp=100, samplesize=60)
print(linregres_posterior)
# Each row holds the five values returned by stats.linregress: slope, intercept, r_value, p_value, std_err
Here we apply the bootstrap method (Bayesian in this case, but the classical bootstrap would work just as well) to a linear regression by bootstrapping the data points themselves.
This is not the only way to apply the bootstrap. One could instead fix the regressors/covariates $x_i$ and the regression coefficients $\mathbf{\beta}$ of the linear fit to the original dataset, and then bootstrap the residuals $y_i - \hat{y}_i(\mathbf{\beta})$ (a short sketch of this approach follows below). The former approach is expected to be more conservative (give larger confidence intervals) than the latter, which is more model-dependent: by fixing the coefficients it implicitly assumes the linear model is essentially correct, so that only the random variation/noise in the data needs to be bootstrapped.
For more on this see the corresponding section in the original work: https://books.google.co.uk/books?id=gLlpIUxRntoC&lpg=PA113&ots=A8xyY7Lcz3&dq=regression%20bootstrap%20data%20or%20residuals&pg=PA113#v=onepage&q=regression%20bootstrap%20data%20or%20residuals&f=false
For a short, concise and clear explanation of the pros and cons of each way of bootstrapping regression models see: http://www.stat.cmu.edu/~cshalizi/uADA/13/lectures/which-bootstrap-when.pdf
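For concreteness, here is a minimal sketch of the residual-resampling variant (written as a classical bootstrap with uniform resampling of residuals; the function name and structure are illustrative, not part of the original analysis):
In [ ]:
def residual_bootstrap_linregress(x, y, nbstrp):
    """Bootstrap a linear regression by resampling residuals, keeping x and the fitted line fixed."""
    slope, intercept = stats.linregress(x, y)[:2]
    y_hat = slope*x + intercept    # Fitted values from the original fit
    residuals = y - y_hat          # Residuals to be resampled
    coeffs = np.zeros((nbstrp, 2))
    for i in range(nbstrp):
        # Add resampled residuals back onto the fixed fitted values, then refit
        y_star = y_hat + npr.choice(residuals, size=len(y), replace=True)
        coeffs[i] = stats.linregress(x, y_star)[:2]
    return coeffs  # Columns: slope, intercept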
In [17]:
slopes = linregres_posterior[:,0]
slopemean = slopes.mean()
slopeCI = CI(slopes)
print("The mean slope and its 95% CI are:\t{0:.4g}\t\t[{1:.4g}, {2:.4g}]".format(slopemean, slopeCI[0], slopeCI[1]))
intercepts = linregres_posterior[:,1]
interceptmean = intercepts.mean()
interceptCI = CI(intercepts)
print("The mean intercept and its 95% CI are:\t{0:.4g}\t\t[{1:.4g}, {2:.4g}]".format(interceptmean, interceptCI[0],
                                                                                     interceptCI[1]))
In [18]:
# Plot the data points
plt.scatter(data['order'].values, data['height_cm'].values)
# The linear function we will use to plot fit coefficients
def linfit(x,slope,intercept):
return slope*x + intercept
x = data['order'].values
y = data['height_cm'].values
# Choose linear regressions for 10 of the bootstrap samples at random and plot them
ids = npr.randint(0, linregres_posterior.shape[0], 10)
otherfits = [linfit(x, linregres_posterior[i,0], linregres_posterior[i,1]) for i in ids]
for i in otherfits:
plt.plot(x, i, color='#BBBBBB')
# The fit to the original data
slope, intercept, r_value, p_value, std_err = stats.linregress(x,y)
plt.plot(x, linfit(x, slope, intercept), color='black', linewidth=2)
plt.xlim(0,x.max()+1)
plt.show()
From this plot and the confidence interval on the slope, there is no clear evidence of a correlation between the two variables.
In [19]:
from statsmodels.nonparametric.smoothers_lowess import lowess
# Note that this lowess function takes the y values as the first argument and x as the second
test_df = data[['height_cm', 'order']]
posterior_loess = bayes_bstrp1(test_df, lowess, nbstrp=100, samplesize=60)
print(posterior_loess)
In [20]:
x = data['order'].values
y = data['height_cm'].values
# To see all the loess curves found:
#for i in posterior_loess:
# plt.plot(i[:,0], i[:,1], color='#BBBBBB')
ids = npr.randint(0, posterior_loess.shape[0], 20)
for i in ids:
plt.plot(posterior_loess[i,:,0], posterior_loess[i,:,1], color='#BBBBBB')
plt.scatter(x, y)
original_loess = lowess(y, x)
plt.plot(original_loess[:,0], original_loess[:,1], color='black', linewidth=2)
plt.xlim(0,x.max()+1)
plt.show()