Ported to Python by Ilan Fridman Rojas from the original R implementation by Rasmus Bååth: http://www.sumsar.net/blog/2015/07/easy-bayesian-bootstrap-in-r/


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

In [2]:
data = pd.read_csv('american_presidents.csv', header=0, index_col=None)
data


Out[2]:
order president height_cm
0 1 George Washington 188.0
1 2 John Adams 170.0
2 3 Thomas Jefferson 189.0
3 4 James Madison 163.0
4 5 James Monroe 183.0
5 6 John Quincy Adams 171.0
6 7 Andrew Jackson 185.0
7 8 Martin Van Buren 168.0
8 9 William Henry Harrison 173.0
9 10 John Tyler 183.0
10 11 James K. Polk 173.0
11 12 Zachary Taylor 173.0
12 13 Millard Fillmore 175.0
13 14 Franklin Pierce 178.0
14 15 James Buchanan 183.0
15 16 Abraham Lincoln 192.4
16 17 Andrew Johnson 178.0
17 18 Ulysses S. Grant 173.0
18 19 Rutherford B. Hayes 174.0
19 20 James A. Garfield 183.0
20 21 Chester A. Arthur 188.0
21 22 Grover Cleveland 180.0
22 23 Benjamin Harrison 168.0
23 25 William McKinley 170.0
24 26 Theodore Roosevelt 178.0
25 27 William Howard Taft 182.0
26 28 Woodrow Wilson 180.0
27 29 Warren G. Harding 183.0
28 30 Calvin Coolidge 178.0
29 31 Herbert Hoover 182.0
30 32 Franklin D. Roosevelt 188.0
31 33 Harry S. Truman 175.0
32 34 Dwight D. Eisenhower 179.0
33 35 John F. Kennedy 183.0
34 36 Lyndon B. Johnson 192.0
35 37 Richard Nixon 182.0
36 38 Gerald Ford 183.0
37 39 Jimmy Carter 177.0
38 40 Ronald Reagan 185.0
39 41 George H. W. Bush 188.0
40 42 Bill Clinton 188.0
41 43 George W. Bush 182.0
42 44 Barack Obama 185.0

In [3]:
data.describe()


Out[3]:
order height_cm
count 43.000000 43.00000
mean 22.465116 179.80000
std 12.995143 6.92999
min 1.000000 163.00000
25% 11.500000 174.50000
50% 22.000000 182.00000
75% 33.500000 184.00000
max 44.000000 192.40000

In [4]:
data.plot(x='order',y='height_cm', color='blue')


Out[4]:
[Figure: line plot of height_cm against presidential order]

In [5]:
data['height_cm'].plot(kind='hist', color='blue')


Out[5]:
[Figure: histogram of the presidents' heights]

In [6]:
import random
import numpy.random as npr

The standard bootstrap method


In [7]:
def bootstrap(data, num_samples, statistic, alpha):
    """Returns the test statistic computed over num_samples bootstrap samples of the input data,
    together with its standard deviation and its 100*(1-alpha)% confidence interval."""
    # Generate the index arrays for the required number of resamplings with replacement
    idx = npr.randint(0, len(data), (num_samples, len(data)))
    # Build the resampled data sets from the original one
    samples = data[idx]
    # Apply the given 'statistic' function to each resampled data set and sort the results in ascending order
    stats = np.sort(statistic(samples, 1))
    stat = stats.mean()
    # The sorted statistics at the lower and upper alpha/2 quantiles are, by definition,
    # the boundaries of the confidence interval for that alpha. E.g. alpha=0.05 ==> 95% CI
    low_ci = stats[int((alpha / 2.0) * num_samples)]
    high_ci = stats[int((1 - alpha / 2.0) * num_samples)]

    # Standard deviation of the bootstrap distribution of the statistic,
    # with Bessel's correction (ddof=1) for an unbiased estimate
    sd = np.std(stats, ddof=1)
    return stat, sd, low_ci, high_ci
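As a quick sanity check, the classical bootstrap above can be applied to the mean height. This is a hedged usage example (it assumes the `data` DataFrame loaded in In [2]; exact numbers will vary between runs):

heights = data['height_cm'].values
mean_est, mean_sd, low, high = bootstrap(heights, num_samples=10000,
                                         statistic=np.mean, alpha=0.05)
print("Bootstrap mean: {0:.4g} (sd {1:.4g}), 95% CI [{2:.4g}, {3:.4g}]".format(
      mean_est, mean_sd, low, high))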

In [8]:
def bayes_bstrp(data, statistic, nbstrp, samplesize):
    """Implements the Bayesian bootstrap method."""

    def Dirichlet_sample(m, n):
        """Returns an m-by-n matrix of weights drawn from a flat Dirichlet distribution
        (all concentration parameters equal to 1): m rows, each a draw of n weights."""
        # Draw from a Gamma distribution and normalise each row, which is equivalent
        # to drawing from a Dirichlet distribution:
        # https://en.wikipedia.org/wiki/Dirichlet_distribution#Gamma_distribution
        Dirichlet_params = np.ones(m * n)  # Set Dirichlet distribution parameters
        Dirichlet_weights = np.asarray([random.gammavariate(a, 1) for a in Dirichlet_params])
        Dirichlet_weights = Dirichlet_weights.reshape(m, n)  # Fold them (row by row) into a matrix
        row_sums = Dirichlet_weights.sum(axis=1)
        Dirichlet_weights = Dirichlet_weights / row_sums[:, np.newaxis]  # Normalise each row to sum to 1
        return Dirichlet_weights

    Dirich_wgts_matrix = Dirichlet_sample(nbstrp, data.shape[0])  # Generate the matrix of Dirichlet weights

    # If the statistic can be computed directly from the weights (such as the mean),
    # do so, since it is faster.
    if statistic == np.mean or statistic == np.average:
        results = np.asarray([np.average(data, weights=Dirich_wgts_matrix[i]) for i in range(nbstrp)])
        return results

    # Otherwise resample according to the Dirichlet weights and compute the statistic
    else:
        results = np.zeros(nbstrp)
        for i in range(nbstrp):  # Sample from the data according to the Dirichlet weights
            weighted_sample = np.random.choice(data, samplesize, replace=True, p=Dirich_wgts_matrix[i])
            results[i] = statistic(weighted_sample)  # Compute the statistic for each sample
        return results
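An aside on the design choice: drawing Gamma(1,1) variates and normalising each row is exactly the textbook construction of a flat Dirichlet distribution, but NumPy also exposes this draw directly. A minimal equivalent sketch (not part of the original port, and considerably faster for large weight matrices):

def Dirichlet_sample_np(m, n):
    # One vectorised call; returns an (m, n) matrix whose rows each sum to 1
    return npr.dirichlet(np.ones(n), size=m)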

Test both the weighted-statistic method and the weighted-sampling method


In [9]:
height_data = data['height_cm'].values

posterior_mean = bayes_bstrp(height_data, np.mean, nbstrp=10000, samplesize=1000)
print(posterior_mean)

posterior_median = bayes_bstrp(height_data, np.median, nbstrp=10000, samplesize=1000)
print(posterior_median)


[ 180.85647367  179.98984525  180.99295606 ...,  180.57419551  179.89039703
  179.12630132]
[ 178.  182.  178. ...,  182.  183.  180.]

Define a function to compute confidence intervals and use it


In [10]:
def CI(sample, alpha=0.05):
    """Returns the 100*(1-alpha)% confidence interval for a test statistic
    computed on a bootstrap sample."""
    sample = np.sort(sample)  # Sort a copy, leaving the caller's array untouched
    num_samples = sample.shape[0]
    low_ci = sample[int((alpha / 2.0) * num_samples)]
    high_ci = sample[int((1 - alpha / 2.0) * num_samples)]
    return [low_ci, high_ci]
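Equivalently, the same interval can be read off with np.percentile. A small alternative sketch:

def CI_percentile(sample, alpha=0.05):
    # 100*(1-alpha)% interval via percentiles of the bootstrap sample
    return list(np.percentile(sample, [100 * alpha / 2.0, 100 * (1 - alpha / 2.0)]))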

In [11]:
meanCI = CI(posterior_mean, alpha=0.05)

print "The mean of the posterior is:\t{0:.4g}".format(posterior_mean.mean())
print "With confidence interval:\t[{0:.4g}, {1:.4g}]".format(meanCI[0],meanCI[1])
#print posterior_median.mean(), CI(posterior_median)


The mean of the posterior is:	179.8
With confidence interval:	[177.8, 181.8]
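The same summary can be produced for the posterior of the median, mirroring the commented-out line above (a small aside; `posterior_median` comes from In [9]):

medianCI = CI(posterior_median, alpha=0.05)
print("The posterior mean of the median is:\t{0:.4g}".format(posterior_median.mean()))
print("With confidence interval:\t\t[{0:.4g}, {1:.4g}]".format(medianCI[0], medianCI[1]))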

In [12]:
fig, ax = plt.subplots(2, 1, sharex=True)
ax[0].hist(height_data, color='blue')
ax[0].set_xlabel('Heights of American Presidents (in cm)')
ax[0].set_ylabel('Frequency')
ax[1].hist(posterior_mean, color='blue')
ax[1].set_xlabel('Bayesian Bootstrap posterior of the mean (95% CI in red)')
ax[1].set_ylabel('Frequency')
ax[1].plot([meanCI[0], meanCI[1]], [0, 0], 'r', linewidth=8)
plt.show()



In [13]:
from scipy import stats

x = data['order'].values
y = data['height_cm'].values

slope, intercept, r_value, p_value, std_err = stats.linregress(x,y)
print(slope)
print(intercept)


0.190421199662
175.522165608

In [14]:
def bayes_bstrp1(data, statistic, nbstrp, samplesize):
    """Implements the Bayesian bootstrap method.

    Input can be a 1D NumPy array or, for test statistics of two variables,
    a Pandas DataFrame with two columns: x, y."""

    def Dirichlet_sample(m, n):
        """Returns an m-by-n matrix of weights drawn from a flat Dirichlet distribution
        (all concentration parameters equal to 1): m rows, each a draw of n weights."""
        Dirichlet_params = np.ones(m * n)  # Set Dirichlet distribution parameters
        # https://en.wikipedia.org/wiki/Dirichlet_distribution#Gamma_distribution
        Dirichlet_weights = np.asarray([random.gammavariate(a, 1) for a in Dirichlet_params])  # Draw from Gamma
        Dirichlet_weights = Dirichlet_weights.reshape(m, n)  # Fold them (row by row) into a matrix
        row_sums = Dirichlet_weights.sum(axis=1)
        Dirichlet_weights = Dirichlet_weights / row_sums[:, np.newaxis]  # Normalise each row to sum to 1
        return Dirichlet_weights

    Dirich_wgts_matrix = Dirichlet_sample(nbstrp, data.shape[0])  # Generate the matrix of Dirichlet weights

    if data.ndim == 1:
        # If the statistic can be computed directly from the weights (such as the mean), do so: it is faster.
        if statistic == np.mean or statistic == np.average:
            results = np.asarray([np.average(data, weights=Dirich_wgts_matrix[i]) for i in range(nbstrp)])
            return results

        # Otherwise resample according to the Dirichlet weights and compute the statistic
        else:
            results = np.zeros(nbstrp)
            for i in range(nbstrp):  # Sample from the data according to the Dirichlet weights
                weighted_sample = np.random.choice(data, samplesize, replace=True, p=Dirich_wgts_matrix[i])
                results[i] = statistic(weighted_sample)  # Compute the statistic for each sample
            return results

    elif data.ndim >= 2:
        # If the statistic can be computed directly from the weights (such as the mean), do so: it is faster.
        if statistic == np.mean or statistic == np.average:
            results = np.asarray([np.average(data[data.columns[1]].values, weights=Dirich_wgts_matrix[i])
                                  for i in range(nbstrp)])
            return results

        # Otherwise resample according to the Dirichlet weights and compute the statistic
        else:
            index_sample = np.zeros((nbstrp, samplesize))
            results = []
            for i in range(nbstrp):
                # Instead of sampling data points directly, sample over their index (i.e. by row number),
                # which is exactly equivalent but preserves the x,y pairings during the sampling
                index_sample[i, :] = np.random.choice(np.arange(data.shape[0]), samplesize, replace=True,
                                                      p=Dirich_wgts_matrix[i])
                # Index into the underlying array because Pandas does not support slicing like this:
                #   http://stackoverflow.com/questions/23686561/slice-a-pandas-dataframe-by-an-array-of-indices-and-column-names
                results.append(statistic(data.values[index_sample[i].astype(int), 0],
                                         data.values[index_sample[i].astype(int), 1]))
            return np.array(results)

In [15]:
posterior_mean1 = bayes_bstrp1(height_data, np.mean, nbstrp=10000, samplesize=1000)
print(posterior_mean1)

posterior_median1 = bayes_bstrp1(height_data, np.median, nbstrp=10000, samplesize=1000)
print(posterior_median1)


[ 181.11346261  180.77468872  178.24935833 ...,  179.46000732  179.11265521
  177.70297528]
[ 180.  182.  183. ...,  178.  180.  182.]

In [16]:
# Copy the columns containing x and y (in that order) into a new Pandas DataFrame, to be used for Bayesian bootstrap
test_df = data[['order','height_cm']]

linregres_posterior = bayes_bstrp1(test_df, stats.linregress, nbstrp=100, samplesize=60)
print(linregres_posterior)

# The 5 values in each row are: slope, intercept, r_value, p_value, std_err


[[  2.01722837e-01   1.74321119e+02   3.67077485e-01   3.91402030e-03
    6.71204967e-02]
 [  2.27692509e-01   1.74763637e+02   4.88110673e-01   7.61442295e-05
    5.34591556e-02]
 [  1.80608218e-01   1.77098241e+02   3.18364141e-01   1.31738061e-02
    7.06143997e-02]
 ..., 
 [  2.60460312e-01   1.72566630e+02   6.26107314e-01   8.79213513e-08
    4.25918858e-02]
 [  2.35589991e-01   1.74818884e+02   4.65012582e-01   1.81572945e-04
    5.88939382e-02]]

Below we apply the bootstrap method (Bayesian here, but a classical bootstrap would work just as well) to a linear regression by bootstrapping the data points themselves.

This is not the only way to bootstrap a regression. One could instead fix the regressors/covariates $x_i$, together with the regression coefficients $\mathbf{\beta}$ of the linear fit to the original dataset, and bootstrap only the residuals $y_i - \hat{y}_i(\mathbf{\beta})$. The former approach is expected to be more conservative (give wider confidence intervals) than the latter, which is more model-dependent: by fixing the coefficients it implicitly assumes the linear model is essentially correct and that only the random variation/noise in the data needs to be bootstrapped.

For more on this see the corresponding section in the original work: https://books.google.co.uk/books?id=gLlpIUxRntoC&lpg=PA113&ots=A8xyY7Lcz3&dq=regression%20bootstrap%20data%20or%20residuals&pg=PA113#v=onepage&q=regression%20bootstrap%20data%20or%20residuals&f=false

For a short, concise and clear explanation of the pros and cons of each way of bootstrapping regression models see: http://www.stat.cmu.edu/~cshalizi/uADA/13/lectures/which-bootstrap-when.pdf
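To make the contrast concrete, here is a minimal sketch of the residual-resampling variant under the assumptions just described (the helper `residual_bootstrap` is illustrative and is not used elsewhere in this notebook): fit once, hold the $x_i$ and the fitted line fixed, and bootstrap only the residuals.

def residual_bootstrap(x, y, nbstrp=1000):
    slope, intercept, _, _, _ = stats.linregress(x, y)
    fitted = slope * x + intercept
    residuals = y - fitted
    coefs = np.zeros((nbstrp, 2))
    for i in range(nbstrp):
        # Resample the residuals with replacement and refit on the synthetic response
        y_star = fitted + npr.choice(residuals, size=len(residuals), replace=True)
        s, c, _, _, _ = stats.linregress(x, y_star)
        coefs[i] = s, c
    return coefs  # columns: slope, intercept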


In [17]:
slopes = linregres_posterior[:,0]
slopemean = slopes.mean()
slopeCI = CI(slopes)
print("The mean slope and its 95% CI are:\t{0:.4g}\t\t[{1:.4g}, {2:.4g}]".format(slopemean, slopeCI[0], slopeCI[1]))

intercepts = linregres_posterior[:,1]
interceptmean = intercepts.mean()
interceptCI = CI(intercepts)
print("The mean intercept and its 95% CI are:\t{0:.4g}\t\t[{1:.4g}, {2:.4g}]".format(interceptmean, interceptCI[0],
                                                                                     interceptCI[1]))


The mean slope and its 95% CI are:	0.1923		[-0.01266, 0.4148]
The mean intercept and its 95% CI are:	175.5		[169.8, 182.1]
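One convenience of having posterior draws rather than a single p-value is that tail probabilities can be read off directly. For example (a small aside using the `slopes` array computed above), the posterior probability of a positive slope is just the fraction of draws above zero:

print("P(slope > 0) is approximately {0:.3f}".format((slopes > 0).mean()))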

In [18]:
# Plot the data points
plt.scatter(data['order'].values, data['height_cm'].values)

# The linear function we will use to plot fit coefficients
def linfit(x,slope,intercept):
    return slope*x + intercept

x = data['order'].values
y = data['height_cm'].values

# Choose linear regressions for 10 of the bootstrap samples at random and plot them
ids = npr.randint(0, linregres_posterior.shape[0], 10)
otherfits = [linfit(x, linregres_posterior[i,0], linregres_posterior[i,1]) for i in ids]
for i in otherfits:
    plt.plot(x, i, color='#BBBBBB')

# The fit to the original data
slope, intercept, r_value, p_value, std_err = stats.linregress(x,y)
plt.plot(x, linfit(x, slope, intercept), color='black', linewidth=2)
    
plt.xlim(0,x.max()+1)
plt.show()


From this plot, and from the fact that the 95% confidence interval on the slope includes zero, we can say that the data provide no compelling evidence of a correlation between the two variables.


In [19]:
from statsmodels.nonparametric.smoothers_lowess import lowess

# Note: statsmodels' lowess takes the y values (endog) as its first argument and x (exog) as its second,
# so the columns are put in (y, x) order here
test_df = data[['height_cm', 'order']]

posterior_loess = bayes_bstrp1(test_df, lowess, nbstrp=100, samplesize=60)
print(posterior_loess)


[[[   1.          161.2307358 ]
  [   1.          161.2307358 ]
  [   2.          162.35369256]
  ..., 
  [  36.          184.17097231]
  [  41.          185.58310123]
  [  43.          186.14808695]]

 [[   3.          176.41936072]
  [   4.          176.5736097 ]
  [   5.          176.75774602]
  ..., 
  [  41.          186.72177553]
  [  42.          187.26463385]
  [  42.          187.26463385]]

 [[   1.          185.94217611]
  [   1.          185.94217611]
  [   5.          185.01111205]
  ..., 
  [  41.          186.00813983]
  [  41.          186.00813983]
  [  41.          186.00813983]]

 ..., 
 [[   4.          170.58522278]
  [   4.          170.58522278]
  [   5.          171.18383756]
  ..., 
  [  42.          185.90205238]
  [  42.          185.90205238]
  [  44.          186.77333643]]

 [[   1.          175.28475106]
  [   1.          175.28475106]
  [   1.          175.28475106]
  ..., 
  [  42.          185.44462395]
  [  42.          185.44462395]
  [  42.          185.44462395]]

 [[   1.          173.27495732]
  [   1.          173.27495732]
  [   1.          173.27495732]
  ..., 
  [  43.          184.53737779]
  [  43.          184.53737779]
  [  44.          184.91844895]]]

In [20]:
x = data['order'].values
y = data['height_cm'].values

# To see all the loess curves found:
#for i in posterior_loess:
#    plt.plot(i[:,0], i[:,1], color='#BBBBBB')

ids = npr.randint(0, posterior_loess.shape[0], 20)
for i in ids:
    plt.plot(posterior_loess[i,:,0], posterior_loess[i,:,1], color='#BBBBBB')

plt.scatter(x, y)
    
original_loess = lowess(y, x)
plt.plot(original_loess[:,0], original_loess[:,1], color='black', linewidth=2)
    
plt.xlim(0,x.max()+1)
plt.show()
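If a pointwise uncertainty band is preferred to the spaghetti of individual curves, one option (a sketch, assuming `posterior_loess`, `original_loess`, `x` and `y` from the cells above) is to interpolate every bootstrap curve onto a common grid and take percentiles at each grid point:

grid = np.linspace(x.min(), x.max(), 50)
curves = np.array([np.interp(grid, c[:, 0], c[:, 1]) for c in posterior_loess])
low_band, high_band = np.percentile(curves, [2.5, 97.5], axis=0)

plt.scatter(x, y)
plt.fill_between(grid, low_band, high_band, color='#BBBBBB', alpha=0.5)  # pointwise 95% band
plt.plot(original_loess[:, 0], original_loess[:, 1], color='black', linewidth=2)
plt.show()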