Logistic regression in Python using seaborn, statsmodels (GLM and Logit), and scikit-learn. The bootstrap method is demonstrated for estimating confidence intervals on the logistic regression coefficients and predictions.
Linear regression in Python using seaborn, statsmodels (OLS), and scikit-learn. The bootstrap method is demonstrated for estimating confidence intervals on the linear regression coefficients and predictions.
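A minimal sketch of the residual-bootstrap recipe used throughout this notebook: fit a model, resample its residuals with replacement, add them back to the fitted values, refit, and collect the refitted quantities. The toy data and the np.polyfit model below are illustrative assumptions only, not part of the analysis that follows.
import numpy as np
rng = np.random.RandomState(0)  # toy data for illustration (assumption)
x = np.linspace(0, 10, 50)
y = 1.0 + 0.5*x + rng.normal(scale=1.0, size=x.size)
b1, b0 = np.polyfit(x, y, 1)  # slope and intercept of the initial fit
fitted = b0 + b1*x
resid = y - fitted
slopes = []
for i in range(1000):
    # Residual bootstrap: resample residuals, rebuild the response, refit
    y_star = fitted + rng.choice(resid, size=resid.size, replace=True)
    slopes.append(np.polyfit(x, y_star, 1)[0])
slopes = np.asarray(slopes)
print('Slope 95% CI: {:.3f} to {:.3f}'.format(*np.percentile(slopes, [2.5, 97.5])))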
In [1]:
from __future__ import print_function
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
sns.set(style='darkgrid', font_scale=1.2)
%matplotlib inline
In [2]:
# load "tips" dataset from the seaborn library
tips = sns.load_dataset('tips')
tips.head()
Out[2]:
In [3]:
# Create a categorical variable for the regression analysis
tips["big_tip"] = (tips.tip / tips.total_bill) > .175
tips['big_tip'] = tips['big_tip'].values.astype(int)
tips.head()
Out[3]:
Seaborn regplot: https://web.stanford.edu/~mwaskom/software/seaborn/generated/seaborn.regplot.html#seaborn.regplot
In [4]:
# Seaborn regplot (logistic=True) <= uses GLM from statsmodels under the hood
fig, ax = plt.subplots(figsize=(10,6))
sns.regplot(x='total_bill', y='big_tip', data=tips, logistic=True, n_boot=500, y_jitter=.03, ax=ax)
plt.show()
In [5]:
# Extract the line data from the seaborn figure
xs, ys = ax.get_lines()[0].get_data()
fig, ax = plt.subplots(figsize=(10,6))
ax.plot(xs, ys, ls='-', c='seagreen', lw=3)
ax.set_xlabel('total_bill')
ax.set_ylabel('big_tip')
plt.show()
In [6]:
X = np.c_[np.ones(tips.shape[0]), tips['total_bill']]
y = tips['big_tip']
Statsmodels GLM: http://statsmodels.sourceforge.net/stable/glm.html
In [7]:
# Statsmodels Logistic Regression using GLM with Binomial family and Logit (default) link
model_sm = sm.GLM(y, X, family=sm.families.Binomial()) # notice the order of the endog and exog variables
res_sm = model_sm.fit()
res_sm.summary()
Out[7]:
In [8]:
# New data for the prediction
support = np.linspace(0, 60, 100)
xnew = np.c_[np.ones(support.size), support] # must be a 2D array
In [9]:
out_sm = res_sm.predict(xnew)
In [10]:
fig, ax = plt.subplots(2, 1, sharex=True, figsize=(10,6))
ax[0].plot(xs, ys, ls='-', c='seagreen', lw=3, label='seaborn')
ax[0].set_ylabel('big_tip')
ax[0].legend(loc='best')
ax[1].plot(support, out_sm, ls='-', c='blue', lw=3, label='Statsmodels GLM')
ax[1].set_xlabel('total_bill')
ax[1].set_ylabel('big_tip')
ax[1].legend(loc='best')
plt.tight_layout()
plt.show()
In [11]:
# Difference between curves
diff = np.sum(abs(ys-out_sm))
print('Total difference: {:g}'.format(diff))
Bootstrap wiki: https://en.wikipedia.org/wiki/Bootstrapping_%28statistics%29
In [12]:
# yhat values
yhat = res_sm.fittedvalues
# residuals
resid = res_sm.resid_response
alpha = 0.05 # 95% confidence interval
n_boot = 1000 # No. of bootstrap samples
const = []; x1 = []
const.append(res_sm.params[0]) # constant term
x1.append(res_sm.params[1]) # x1 value
# Bootstrap
for i in range(n_boot):
    resid_boot = np.random.choice(resid, size=len(resid), replace=True)
    yboot = yhat + resid_boot
    model_boot = sm.GLM(yboot, X, family=sm.families.Binomial())
    res_boot = model_boot.fit()
    const.append(res_boot.params[0])
    x1.append(res_boot.params[1])
# Confidence intervals
def ci(var):
    coef = np.asarray(var)
    c_mean = np.mean(coef)
    c_std = np.std(coef)
    ql = (alpha/2)*100.
    qh = (1 - alpha/2)*100.
    ci_low = np.percentile(coef, ql, interpolation='midpoint')
    ci_high = np.percentile(coef, qh, interpolation='midpoint')
    return c_mean, c_std, ci_low, ci_high
# Const
cm, cs, cl, ch = ci(const)
# Coeff
x1m, x1s, x1l, x1h = ci(x1)
print('Coefficients of the logistic regression (compare with output from GLM above):')
print('Const: Mean value: {:7.4f}, Std. error: {:7.4f}, 95% Conf. Int.: {:7.4f} to {:7.4f}'.format(cm, cs, cl, ch))
print('x1: Mean value: {:7.4f}, Std. error: {:7.4f}, 95% Conf. Int.: {:7.4f} to {:7.4f}'.format(x1m, x1s, x1l, x1h))
Statsmodels Logit: http://statsmodels.sourceforge.net/stable/generated/statsmodels.discrete.discrete_model.Logit.html
In [13]:
# Statsmodels Logit function
model_logit = sm.Logit(y, X) # notice the order of the endog and exog variables
res_logit = model_logit.fit()
In [14]:
res_logit.summary()
Out[14]:
In [15]:
out_logit = res_logit.predict(xnew)
In [16]:
fig, ax = plt.subplots(2, 1, sharex=True, figsize=(10,6))
ax[0].plot(xs, ys, ls='-', c='seagreen', lw=3, label='seaborn')
ax[0].set_ylabel('big_tip')
ax[0].legend(loc='best')
ax[1].plot(support, out_logit, ls='-', c='blue', lw=3, label='Logit')
ax[1].set_xlabel('total_bill')
ax[1].set_ylabel('big_tip')
ax[1].legend(loc='best')
plt.tight_layout()
plt.show()
In [17]:
# Difference between curves
diff = np.sum(abs(ys-out_logit))
print('Total difference: {:g}'.format(diff))
Scikit-learn LogisticRegression: http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
In [18]:
# Scikit-learn Logistic Regression
model_sk = LogisticRegression(C=1e6, fit_intercept=False) # large C effectively disables regularization; constant term is already in exog
res_sk = model_sk.fit(X, y) # notice the order of the endog and exog variables
res_sk.coef_
Out[18]:
In [19]:
out_sk = res_sk.predict_proba(xnew) # returns a 2D array of class probabilities; the second column is P(big_tip=1)
In [20]:
fig, ax = plt.subplots(2, 1, sharex=True, figsize=(10,6))
ax[0].plot(xs, ys, ls='-', c='seagreen', lw=3, label='seaborn')
ax[0].set_ylabel('big_tip')
ax[0].legend(loc='best')
ax[1].plot(support, out_sk[:,1], ls='-', c='blue', lw=3, label='scikit-learn')
ax[1].set_xlabel('total_bill')
ax[1].set_ylabel('big_tip')
ax[1].legend(loc='best')
plt.tight_layout()
plt.show()
In [21]:
# Difference between curves
diff = np.sum(abs(ys-out_sk[:,1]))
print('Total difference: {:g}'.format(diff))
In [22]:
alpha = 0.05 # 95% confidence interval
n_boot = 1000 # No. of bootstrap samples
y_hat = res_sm.fittedvalues # for GLM, fittedvalues are the fitted mean values (predicted probabilities)
residuals = res_sm.resid_response # residuals = endog - fittedvalues
values = []
# Bootstrap
for i in range(n_boot):
    resid_boot = np.random.choice(residuals, size=len(residuals), replace=True)
    yboot = y_hat + resid_boot
    model_boot = sm.GLM(yboot, X, family=sm.families.Binomial())
    res_boot = model_boot.fit()
    # Prediction values
    out_boot = res_boot.predict(xnew)
    values.append(out_boot)
values = np.asarray(values)
# Means and standard deviations of predicted values
means = np.mean(values, axis=0)
stds = np.std(values, axis=0)
ql = (alpha/2)*100.
qh = (1 - alpha/2)*100.
ci_lows = np.percentile(values, ql, axis=0, interpolation='midpoint')
ci_higs = np.percentile(values, qh, axis=0, interpolation='midpoint')
In [23]:
fig, ax = plt.subplots(figsize=(10,6))
ax.plot(xs, ys, ls='-', c='red', lw=3, label='from seaborn')
ax.plot(support, means, c='blue', ls='-', lw=3, label='from bootstrap')
ax.plot(support, ci_lows, c='blue', ls='--', lw=1, label='CI lower')
ax.plot(support, ci_higs, c='blue', ls='-.', lw=1, label='CI upper')
ax.fill_between(support, ci_lows, ci_higs, facecolor='lightblue', interpolate=True, alpha=0.5)
ax.legend(loc='best')
ax.set_xlabel('total_bill')
ax.set_ylabel('big_tip')
plt.show()
In [24]:
# Jitter data in scatter plot
def jitter(arr, size=0.01):
    stdev = size*(max(arr)-min(arr))
    return arr + np.random.randn(len(arr)) * stdev
# Comparing bootstrap confidence intervals with seaborn
fig, ax = plt.subplots(1, 2, sharey=True, figsize=(10,6))
sns.regplot(x='total_bill', y='big_tip', data=tips, logistic=True, n_boot=500, y_jitter=.03, ax=ax[0])
ax[1].scatter(tips['total_bill'], jitter(tips['big_tip'], size=0.02), color='blue', alpha=0.5)
ax[1].plot(support, means, c='blue', ls='-', lw=3)
ax[1].fill_between(support, ci_lows, ci_higs, facecolor='lightblue', edgecolor='lightblue', interpolate=True, alpha=0.5)
ax[1].set_xlabel('total_bill')
ax[1].set_xlim(0,60)
plt.tight_layout()
plt.show()
Seaborn regplot: https://web.stanford.edu/~mwaskom/software/seaborn/generated/seaborn.regplot.html#seaborn.regplot
In [25]:
# Seaborn regplot
fig, ax = plt.subplots(figsize=(10,6))
sns.regplot(x='total_bill', y='tip', data=tips, ax=ax)
plt.show()
In [26]:
# Extract the line data from the seaborn figure
xs, ys = ax.get_lines()[0].get_data()
In [27]:
X = np.c_[np.ones(tips.shape[0]), tips['total_bill']]
y = tips['tip']
Statsmodels OLS: http://statsmodels.sourceforge.net/devel/generated/statsmodels.regression.linear_model.OLS.html#statsmodels.regression.linear_model.OLS
In [28]:
# Statsmodels Linear Regression using OLS
model_ols = sm.OLS(y, X) # notice the order of the endog and exog variables
res_ols = model_ols.fit()
res_ols.summary()
Out[28]:
In [29]:
# New data for the prediction
support = np.linspace(0, 60, 100)
xnew = np.c_[np.ones(support.size), support] # must be a 2D array
In [30]:
out_ols = res_ols.predict(xnew)
In [31]:
# Difference between curves
diff = np.sum(abs(ys-out_ols))
print('Total difference: {:g}'.format(diff))
Bootstrap wiki: https://en.wikipedia.org/wiki/Bootstrapping_%28statistics%29
Demonstration of using the bootstrap to compute confidence intervals on predicted values from a linear regression fitted with OLS from statsmodels.
In [32]:
alpha = 0.05 # 95% confidence interval
n_boot = 1000 # No. of bootstrap samples
y_hat = res_ols.fittedvalues # fittedvalues = np.dot(exog, params)
residuals = res_ols.resid # residuals = endog - fittedvalues
values = []
# Bootstrap
for i in range(n_boot):
    resid_boot = np.random.choice(residuals, size=len(residuals), replace=True)
    yboot = y_hat + resid_boot
    model_boot = sm.OLS(yboot, X)
    res_boot = model_boot.fit()
    # Prediction values
    out_boot = res_boot.predict(xnew)
    values.append(out_boot)
values = np.asarray(values)
# Means and standard deviations of predicted values
means = np.mean(values, axis=0)
stds = np.std(values, axis=0)
ql = (alpha/2)*100.
qh = (1 - alpha/2)*100.
ci_lows = np.percentile(values, ql, axis=0, interpolation='midpoint')
ci_higs = np.percentile(values, qh, axis=0, interpolation='midpoint')
In [33]:
fig, ax = plt.subplots(figsize=(10,6))
ax.plot(xs, ys, ls='-', c='red', lw=3, label='from seaborn')
ax.plot(support, means, c='blue', ls='-', lw=3, label='from bootstrap')
ax.plot(support, ci_lows, c='blue', ls='--', lw=1, label='CI lower')
ax.plot(support, ci_higs, c='blue', ls='-.', lw=1, label='CI upper')
ax.fill_between(support, ci_lows, ci_higs, facecolor='lightblue', interpolate=True, alpha=0.5)
ax.legend(loc='best')
ax.set_xlabel('total_bill')
ax.set_ylabel('tip')
plt.show()
In [34]:
# Comparing bootstrap confidence intervals with seaborn
fig, ax = plt.subplots(1, 2, sharey=True, figsize=(10,6))
sns.regplot(x='total_bill', y='tip', data=tips, ax=ax[0])
ax[1].scatter(tips['total_bill'], tips['tip'], color='blue', alpha=0.5)
ax[1].plot(support, means, c='blue', ls='-', lw=2)
ax[1].fill_between(support, ci_lows, ci_higs, facecolor='lightblue', edgecolor='lightblue', interpolate=True, alpha=0.5)
ax[1].set_xlabel('total_bill')
ax[1].set_xlim(0,60)
plt.tight_layout()
plt.show()
Ordinary Least Squares: http://statsmodels.sourceforge.net/devel/examples/notebooks/generated/ols.html
In [35]:
from statsmodels.sandbox.regression.predstd import wls_prediction_std
# Computing prediction intervals from OLS regression
prstd, iv_l, iv_u = wls_prediction_std(res_ols, exog=xnew, alpha=0.05) # notice the exog parameter
In [36]:
fig, ax = plt.subplots(figsize=(10,6))
ax.plot(support, means, c='blue', ls='-', lw=3, label='OLS')
ax.plot(support, ci_lows, c='blue', ls='--', lw=1, label='lower confidence limit')
ax.plot(support, ci_higs, c='blue', ls='-.', lw=1, label='upper confidence limit')
ax.fill_between(support, ci_lows, ci_higs, facecolor='lightblue', interpolate=True, alpha=0.5)
ax.plot(support, iv_l, c='green', ls='--', lw=2, label='lower prediction limit')
ax.plot(support, iv_u, c='green', ls='-.', lw=2, label='upper prediction limit')
ax.legend(loc='best')
ax.set_xlabel('total_bill')
ax.set_ylabel('tip')
plt.show()
Scikit-learn LinearRegression: http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
In [37]:
# Scikit-learn Linear Regression
model_skl = LinearRegression(fit_intercept=False) # constant term is already in exog
res_skl = model_skl.fit(X, y) # notice the order of the endog and exog variables
In [38]:
res_skl.coef_
Out[38]:
In [39]:
# Predictions
out_skl = res_skl.predict(xnew)
In [40]:
fig, ax = plt.subplots(figsize=(10,6))
ax.plot(xs, ys, ls='-', c='red', lw=2, label='from seaborn')
ax.plot(support, out_ols, c='green', ls='-', lw=2, label='from statsmodels')
ax.plot(support, out_skl, c='blue', ls='-', lw=2, label='from scikit-learn')
ax.legend(loc='best')
ax.set_xlabel('total_bill')
ax.set_ylabel('tip')
plt.show()
In [41]:
# Difference between curves (seaborn vs scikit-learn)
diff = np.sum(abs(ys-out_skl))
print('Total difference: {:g}'.format(diff))
Stackoverflow: https://stackoverflow.com/questions/17559408/confidence-and-prediction-intervals-with-statsmodels
In [42]:
from statsmodels.stats.outliers_influence import summary_table
# Simple example
x = np.linspace(0, 10, 100)
e = np.random.normal(size=100)
y = 1 + 0.5*x + 2*e
X = sm.add_constant(x)
# OLS model
re = sm.OLS(y, X).fit()
# Summary table
st, data, ss2 = summary_table(re, alpha=0.05)
print(ss2) # column names
# Fitted values
fittedvalues = data[:,2]
# Confidence interval
predict_mean_ci_low, predict_mean_ci_upp = data[:,4:6].T
# Prediction interval
predict_ci_low, predict_ci_upp = data[:,6:8].T
In [43]:
fig, ax = plt.subplots(figsize=(10,6))
# Scatter
ax.scatter(x, y, color='blue', alpha=0.5)
# Prediction interval
ax.plot(x, predict_ci_low, c='red', ls='--', lw=2, label='Predict low')
ax.plot(x, predict_ci_upp, c='red', ls='-.', lw=2, label='Predict high')
ax.fill_between(x, predict_ci_low, predict_ci_upp, facecolor='#FF3333', interpolate=True, alpha=0.2)
# Confidence interval
ax.plot(x, predict_mean_ci_low, c='blue', ls='--', lw=2, label='CI low')
ax.plot(x, predict_mean_ci_upp, c='blue', ls='-.', lw=2, label='CI high')
ax.fill_between(x, predict_mean_ci_low, predict_mean_ci_upp, facecolor='lightblue', interpolate=True, alpha=0.8)
# OLS regression line
ax.plot(x, fittedvalues, c='seagreen', ls='-', lw=2, label='OLS')
ax.legend(loc='best')
ax.set_xlim(0, 10)
ax.set_xlabel('x')
ax.set_ylabel('y')
plt.show()
For the linear model:
$y_i = \beta_0 + \beta_1 x_i + \epsilon_i$
the $100(1-\alpha)$ % confidence intervals for $\beta_0$ and $\beta_1$ are given by:
$\hat \beta_0 \pm t_{n-2,\alpha/2} \cdot s_{\hat \beta_0}$
$\hat \beta_1 \pm t_{n-2,\alpha/2} \cdot s_{\hat \beta_1}$
with:
$\hat \beta_1 = {{\sum_{i=1}^n{x_iy_i} - n \bar x \bar y} \over {\sum_{i=1}^n{x_i^2}-n \bar x^2}}$
$\hat \beta_0 = \bar y - \hat \beta_1 \bar x$
and
$s_{\hat \beta_0} = s \cdot \sqrt{{1\over n} + {\bar x^2 \over \sum_{i=1}^n{(x_i-\bar x)^2}}}$
$s_{\hat \beta_1} = {s \over \sqrt{\sum_{i=1}^n{(x_i-\bar x)^2}}}$
$s = \sqrt{{(1-r^2)\sum_{i=1}^n{(y_i-\bar y)^2}}\over{n-2}}$
where:
$n$ - number of samples;
$\bar x$, $\bar y$ - mean values of $x$ and $y$, respectively;
$r$ - coefficient of correlation between $x$ and $y$;
$t_{n-2,\alpha/2}$ - point on the Student's t distribution with $n-2$ degrees of freedom that cuts off an area of $\alpha/2$ in the right-hand tail.
This means that the confidence intervals for $\beta_0$ and $\beta_1$ can be derived in exactly the same way as the Student's t based confidence interval for a population mean.
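As a quick illustration of that analogy, here is a minimal sketch of the Student's t confidence interval for a population mean; the toy sample below is an assumption for illustration only.
import numpy as np
from scipy import stats
sample = np.random.normal(loc=5.0, scale=2.0, size=30)  # toy sample (assumption)
n_s = len(sample)
m = np.mean(sample)
se = np.std(sample, ddof=1)/np.sqrt(n_s)  # standard error of the mean
t_crit = stats.t.ppf(1 - 0.05/2, n_s - 1)  # two-sided 95% critical value
print('95% CI for the mean: {:.4f} to {:.4f}'.format(m - t_crit*se, m + t_crit*se))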
A level $100(1-\alpha)$ % confidence interval for the quantity $\beta_0 + \beta_1 x$ is given by:
$\hat \beta_0 + \hat \beta_1 x \pm t_{n-2,\alpha/2} \cdot s_{\hat y}$
where:
$s_{\hat y} = s \cdot \sqrt{{1\over n} + {(x-\bar x)^2 \over \sum_{i=1}^n{(x_i-\bar x)^2}}}$
A level $100(1-\alpha)$ % prediction interval for the quantity $\beta_0 + \beta_1 x$ is given by:
$\hat \beta_0 + \hat \beta_1 x \pm t_{n-2,\alpha/2} \cdot s_{pred}$
where:
$s_{pred} = s \cdot \sqrt{1 + {1\over n} + {(x-\bar x)^2 \over \sum_{i=1}^n{(x_i-\bar x)^2}}}$
In [44]:
# Example of using Student's t distribution for computing
# confidence levels on linear regression coefficients
n = len(x)
alpha = 0.05 # 95% confidence level
xbar = np.mean(x)
ybar = np.mean(y)
b1 = (np.sum(x*y)-n*xbar*ybar)/(np.sum(x**2)-n*xbar**2) # or b1 = re.params[1] # from OLS
b0 = ybar - b1*xbar # or b0 = re.params[0] # from OLS
# Compute correlation coefficient
# yhat = re.fittedvalues # or yhat = b0 + b1*x
# r = np.sqrt(((np.sum((y-ybar)**2))-(np.sum((y-yhat)**2)))/(np.sum((y-ybar)**2)))
r = np.corrcoef(x,y)[0,1]
s = np.sqrt(((1.-r**2) * np.sum((y-ybar)**2))/(n-2.))
s_b0 = s*np.sqrt((1./n) + xbar**2/(np.sum((x-xbar)**2)))
s_b1 = s/(np.sqrt(np.sum((x-xbar)**2)))
q = 1. - alpha/2
t = stats.t.ppf(q, n-2)
b0_low = b0 - t*s_b0
b0_high = b0 + t*s_b0
b1_low = b1 - t*s_b1
b1_high = b1 + t*s_b1
print('Const: Mean {:7.4f}, 95% CI {:7.4f} to {:7.4f}'.format(b0, b0_low, b0_high))
print(' x1: Mean {:7.4f}, 95% CI {:7.4f} to {:7.4f}'.format(b1, b1_low, b1_high))
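Continuing from the cell above (reusing n, x, xbar, s, b0, b1 and t), here is a sketch of the confidence and prediction intervals for the mean response at a single point, following the formulas for $s_{\hat y}$ and $s_{pred}$ given earlier; the evaluation point x0 is an arbitrary illustrative choice.
# Confidence and prediction intervals for the mean response at a single point x0
x0 = 5.0  # arbitrary evaluation point (assumption)
sxx = np.sum((x - xbar)**2)
y0 = b0 + b1*x0  # estimated mean response at x0
s_yhat = s*np.sqrt(1./n + (x0 - xbar)**2/sxx)  # std. error of the mean response
s_pred = s*np.sqrt(1. + 1./n + (x0 - xbar)**2/sxx)  # std. error of a new observation
print('Mean response at x0 = {:.1f}: {:7.4f}'.format(x0, y0))
print('95% Conf. Int.: {:7.4f} to {:7.4f}'.format(y0 - t*s_yhat, y0 + t*s_yhat))
print('95% Pred. Int.: {:7.4f} to {:7.4f}'.format(y0 - t*s_pred, y0 + t*s_pred))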
In [45]:
re.summary()
Out[45]: