In [1]:
%matplotlib inline
In [2]:
import numpy as np
import scipy.stats as stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sbn
In [3]:
np.random.seed(20160418)
In [4]:
npts = 25
X = np.linspace(0, 5, npts) + stats.norm.rvs(loc=0, scale=1, size=npts)
a = 1.0
b = 1.5
Y = b*X + a + stats.norm.rvs(loc=0, scale=2, size=npts)
In [5]:
g = sbn.jointplot(X, Y)
g.set_axis_labels("X", "Y")
pass
What if we wanted to estimate the linear function that relates $Y$ to $X$?
Linear functions are those whose graphs are straight lines. A linear function of a variable $x$ is usually written as:
$$ \hat{y} = f(x) = bx + a $$
where $a$ and $b$ are constants. In geometric terms, $b$ is the slope of the line and $a$ is the value of the function when $x$ is zero (usually referred to as the "y-intercept").
There are infinitely many such linear functions of $x$ we could define. Which line should we use if we want to be able to predict $y$?
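Before turning to the built-in routine below, it helps to see the criterion the least-squares line optimizes. The next cell is a small sketch (reusing the simulated X and Y from above, with arbitrarily chosen candidate slopes and intercepts): for any candidate line we can compute the sum of squared vertical deviations between the observed $y$ values and the line's predictions; the least-squares regression line is the one that minimizes this quantity.
In [ ]:
# sum of squared deviations between observed Y and a candidate line b*X + a
def sum_sq_deviations(X, Y, b, a):
    return np.sum((Y - (b*X + a))**2)

# a few arbitrary candidate (slope, intercept) pairs, for illustration only
for b_cand, a_cand in [(1.0, 0.5), (1.5, 1.0), (2.0, 2.0)]:
    print("b =", b_cand, " a =", a_cand,
          " sum of squared deviations =", sum_sq_deviations(X, Y, b_cand, a_cand))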
In [6]:
# calculate regression using built-in scipy.stats.linregress
# we'll show the underlying calculations later in the notebook
rgr = stats.linregress(X,Y)
b = rgr.slope
a = rgr.intercept
# plot scatter
plt.scatter(X, Y, color='steelblue', s=30)
# plot regression line
plt.plot(X, b*X + a, color='indianred', alpha=0.5)
# plot lines from regression to actual value
for (x, y) in zip(X, Y):
    plt.vlines(x, y, b*x + a, linestyle='dashed', color='indianred')
plt.xlabel("X")
plt.ylabel("Y")
plt.gcf().set_size_inches(6,6)
plt.title("Linear Least-Squares Regression\nminimizes the sum of squared deviates",fontsize=14)
pass
In [7]:
# here's a simple function to calculate the least squares regression of Y on X
def lsqr_regression(X, Y):
    # slope = covariance(X, Y) / variance(X)
    covxy = np.cov(X, Y, ddof=1)[0, 1]
    varx = np.var(X, ddof=1)
    b = covxy/varx
    # intercept: the fitted line passes through (mean(X), mean(Y))
    a = np.mean(Y) - b * np.mean(X)
    return b, a

def plot_regression_line(X, Y, b, a, axis, **kw):
    # draw the fitted line across the observed range of X
    minx = min(X)
    maxx = max(X)
    yhatmin = b*minx + a
    yhatmax = b*maxx + a
    axis.plot((minx, maxx), (yhatmin, yhatmax), marker=None, **kw)
b, a = lsqr_regression(X, Y)
fig, ax = plt.subplots(figsize=(6,6))
ax.scatter(X, Y, color='steelblue')
plot_regression_line(X, Y, b, a, ax, color='indianred', alpha=0.9)
ax.set_xlabel("X")
ax.set_ylabel("Y")
ax.set_title("Regression of Y on X",fontsize=14)
print("Estimated slope, b:", b)
print("Estimated intercept, a:", a)
pass
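As a quick sanity check (a minimal sketch, assuming the lsqr_regression function and the X and Y arrays defined above), the hand-rolled estimates should agree with those returned by scipy.stats.linregress.
In [ ]:
# confirm the hand-rolled estimates match scipy.stats.linregress
check = stats.linregress(X, Y)
print("Slopes agree:", np.allclose(b, check.slope))
print("Intercepts agree:", np.allclose(a, check.intercept))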
When the linear regression model is appropriate, the residuals should be centered around zero and should show no strong trends or extreme differences in spread for different values of $x$.
In [10]:
# yhat is the predicted values of y from x
Yhat = b * X + a
# residuals are the differences between the actual and predicted values
residuals = Y - Yhat
In [11]:
plt.scatter(X, residuals, color='steelblue')
plt.hlines(0, min(X)*0.99, max(X)*1.01,color='indianred')
plt.xlabel("X")
plt.ylabel("Residuals")
pass
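Beyond the visual check, two simple summaries (a sketch using the residuals computed above) make the same point: for a least-squares fit the residuals average to essentially zero, and they are uncorrelated with $X$, so there should be no linear trend left in the residual plot.
In [ ]:
# the mean of the residuals should be approximately zero
print("Mean of residuals:", np.mean(residuals))
# the residuals should be (essentially) uncorrelated with X
print("Correlation of X and residuals:", np.corrcoef(X, residuals)[0, 1])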
In [12]:
ssy = np.sum((Y - np.mean(Y))**2)
ssyhat = np.sum((Yhat - np.mean(Yhat))**2)
ssresiduals = np.sum((residuals - np.mean(residuals))**2)
print("SSTotal:", ssy)
print("SSYhat:", ssyhat)
print("SSResiduals:", ssresiduals)
print("SSYhat + SSresiduals:", ssyhat + ssresiduals)
In the same way we did for ANOVA, we can use the sum-of-square decomposition to understand the relative proportion of variance "explained" (accounted for) by the regression model.
We call this quantity the "Coefficient of Determination", and it's designated $R^2$.
$$ R^2 = \left( 1 - \frac{SS_{residuals}}{SS_{total}} \right) $$
In [8]:
regr = stats.linregress(X, Y)
regr
Out[8]:
In [9]:
print("Regression slope: ", regr.slope)
print("Regression intercept: ", regr.intercept)
print("P-value for the null hypothesis that slope = 0:", regr.pvalue)
print("Coefficient of determination, R^2:", regr.rvalue**2)
In [13]:
R2 = (1 - (ssresiduals/ssy))
R2
Out[13]:
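Because $SS_{total} = SS_{\hat{y}} + SS_{residuals}$, the same value can be computed as $SS_{\hat{y}}/SS_{total}$, and it also equals the squared correlation between $X$ and $Y$ (which is why squaring rvalue above gives $R^2$). A quick numerical check using the quantities computed earlier:
In [ ]:
# equivalent ways of arriving at the coefficient of determination
print("SSYhat/SSTotal:", ssyhat/ssy)
print("Squared correlation of X and Y:", np.corrcoef(X, Y)[0, 1]**2)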
In [14]:
npts = 25
x = np.linspace(0,5,npts) + stats.norm.rvs(loc=0, scale=2, size=npts)
A = 1
B = 1.5
slopes = []
intercepts = []
yhat = []
nsims = 1000
for i in range(nsims):
    y = B*x + A + stats.norm.rvs(loc=0, scale=2, size=npts)
    # slope estimate: correlation times the ratio of standard deviations
    b = np.corrcoef(x, y)[0, 1] * (np.std(y, ddof=1)/np.std(x, ddof=1))
    a = np.mean(y) - b * np.mean(x)
    yhat.append(b * x + a)
    slopes.append(b)
    intercepts.append(a)
In [15]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(10,4))
sbn.distplot(slopes, ax=ax1)
sbn.distplot(intercepts, ax=ax2)
ax1.set_title('Sampling Distribution\nof Regression Slope, b')
ax2.set_title('Sampling Distribution\nof Regression Intercept, a')
pass
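The spread of these simulated sampling distributions approximates the standard errors of the slope and intercept estimates. A short summary (a sketch using the slopes and intercepts lists built above):
In [ ]:
# means and standard deviations of the simulated sampling distributions
# the standard deviations approximate the standard errors of the estimates
print("Mean of slopes:", np.mean(slopes), " SD of slopes:", np.std(slopes, ddof=1))
print("Mean of intercepts:", np.mean(intercepts), " SD of intercepts:", np.std(intercepts, ddof=1))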
In [16]:
for r in yhat:
    plt.plot(x, r, alpha=0.0075, color='steelblue', marker=None, linewidth=1)
plt.plot(x, B*x + A, color='indianred', marker=None, linewidth=3)
pass
In [17]:
b_ci95low = np.percentile(slopes, 2.5)
b_ci95hi = np.percentile(slopes, 97.5)
a_ci95low = np.percentile(intercepts, 2.5)
a_ci95hi = np.percentile(intercepts, 97.5)
print("95% CI for slope of regression:", (b_ci95low, b_ci95hi))
print("95% CI for intercept of regression:", (a_ci95low, a_ci95hi))
In [18]:
rp = sbn.regplot(X, Y)
pass
In [19]:
url = "http://roybatty.org/iris.csv"
iris = pd.read_csv(url)
iris.columns = iris.columns.str.replace('.', '_', regex=False)
In [20]:
iris.head()
Out[20]:
In [21]:
setosa = iris.query('Species == "setosa"')
setosa.shape
Out[21]:
In [22]:
g = sbn.jointplot(setosa.Sepal_Length, setosa.Sepal_Width, space=0.15,)
g.set_axis_labels("Sepal Length", "Sepal Width")
plt.subplots_adjust(top=0.85) # adjust top of subplots relative to figure so there is room for title
g.fig.suptitle("Bivariate Distribution of\nSepal Length and Sepal Width\nIris setosa specimens", fontsize=14)
pass
In [23]:
rgr = stats.linregress(setosa.Sepal_Length, setosa.Sepal_Width)
print("Regression slope:", rgr.slope)
print("Regression intercept:", rgr.intercept)
print("Regression R:", rgr.rvalue)
In [24]:
sbn.regplot(setosa.Sepal_Length, setosa.Sepal_Width)
plt.xlabel("Sepal Length")
plt.ylabel("Sepal Width")
plt.title("Linear Least-Squares Regression\nminimizes the sum of squared deviation")
pass
In [25]:
sbn.jointplot(setosa.Sepal_Length, setosa.Sepal_Width, kind="reg")
pass
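As with the simulated data earlier, it is worth looking at the residuals for the setosa regression. The cell below is a sketch that reuses the rgr fit from above to repeat the residual plot for these data.
In [ ]:
# residuals for the regression of Sepal_Width on Sepal_Length
setosa_yhat = rgr.slope * setosa.Sepal_Length + rgr.intercept
setosa_resid = setosa.Sepal_Width - setosa_yhat
plt.scatter(setosa.Sepal_Length, setosa_resid, color='steelblue')
plt.hlines(0, setosa.Sepal_Length.min(), setosa.Sepal_Length.max(), color='indianred')
plt.xlabel("Sepal Length")
plt.ylabel("Residuals")
pass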