In [2]:
%matplotlib inline
In [1]:
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sbn
import statsmodels.api as sm
import statsmodels.stats.anova as anova
# the formula API allows us to write R-like formulas for regression models
import statsmodels.formula.api as smf
In [3]:
np.random.seed(20160425)
sbn.set_style("white")
In [4]:
n = 25
x = np.linspace(-5, 5, n) + stats.norm.rvs(loc=0, scale=1, size=n)
a, b = 1, 0.75
# I've chosen values to make yind and ydep have about the same variance
yind = a + stats.norm.rvs(loc=0, scale=np.sqrt(8), size=n)
ydep = a + b*x + stats.norm.rvs(loc=0, scale=1, size=n)
# create two different data frames for ease of use with statsmodels
data_ind = pd.DataFrame(dict(x = x, y = yind))
data_dep = pd.DataFrame(dict(x = x, y = ydep))
And we plot the data sets
In [5]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(10,4), sharex=True, sharey=True)
ax1.scatter(x, yind, s=60, alpha=0.75, color='steelblue')
ax2.scatter(x, ydep, s=60, alpha=0.75, color='steelblue')
ax1.set_xlabel("X",fontsize=15)
ax1.set_ylabel("Y",fontsize=15)
ax1.set_title("Y independent of X", fontsize=18)
ax2.set_xlabel("X",fontsize=15)
ax2.set_title("Y dependent on X", fontsize=18)
pass
In [6]:
fit_ind = smf.ols('y ~ x', data_ind).fit()
fit_dep = smf.ols('y ~ x', data_dep).fit()
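As a quick optional check (an added aside), the estimated coefficients for the dependent case should land near the true values $a = 1$ and $b = 0.75$, while the estimated slope for the independent case should be near zero:
In [ ]:
# added check: estimated intercept and slope vs. the true a = 1, b = 0.75
print(fit_ind.params)  # slope should be near 0
print(fit_dep.params)  # intercept near 1, slope near 0.75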
In [7]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(10,4), sharex=True, sharey=True)
ax1.scatter(x, yind, s=60, alpha=0.75, color='steelblue')
ax1.plot(x, fit_ind.predict(), color='firebrick', alpha=0.5)
ax2.scatter(x, ydep, s=60, alpha=0.75, color='steelblue')
ax2.plot(x, fit_dep.predict(), color='firebrick', alpha=0.5)
ax1.set_xlabel("X",fontsize=15)
ax1.set_ylabel("Y",fontsize=15)
ax1.set_title("Y independent of X", fontsize=18)
ax2.set_xlabel("X",fontsize=15)
ax2.set_title("Y dependent on X", fontsize=18)
pass
In [8]:
def sum_squares(x):
    x = np.asarray(x)
    return np.sum((x - np.mean(x))**2)

def bivariate_regression_table(fit):
    """A function to create an ANOVA-like table for a bivariate regression"""
    df_model = fit.df_model
    df_resid = fit.df_resid
    df_total = df_model + df_resid
    SStotal = sum_squares(fit.model.endog)
    SSmodel = sum_squares(fit.predict())
    SSresid = sum_squares(fit.resid)
    MSmodel = SSmodel / df_model
    MSresid = SSresid / df_resid
    Fstat = MSmodel / MSresid
    Pval = stats.f.sf(Fstat, df_model, df_resid)
    Ftable = pd.DataFrame(index=["Model", "Residuals", "Total"],
                          columns=["df", "SS", "MS", "F", "Pval"],
                          data=dict(df=[df_model, df_resid, df_total],
                                    SS=[SSmodel, SSresid, SStotal],
                                    MS=[MSmodel, MSresid, ""],
                                    F=[Fstat, "", ""],
                                    Pval=[Pval, "", ""]))
    return Ftable
In [9]:
Ftable_ind = bivariate_regression_table(fit_ind)
Ftable_ind
Out[9]:
In [10]:
Ftable_dep = bivariate_regression_table(fit_dep)
Ftable_dep
Out[10]:
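As a sanity check (an added verification, not in the original analysis), the F statistics and p-values in the hand-built tables should match the ones statsmodels computes directly:
In [ ]:
# added check: compare the hand-computed values against statsmodels' own attributes
print(fit_ind.fvalue, fit_ind.f_pvalue)
print(fit_dep.fvalue, fit_dep.f_pvalue)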
To set up ANOVA for two groups as a regression problem, we use "dummy coding", where we incorporate the group information into a predictor variable.
In [11]:
Y1 = stats.norm.rvs(loc=0, scale=1, size=10)
Y2 = stats.norm.rvs(loc=1, scale=1, size=10)
Y = np.concatenate([Y1,Y2])
groups = [-1]*10 + [1]*10  # set up a dummy variable to represent group membership
data = pd.DataFrame(dict(Y=Y, group=groups))
In [12]:
data.head(3)
Out[12]:
In [13]:
data.tail(3)
Out[13]:
In [14]:
data.corr()
Out[14]:
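As an added note, for a bivariate regression the squared correlation is tied directly to the F statistic via $F = \frac{r^2}{1 - r^2} \, df_{resid}$, so the correlation above already anticipates the test we carry out below:
In [ ]:
# added check: F = r**2/(1 - r**2) * df_resid, with df_resid = n - 2 = 18 here
r = data.corr().loc['Y', 'group']
print(r**2 / (1 - r**2) * 18)  # should match fit_data.fvalue computed below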
In [34]:
sbn.stripplot(x="group", y="Y", hue="group", data=data, s=10)
pass
Fit regression model
In [16]:
fit_data = smf.ols('Y ~ group', data).fit()
Plot regression
In [17]:
plt.scatter(data.group[data.group == -1], data.Y[data.group == -1], s=60, alpha=0.75, color='steelblue')
plt.scatter(data.group[data.group == 1], data.Y[data.group == 1], s=60, alpha=0.75, color='forestgreen')
groups = [-1,1]
predicted = fit_data.predict(dict(group=groups))
plt.plot(groups, predicted, color='firebrick', alpha=0.75)
plt.xticks([-1,1])
plt.xlabel("Group",fontsize=15)
plt.ylabel("Y", fontsize=15)
pass
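With the $-1/+1$ coding and balanced groups, the fitted intercept is the grand mean and the slope is half the difference between the two group means. A quick check (an added aside):
In [ ]:
# added check: intercept = grand mean, slope = half the difference in group means
print(fit_data.params)
print(data.Y.mean(),
      (data.Y[data.group == 1].mean() - data.Y[data.group == -1].mean()) / 2)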
Compare the regression F-statistic and corresponding p-value to those from a one-way ANOVA
In [18]:
fit_data.fvalue, fit_data.f_pvalue
Out[18]:
In [19]:
stats.f_oneway(data.Y[data.group == -1], data.Y[data.group == 1])
Out[19]:
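As an aside (added here for context), with two groups the one-way ANOVA is equivalent to the pooled-variance two-sample t-test: the F statistic is the square of the t statistic, and the p-values agree.
In [ ]:
# added check: for two groups, F from the one-way ANOVA equals t**2 from the equal-variance t-test
t, p = stats.ttest_ind(data.Y[data.group == -1], data.Y[data.group == 1])
print(t**2, p)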
When we want to consider multiple groups, we extend the idea of dummy coding. We can set up simple dummy variables for $g-1$ of the groups, or we can use a slight variant called "effect coding". Results from effect coding are usually easier to interpret, and that's what we'll use here.
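To make the coding concrete, here is the effect coding for three groups, with the third group serving as the reference (an added illustration that mirrors the iris example below):

group         effect1   effect2
setosa           1         0
versicolor       0         1
virginica       -1        -1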
In [4]:
iris = pd.read_csv("http://roybatty.org/iris.csv")
iris.Species.unique()
Out[4]:
In [5]:
# remove the '.' from column names like "Sepal.Length" (regex=False so '.' is treated literally)
iris.columns = iris.columns.str.replace('.', '', regex=False)
iris.columns
Out[5]:
In [8]:
sbn.violinplot(x="Species", y="SepalLength", data=iris)
Out[8]:
In [9]:
effect1 = []
effect2 = []
for s in iris.Species:
    if s == 'setosa':
        effect1.append(1)
        effect2.append(0)
    elif s == 'versicolor':
        effect1.append(0)
        effect2.append(1)
    else:
        effect1.append(-1)
        effect2.append(-1)
In [10]:
print(effect1)
In [12]:
print(effect2)
In [24]:
# add effect variables to iris data frame
iris.effect1 = effect1
iris.effect2 = effect2
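A quick way to confirm each species got the intended pair of codes (an optional added check):
In [ ]:
# each species should map to a single (effect1, effect2) pair
iris.groupby('Species')[['effect1', 'effect2']].first()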
In [25]:
iris_fit = smf.ols("SepalLength ~ effect1 + effect2", iris).fit()
In [26]:
iris_fit.fvalue, iris_fit.f_pvalue
Out[26]:
In [27]:
stats.f_oneway(iris.SepalLength[iris.Species == "setosa"],
               iris.SepalLength[iris.Species == "versicolor"],
               iris.SepalLength[iris.Species == "virginica"])
Out[27]:
If you're using statsmodels, you don't need to explicitly create the effect coding variables like we did above. You can specify that a variable is categorical in the formula itself.
In [28]:
iris_fit2 = smf.ols('SepalLength ~ C(Species)', iris).fit()
You can then pass the model fit results to statsmodels.stats.anova.anova_lm to get an appropriate ANOVA table.
In [29]:
anova.anova_lm(iris_fit2)
Out[29]:
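By default C() applies treatment (dummy) coding; patsy also supports effect coding directly via the Sum contrast. As an added aside (using a new fit object, iris_fit3), the overall F test is unchanged:
In [ ]:
# added variant: effect coding in the formula via patsy's Sum contrast
iris_fit3 = smf.ols('SepalLength ~ C(Species, Sum)', iris).fit()
iris_fit3.fvalue, iris_fit3.f_pvalue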