In [ ]:
import statsmodels.api as sm
This document is based heavily on this excellent resource from UCLA http://www.ats.ucla.edu/stat/r/library/contrast_coding.htm
In [ ]:
import pandas
url = 'http://www.ats.ucla.edu/stat/data/hsb2.csv'
hsb2 = pandas.read_table(url, delimiter=",")
In [ ]:
hsb2.head(10)
In [ ]:
hsb2.groupby('race')['write'].mean()
In [ ]:
from patsy.contrasts import Treatment
levels = [1,2,3,4]
contrast = Treatment(reference=0).code_without_intercept(levels)
print contrast.matrix
In [ ]:
hsb2.race.head(10)
In [ ]:
print contrast.matrix[hsb2.race-1, :][:20]
In [ ]:
sm.categorical(hsb2.race.values)
In [ ]:
from statsmodels.formula.api import ols
mod = ols("write ~ C(race, Treatment)", data=hsb2)
res = mod.fit()
print res.summary()
In [ ]:
from patsy.contrasts import ContrastMatrix
def _name_levels(prefix, levels):
return ["[%s%s]" % (prefix, level) for level in levels]
class Simple(object):
def _simple_contrast(self, levels):
nlevels = len(levels)
contr = -1./nlevels * np.ones((nlevels, nlevels-1))
contr[1:][np.diag_indices(nlevels-1)] = (nlevels-1.)/nlevels
return contr
def code_with_intercept(self, levels):
contrast = np.column_stack((np.ones(len(levels)),
self._simple_contrast(levels)))
return ContrastMatrix(contrast, _name_levels("Simp.", levels))
def code_without_intercept(self, levels):
contrast = self._simple_contrast(levels)
return ContrastMatrix(contrast, _name_levels("Simp.", levels[:-1]))
In [ ]:
hsb2.groupby('race')['write'].mean().mean()
In [ ]:
contrast = Simple().code_without_intercept(levels)
print contrast.matrix
In [ ]:
mod = ols("write ~ C(race, Simple)", data=hsb2)
res = mod.fit()
print res.summary()
In [ ]:
from patsy.contrasts import Sum
contrast = Sum().code_without_intercept(levels)
print contrast.matrix
In [ ]:
mod = ols("write ~ C(race, Sum)", data=hsb2)
res = mod.fit()
print res.summary()
In [ ]:
hsb2.groupby('race')['write'].mean().mean()
In [ ]:
from patsy.contrasts import Diff
contrast = Diff().code_without_intercept(levels)
print contrast.matrix
In [ ]:
mod = ols("write ~ C(race, Diff)", data=hsb2)
res = mod.fit()
print res.summary()
In [ ]:
res.params["C(race, Diff)[D.1]"]
hsb2.groupby('race').mean()["write"][2] - \
hsb2.groupby('race').mean()["write"][1]
In [ ]:
from patsy.contrasts import Helmert
contrast = Helmert().code_without_intercept(levels)
print contrast.matrix
In [ ]:
mod = ols("write ~ C(race, Helmert)", data=hsb2)
res = mod.fit()
print res.summary()
In [ ]:
grouped = hsb2.groupby('race')
grouped.mean()["write"][4] - grouped.mean()["write"][:3].mean()
In [ ]:
k = 4
1./k * (grouped.mean()["write"][k] - grouped.mean()["write"][:k-1].mean())
k = 3
1./k * (grouped.mean()["write"][k] - grouped.mean()["write"][:k-1].mean())
In [ ]:
hsb2['readcat'] = pandas.cut(hsb2.read, bins=3)
hsb2.groupby('readcat').mean()['write']
In [ ]:
from patsy.contrasts import Poly
levels = hsb2.readcat.unique().tolist()
contrast = Poly().code_without_intercept(levels)
print contrast.matrix
In [ ]:
mod = ols("write ~ C(readcat, Poly)", data=hsb2)
res = mod.fit()
print res.summary()