In [ ]:
    
import statsmodels.api as sm
    
This document is based heavily on this excellent resource from UCLA http://www.ats.ucla.edu/stat/r/library/contrast_coding.htm
In [ ]:
    
import pandas
url = 'http://www.ats.ucla.edu/stat/data/hsb2.csv'
hsb2 = pandas.read_table(url, delimiter=",")
    
In [ ]:
    
hsb2.head(10)
    
In [ ]:
    
hsb2.groupby('race')['write'].mean()
    
In [ ]:
    
from patsy.contrasts import Treatment
levels = [1,2,3,4]
contrast = Treatment(reference=0).code_without_intercept(levels)
print contrast.matrix
    
In [ ]:
    
hsb2.race.head(10)
    
In [ ]:
    
print contrast.matrix[hsb2.race-1, :][:20]
    
In [ ]:
    
sm.categorical(hsb2.race.values)
    
In [ ]:
    
from statsmodels.formula.api import ols
mod = ols("write ~ C(race, Treatment)", data=hsb2)
res = mod.fit()
print res.summary()
    
In [ ]:
    
from patsy.contrasts import ContrastMatrix
def _name_levels(prefix, levels):
    return ["[%s%s]" % (prefix, level) for level in levels]
class Simple(object):
    def _simple_contrast(self, levels):
        nlevels = len(levels)
        contr = -1./nlevels * np.ones((nlevels, nlevels-1))
        contr[1:][np.diag_indices(nlevels-1)] = (nlevels-1.)/nlevels
        return contr
    def code_with_intercept(self, levels):
        contrast = np.column_stack((np.ones(len(levels)),
                                    self._simple_contrast(levels)))
        return ContrastMatrix(contrast, _name_levels("Simp.", levels))
    def code_without_intercept(self, levels):
        contrast = self._simple_contrast(levels)
        return ContrastMatrix(contrast, _name_levels("Simp.", levels[:-1]))
    
In [ ]:
    
hsb2.groupby('race')['write'].mean().mean()
    
In [ ]:
    
contrast = Simple().code_without_intercept(levels)
print contrast.matrix
    
In [ ]:
    
mod = ols("write ~ C(race, Simple)", data=hsb2)
res = mod.fit()
print res.summary()
    
In [ ]:
    
from patsy.contrasts import Sum
contrast = Sum().code_without_intercept(levels)
print contrast.matrix
    
In [ ]:
    
mod = ols("write ~ C(race, Sum)", data=hsb2)
res = mod.fit()
print res.summary()
    
In [ ]:
    
hsb2.groupby('race')['write'].mean().mean()
    
In [ ]:
    
from patsy.contrasts import Diff
contrast = Diff().code_without_intercept(levels)
print contrast.matrix
    
In [ ]:
    
mod = ols("write ~ C(race, Diff)", data=hsb2)
res = mod.fit()
print res.summary()
    
In [ ]:
    
res.params["C(race, Diff)[D.1]"]
hsb2.groupby('race').mean()["write"][2] - \
     hsb2.groupby('race').mean()["write"][1]
    
In [ ]:
    
from patsy.contrasts import Helmert
contrast = Helmert().code_without_intercept(levels)
print contrast.matrix
    
In [ ]:
    
mod = ols("write ~ C(race, Helmert)", data=hsb2)
res = mod.fit()
print res.summary()
    
In [ ]:
    
grouped = hsb2.groupby('race')
grouped.mean()["write"][4] - grouped.mean()["write"][:3].mean()
    
In [ ]:
    
k = 4
1./k * (grouped.mean()["write"][k] - grouped.mean()["write"][:k-1].mean())
k = 3
1./k * (grouped.mean()["write"][k] - grouped.mean()["write"][:k-1].mean())
    
In [ ]:
    
hsb2['readcat'] = pandas.cut(hsb2.read, bins=3)
hsb2.groupby('readcat').mean()['write']
    
In [ ]:
    
from patsy.contrasts import Poly
levels = hsb2.readcat.unique().tolist()
contrast = Poly().code_without_intercept(levels)
print contrast.matrix
    
In [ ]:
    
mod = ols("write ~ C(readcat, Poly)", data=hsb2)
res = mod.fit()
print res.summary()