In [1]:
import numpy as np
import statsmodels.api as sm
In [3]:
from statsmodels.formula.api import ols
import statsmodels.formula.api as smf
In [4]:
# List the names exposed by the formula API module (statsmodels.formula.api).
dir(smf)
Out[4]:
In [5]:
# Fetch the 'Guerry' dataset from the 'HistData' package via statsmodels'
# R-dataset loader; cache=True asks statsmodels to cache the download.
dta = sm.datasets.get_rdataset(
    dataname='Guerry',
    package='HistData',
    cache=True,
)
In [6]:
# Keep only the variables used by the models below and drop incomplete rows.
model_columns = ['Lottery', 'Literacy', 'Wealth', 'Region']
df = dta.data.loc[:, model_columns].dropna()
df.head()
Out[6]:
In [9]:
# Regress Lottery on Literacy, Wealth and Region using the formula interface,
# then print the full regression summary as text.
unfitted_model = ols('Lottery ~ Literacy + Wealth + Region', data=df)
model = unfitted_model.fit()
print(model.summary())
Looking at the summary printed above, notice that patsy determined that the elements of Region were text strings, so it treated Region as a categorical variable. patsy also includes an intercept by default, so it automatically dropped one of the Region categories (the reference level) to avoid perfect collinearity with the intercept. If Region had been an integer variable that we wanted to treat explicitly as categorical, we could have done so by using the C() operator:
In [10]:
# Same model as above, but with Region explicitly marked as categorical via C().
categorical_model = ols('Lottery ~ Literacy + Wealth + C(Region)', data=df)
res = categorical_model.fit()
print(res.params)
In [12]:
# The '-1' term in the formula removes the intercept from the model.
no_intercept_model = ols('Lottery ~ Literacy + Wealth + C(Region) -1 ', data=df)
res = no_intercept_model.fit()
print(res.params)
In [13]:
# ':' adds only the Literacy-by-Wealth interaction term (no main effects).
res1 = ols('Lottery ~ Literacy : Wealth - 1', data=df).fit()
print(res1.params, '\n')

# '*' expands to both main effects plus their interaction.
res2 = ols('Lottery ~ Literacy * Wealth - 1', data=df).fit()
print(res2.params)
In [14]:
# Vectorized functions such as np.log can be applied to variables inside the formula.
log_model = smf.ols('Lottery ~ np.log(Literacy)', data=df)
res = log_model.fit()
print(res.params)
User-defined function
In [15]:
def log_plus_1(x):
    """Return the natural logarithm of ``x`` plus one, i.e. ``np.log(x) + 1``.

    The computation is delegated to ``np.log``, so ``x`` may be a scalar or
    an array-like object and the result is computed element-wise.

    Note that this is ``log(x) + 1``, not ``np.log1p(x)`` (``log(1 + x)``).
    """
    return np.log(x) + 1.
# A user-defined function can be called inside the formula just like np.log.
custom_transform_model = smf.ols('Lottery ~ log_plus_1(Literacy)', data=df)
res = custom_transform_model.fit()
print(res.params)
In [16]:
import patsy
# Build the response (y) and design (X) matrices directly with patsy so they
# can be passed to model classes that do not accept a formula.
f = 'Lottery ~ Literacy * Wealth'
y, X = patsy.dmatrices(f, data=df, return_type='dataframe')

# Show the first five rows of each matrix.
print(y.head(5))
print(X.head(5))
In [17]:
# Fit OLS on the patsy-built matrices and print the regression summary.
matrix_fit = sm.OLS(endog=y, exog=X).fit()
print(matrix_fit.summary())