Learn statsmodels: getting_started


In [ ]:
%matplotlib inline

from __future__ import print_function
import pandas
# patsy: describes statistical models and builds design matrices from R-style formulas
from patsy import dmatrices
import statsmodels.api as sm

In [ ]:
# Fetch the "Guerry" dataset from the R package "HistData"; cache=True asks
# statsmodels to cache the download for later runs.
data = sm.datasets.get_rdataset(dataname="Guerry", package="HistData", cache=True)

# Dataset documentation attached to the returned object.
print(data.__doc__)
# Types of the wrapper object and of the table it carries in `.data`.
print(type(data), type(data.data))
# Column labels available in the table.
print(data.data.columns)

In [ ]:
# Keep only the columns used in this walkthrough.
# The list is called `columns` (not `vars`) so that the built-in vars()
# is not shadowed for the rest of the kernel session.
columns = ['Department', 'Lottery', 'Literacy', 'Wealth', 'Region']
df = data.data[columns]
# Preview the last five rows of the selection.
df.tail(5)

In [ ]:
# Last five rows once rows containing missing values are removed.
# The result is only displayed; `df` itself is left unchanged.
df.dropna().tail(5)

In [ ]:
# Question: is literacy related to the Royal Lottery win rate?
# Build the response (y) and design matrix (X) from an R-style formula;
# return_type='dataframe' requests pandas DataFrames as output.
y, X = dmatrices(
    'Lottery ~ Literacy + Wealth + Region',
    data=data.data,
    return_type='dataframe',
)

In [ ]:
# Last five rows of the response.
y.tail(5)

In [ ]:
# Last five rows of the design matrix.
X.tail(5)

In [ ]:
# Fitting the model takes two steps.
# Step 1: pick the model class (OLS) and give it the response and design matrix.
mod = sm.OLS(y, X)
# Step 2: fit the model to obtain the results object.
res = mod.fit()
# Show the summary of the fitted results.
res.summary()

In [ ]:
# List the public attributes and methods of the results object.
# Names starting with an underscore (private and dunder members) are left
# out so the output shows only what is meant to be used directly.
[name for name in dir(res) if not name.startswith('_')]

In [ ]:
# Run the rainbow test on the fitted results.
# NOTE(review): the return value is presumably an (F statistic, p-value)
# pair -- confirm against the statsmodels documentation.
sm.stats.linear_rainbow(res=res)

In [ ]:
# Partial regression plot: Lottery against Wealth, with Region and Literacy
# as the other regressors.
# This call used to fail on the raw frame. Likely cause (confirm on your
# statsmodels version): the endog, exog_i and exog_others terms are each
# turned into their own design matrix and rows with missing values are
# dropped per term, so a row that is missing in only one term leaves the
# matrices with different lengths. Dropping incomplete rows up front keeps
# all three aligned.
partregress_data = data.data.dropna(subset=['Lottery', 'Wealth', 'Region', 'Literacy'])
sm.graphics.plot_partregress('Lottery', 'Wealth', ['Region', 'Literacy'], data=partregress_data, obs_labels=False)