In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('bmh')
import statsmodels.api as sm
import statsmodels.formula.api as smf
In [2]:
# RAND Health Insurance Experiment data (endog = number of outpatient visits)
# see http://statsmodels.sourceforge.net/devel/datasets/generated/randhie.html
rand_data = sm.datasets.randhie.load()
In [3]:
rand_data.names
Out[3]:
In [4]:
plt.hist(rand_data.endog, bins=50);
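The visit counts are heavily right-skewed, so a log-scaled count axis can make the long tail easier to see. A minimal sketch reusing the same data:
In [ ]:
plt.hist(rand_data.endog, bins=50, log=True);  # log-scaled y-axis to show the long tail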
In [5]:
corrmat = np.corrcoef(rand_data.exog, rowvar=False)  # columns are predictors, so correlate across columns
In [6]:
corrmat[:, 0]  # correlations of the first predictor with each of the predictors
Out[6]:
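Beyond the first column, the full correlation structure among the predictors can be scanned at a glance with a heatmap. A quick sketch, assuming corrmat from the cell above:
In [ ]:
plt.imshow(corrmat, cmap='coolwarm', vmin=-1, vmax=1)  # pairwise predictor correlations
plt.colorbar();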
In [7]:
fig, ax = plt.subplots(3, 3, figsize=(12, 12))
# scatter the first predictor against each of the nine predictors
for i in range(3):
    for j in range(3):
        ax[i, j].scatter(rand_data.exog[:, 0], rand_data.exog[:, i*3 + j]);
In [8]:
# prepend an intercept column to the predictor matrix
X = np.hstack([np.ones(shape=(rand_data.exog.shape[0], 1)), rand_data.exog])
y = rand_data.endog
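Equivalently, statsmodels can prepend the intercept column itself via sm.add_constant; a sketch of the same design matrix:
In [ ]:
X_alt = sm.add_constant(rand_data.exog)  # same as hstack-ing a column of ones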
In [9]:
# Poisson regression of visit counts on the predictors
pm = sm.Poisson(y, X)
pm_results = pm.fit(method="newton")
print(pm_results.summary())
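The Poisson coefficients are on the log scale; exponentiating them gives multiplicative effects on the expected visit count. A quick sketch using the fitted results:
In [ ]:
np.exp(pm_results.params)  # rate ratios: multiplicative effect of a one-unit change in each predictor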
In [10]:
# McFadden's pseudo R-squared: (llnull - llf) / llnull = 1 - llf / llnull
(pm_results.llnull - pm_results.llf) / pm_results.llnull
Out[10]:
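This is McFadden's pseudo R-squared, which statsmodels also exposes directly on the results object; a quick check that the two agree:
In [ ]:
pm_results.prsquared  # should match the manual calculation above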
In [11]:
# check for overdispersion: the sample variance exceeds the sample mean,
# so the Poisson assumption (variance equal to mean) is too restrictive
np.var(y) > np.mean(y)
Out[11]:
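The boolean only says the variance exceeds the mean; the variance-to-mean ratio gives a rough sense of how severe the overdispersion is (a value near 1.0 would be consistent with a Poisson model). A sketch:
In [ ]:
np.var(y) / np.mean(y)  # dispersion index; values well above 1 suggest overdispersion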
In [12]:
# a negative binomial model allows the variance to exceed the mean
nb = sm.NegativeBinomial(y, X)
nb_results = nb.fit(method='lbfgs', maxiter=100)
print(nb_results.summary())
In [13]:
# McFadden's pseudo R-squared for the negative binomial fit
(nb_results.llnull - nb_results.llf) / nb_results.llnull
Out[13]:
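To compare the two fits directly, the log-likelihoods and AIC values of both results objects can be put side by side (a sketch; lower AIC is better):
In [ ]:
print("Poisson:           llf = %.1f, AIC = %.1f" % (pm_results.llf, pm_results.aic))
print("Negative binomial: llf = %.1f, AIC = %.1f" % (nb_results.llf, nb_results.aic))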