In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('bmh')

import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
# see http://statsmodels.sourceforge.net/devel/datasets/generated/randhie.html
rand_data = sm.datasets.randhie.load()

In [3]:
rand_data.names


Out[3]:
['mdvis',
 'lncoins',
 'idp',
 'lpi',
 'fmde',
 'physlm',
 'disea',
 'hlthg',
 'hlthf',
 'hlthp']

In [4]:
plt.hist(rand_data.endog, bins=50);



In [5]:
corrmat = np.corrcoef(rand_data.exog, rowvar=0)  # columns are predictors

In [6]:
corrmat[:,0]


Out[6]:
array([ 1.        , -0.24799029,  0.4046663 ,  0.54992668, -0.01422813,
        0.03181828,  0.01085513, -0.00512905, -0.03135219])

In [7]:
fig, ax = plt.subplots(3,3, figsize=(12,12))
for i in range(3):
    for j in range(3):
        ax[i,j].scatter(rand_data.exog[:,0], rand_data.exog[:,(i+1)*(j+1) - 1]);



In [8]:
X = np.hstack([np.ones(shape=(rand_data.exog.shape[0],1)), rand_data.exog])
y = rand_data.endog

In [9]:
pm = sm.Poisson(y, X)
pm_results = pm.fit(method="newton")
print(pm_results.summary())


Optimization terminated successfully.
         Current function value: 3.091609
         Iterations 12
                          Poisson Regression Results                          
==============================================================================
Dep. Variable:                      y   No. Observations:                20190
Model:                        Poisson   Df Residuals:                    20180
Method:                           MLE   Df Model:                            9
Date:                Wed, 20 Jul 2016   Pseudo R-squ.:                 0.06343
Time:                        11:22:36   Log-Likelihood:                -62420.
converged:                       True   LL-Null:                       -66647.
                                        LLR p-value:                     0.000
==============================================================================
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          0.7004      0.011     62.741      0.000         0.678     0.722
x1            -0.0525      0.003    -18.216      0.000        -0.058    -0.047
x2            -0.2471      0.011    -23.272      0.000        -0.268    -0.226
x3             0.0353      0.002     19.302      0.000         0.032     0.039
x4            -0.0346      0.002    -21.439      0.000        -0.038    -0.031
x5             0.2717      0.012     22.200      0.000         0.248     0.296
x6             0.0339      0.001     60.098      0.000         0.033     0.035
x7            -0.0126      0.009     -1.366      0.172        -0.031     0.005
x8             0.0541      0.015      3.531      0.000         0.024     0.084
x9             0.2061      0.026      7.843      0.000         0.155     0.258
==============================================================================

In [10]:
# calculation of pseudo-r squared
(pm_results.llnull - pm_results.llf) / pm_results.llnull


Out[10]:
0.063432436547781079

In [11]:
# overdispersed
np.var(y) > np.mean(y)


Out[11]:
True

In [12]:
nb = sm.NegativeBinomial(y, X)
nb_results = nb.fit(method='lbfgs', maxiter=100)
print(nb_results.summary())


                     NegativeBinomial Regression Results                      
==============================================================================
Dep. Variable:                      y   No. Observations:                20190
Model:               NegativeBinomial   Df Residuals:                    20180
Method:                           MLE   Df Model:                            9
Date:                Wed, 20 Jul 2016   Pseudo R-squ.:                 0.01845
Time:                        11:31:35   Log-Likelihood:                -43384.
converged:                       True   LL-Null:                       -44199.
                                        LLR p-value:                     0.000
==============================================================================
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          0.6634      0.025     26.780      0.000         0.615     0.712
x1            -0.0579      0.006     -9.509      0.000        -0.070    -0.046
x2            -0.2675      0.023    -11.786      0.000        -0.312    -0.223
x3             0.0412      0.004      9.931      0.000         0.033     0.049
x4            -0.0381      0.003    -11.218      0.000        -0.045    -0.031
x5             0.2692      0.030      8.990      0.000         0.211     0.328
x6             0.0382      0.001     26.086      0.000         0.035     0.041
x7            -0.0443      0.020     -2.207      0.027        -0.084    -0.005
x8             0.0173      0.036      0.478      0.633        -0.054     0.088
x9             0.1776      0.074      2.392      0.017         0.032     0.323
alpha          1.2931      0.019     69.475      0.000         1.257     1.330
==============================================================================

In [13]:
(nb_results.llnull - nb_results.llf) / nb_results.llnull


Out[13]:
0.018453065779465476

In [ ]: