In [1]:
%pylab inline
import pandas as pd


Populating the interactive namespace from numpy and matplotlib

Logistic Regression = Binomial regression with logit function

This notebook shows (empirically) that performing a logistic regression for binary data is equivalent to a binomial regression with logit link.


In [23]:
ungrouped_data = pd.read_csv("../data/Beetles.dat", sep="\t")
ungrouped_data


Out[23]:
x y
0 1.691 1
1 1.691 1
2 1.691 1
3 1.691 1
4 1.691 1
... ... ...
476 1.884 1
477 1.884 1
478 1.884 1
479 1.884 1
480 1.884 1

481 rows × 2 columns


In [34]:
grouped_data = pd.read_csv("../data/Beetles2.dat", sep= "\t")
grouped_data['alive'] = grouped_data['n'] - grouped_data['dead']
grouped_data


Out[34]:
logdose n dead alive
0 1.691 59 6 53
1 1.724 60 13 47
2 1.755 62 18 44
3 1.784 56 28 28
4 1.811 63 52 11
5 1.837 59 53 6
6 1.861 62 61 1
7 1.884 60 60 0

Logistic


In [25]:
import statsmodels.api as sm
#spector_data.exog = sm.add_constant(spector_data.exog)
logit_mod = sm.Logit(ungrouped_data['y'], sm.add_constant(ungrouped_data['x']))
logit_res = logit_mod.fit()


Optimization terminated successfully.
         Current function value: 0.387063
         Iterations 7

In [26]:
print(logit_res.summary())


                           Logit Regression Results                           
==============================================================================
Dep. Variable:                      y   No. Observations:                  481
Model:                          Logit   Df Residuals:                      479
Method:                           MLE   Df Model:                            1
Date:                Fri, 07 Feb 2020   Pseudo R-squ.:                  0.4231
Time:                        16:09:36   Log-Likelihood:                -186.18
converged:                       True   LL-Null:                       -322.72
Covariance Type:            nonrobust   LLR p-value:                 2.411e-61
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        -60.7401      5.182    -11.722      0.000     -70.896     -50.584
x             34.2859      2.913     11.769      0.000      28.576      39.996
==============================================================================

Binomial


In [27]:
glm_model = sm.GLM(grouped_data['dead']/grouped_data['n'], sm.add_constant(grouped_data['logdose']), family=sm.families.Binomial(link=sm.families.links.logit),
                   weights = grouped_data['n'])
glm_fit = glm_model.fit()
print(glm_fit.summary())


                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:                      y   No. Observations:                    8
Model:                            GLM   Df Residuals:                        6
Model Family:                Binomial   Df Model:                            1
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -2.1699
Date:                Fri, 07 Feb 2020   Deviance:                      0.18673
Time:                        16:09:58   Pearson chi2:                    0.166
No. Iterations:                     5                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        -60.4819     40.062     -1.510      0.131    -139.002      18.039
logdose       34.1370     22.522      1.516      0.130     -10.005      78.279
==============================================================================
/home/cmb-06/as/skchoudh/software_frozen/anaconda37/envs/riboraptor/lib/python3.7/site-packages/ipykernel_launcher.py:1: DeprecationWarning: Calling Family(..) with a link class as argument is deprecated.
Use an instance of a link class instead.
  """Entry point for launching an IPython kernel.

The above is an abuse of statsmodels. I don't even know why it works.


In [35]:
glm_model = sm.GLM(grouped_data[['dead', 'alive']] , sm.add_constant(grouped_data['logdose']), family=sm.families.Binomial(link=sm.families.links.logit),
                   )
glm_fit = glm_model.fit()
print(glm_fit.summary())


                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:      ['dead', 'alive']   No. Observations:                    8
Model:                            GLM   Df Residuals:                        6
Model Family:                Binomial   Df Model:                            1
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -18.657
Date:                Fri, 07 Feb 2020   Deviance:                       11.116
Time:                        17:25:29   Pearson chi2:                     9.91
No. Iterations:                     6                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        -60.7401      5.182    -11.722      0.000     -70.896     -50.584
logdose       34.2859      2.913     11.769      0.000      28.576      39.996
==============================================================================
/home/cmb-06/as/skchoudh/software_frozen/anaconda37/envs/riboraptor/lib/python3.7/site-packages/ipykernel_launcher.py:1: DeprecationWarning: Calling Family(..) with a link class as argument is deprecated.
Use an instance of a link class instead.
  """Entry point for launching an IPython kernel.

In [16]:
glm_model = sm.GLM(grouped_data['y'], sm.add_constant(grouped_data['x']), family=sm.families.Binomial(link=sm.families.links.logit))
glm_fit = glm_model.fit()
print(glm_fit.summary())


                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:                      y   No. Observations:                  481
Model:                            GLM   Df Residuals:                      479
Model Family:                Binomial   Df Model:                            1
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -186.18
Date:                Fri, 07 Feb 2020   Deviance:                       372.35
Time:                        12:45:10   Pearson chi2:                     436.
No. Iterations:                     6                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        -60.7401      5.182    -11.722      0.000     -70.896     -50.584
x             34.2859      2.913     11.769      0.000      28.576      39.996
==============================================================================
/home/cmb-06/as/skchoudh/software_frozen/anaconda37/envs/riboraptor/lib/python3.7/site-packages/ipykernel_launcher.py:1: DeprecationWarning: Calling Family(..) with a link class as argument is deprecated.
Use an instance of a link class instead.
  """Entry point for launching an IPython kernel.

In [ ]: