In [1]:
import os

import pandas as pd
import numpy as np

from sklearn import linear_model
from sklearn.metrics import r2_score

import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split

In [2]:
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
pd.set_option('display.float_format', '{:.2f}'.format)

In [3]:
df = pd.read_pickle('../Chapter 7 - Data Preparation and Visualization/claims_df')

In [4]:
disease = ['SP_ALZHDMTA', 'SP_CHF', 'SP_CHRNKIDN', 'SP_CNCR', 'SP_COPD',
           'SP_DEPRESSN', 'SP_DIABETES', 'SP_ISCHMCHT', 'SP_OSTEOPRS',
           'SP_RA_OA', 'SP_STRKETIA']

In [5]:
gender = ['gender_2']
ESRD = ['ESRD_Y']

In [6]:
X = df[disease+gender+ESRD]
y = df.TOTAL_LOG_PAID

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=314)

In [8]:
X.columns


Out[8]:
Index(['SP_ALZHDMTA', 'SP_CHF', 'SP_CHRNKIDN', 'SP_CNCR', 'SP_COPD',
       'SP_DEPRESSN', 'SP_DIABETES', 'SP_ISCHMCHT', 'SP_OSTEOPRS', 'SP_RA_OA',
       'SP_STRKETIA', 'gender_2', 'ESRD_Y'],
      dtype='object')

Vanilla scikit-learn example


In [9]:
lm = linear_model.LinearRegression()

lm.fit(X_train, y_train)

y_pred_train = lm.predict(X_train)

y_pred_test = lm.predict(X_test)

print('Training R^2: {:,.3}%'.format(r2_score(y_train, y_pred_train)*100))

print('Test R^2: {:,.3}%'.format(r2_score(y_test, y_pred_test)*100))

print('Intercept: {:,.2}'.format(lm.intercept_))

print('Coefficients: ' + ', '.join(['{}: {:,.2}'.format(col, coef_val) for col, coef_val in zip(X.columns, lm.coef_)]))


Training R^2: 50.1%
Test R^2: 49.9%
Intercept: 6.1
Coefficients: SP_ALZHDMTA: 0.35, SP_CHF: 0.42, SP_CHRNKIDN: 0.67, SP_CNCR: 0.54, SP_COPD: 0.42, SP_DEPRESSN: 0.34, SP_DIABETES: 0.49, SP_ISCHMCHT: 0.59, SP_OSTEOPRS: 0.27, SP_RA_OA: 0.39, SP_STRKETIA: 0.41, gender_2: 0.0086, ESRD_Y: 0.23
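
Because the target is TOTAL_LOG_PAID, the fitted coefficients live on a log scale. Assuming the column is the natural log of total paid claims (which the name suggests, but the data preparation chapter would confirm), each coefficient can be read as a multiplicative effect by exponentiating it; a quick sketch:

# Translate log-scale coefficients into multiplicative effects on paid claims
# (assumes TOTAL_LOG_PAID is a natural log; adjust the base if it is not)
for col, coef_val in zip(X.columns, lm.coef_):
    print('{}: x{:.2f}'.format(col, np.exp(coef_val)))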

Using the actuary package (enhanced scikit-learn)


In [10]:
from actuary.regression.linear_regression import lm

In [11]:
help(lm)


Help on function lm in module actuary.regression.linear_regression:

lm(X, y, sample_weight=None)
    lm
    
    Enhanced sklearn code for running a linear regression including pandas support and p-values
    Returns a linear model with R^2 automatically calculated as the r2 attribute and T-statistics/p-values as result attribute
    X: A DataFrame containing the independent variables
    y: A ndarray containing the outcome variable
    sample_weight: An optional argument containing sample weights for each row of X
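
To make the mechanics concrete, the sketch below shows one way such an enhanced helper could be built on top of scikit-learn, computing classical OLS standard errors, t-statistics, and p-values after the fit. The function name lm_with_pvalues is hypothetical, the sample_weight option is omitted, and the actuary package's own implementation may differ.

import numpy as np
import pandas as pd
from scipy import stats
from sklearn import linear_model

def lm_with_pvalues(X, y):
    """Fit OLS with scikit-learn and attach classical t-statistics/p-values (sketch)."""
    model = linear_model.LinearRegression()
    model.fit(X, y)

    # Design matrix with an explicit intercept column for the standard errors
    n, k = X.shape
    X_design = np.column_stack([np.ones(n), X.values])

    residuals = y - model.predict(X)
    dof = n - k - 1                              # residual degrees of freedom
    sigma2 = residuals.dot(residuals) / dof      # residual variance estimate

    # Var(beta_hat) = sigma^2 * (X'X)^-1; the diagonal gives squared standard errors
    cov = sigma2 * np.linalg.inv(X_design.T @ X_design)
    se = np.sqrt(np.diag(cov))

    coefs = np.concatenate([[model.intercept_], model.coef_])
    t_vals = coefs / se
    p_vals = 2 * stats.t.sf(np.abs(t_vals), dof)

    return pd.DataFrame({'coef': coefs, 'se': se, 't_val': t_vals, 'p_val': p_vals},
                        index=['intercept'] + list(X.columns))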


In [12]:
model = lm(X_train, y_train)

In [13]:
model.r2


Out[13]:
'50.1%'

In [14]:
model.result


Out[14]:
             coef   se  t_val  p_val
intercept    6.06 0.04 143.44   0.00
SP_ALZHDMTA  0.35 0.05   6.67   0.00
SP_CHF       0.42 0.05   8.60   0.00
SP_CHRNKIDN  0.67 0.06  10.62   0.00
SP_CNCR      0.54 0.08   6.78   0.00
SP_COPD      0.42 0.06   6.84   0.00
SP_DEPRESSN  0.34 0.05   6.90   0.00
SP_DIABETES  0.49 0.05   9.93   0.00
SP_ISCHMCHT  0.59 0.05  12.22   0.00
SP_OSTEOPRS  0.27 0.05   5.21   0.00
SP_RA_OA     0.39 0.06   7.00   0.00
SP_STRKETIA  0.41 0.10   4.29   0.00
gender_2     0.01 0.04   0.20   0.84
ESRD_Y       0.23 0.08   3.00   0.00

In [15]:
model.result.sort_values(by='coef', ascending=False)


Out[15]:
             coef   se  t_val  p_val
intercept    6.06 0.04 143.44   0.00
SP_CHRNKIDN  0.67 0.06  10.62   0.00
SP_ISCHMCHT  0.59 0.05  12.22   0.00
SP_CNCR      0.54 0.08   6.78   0.00
SP_DIABETES  0.49 0.05   9.93   0.00
SP_CHF       0.42 0.05   8.60   0.00
SP_COPD      0.42 0.06   6.84   0.00
SP_STRKETIA  0.41 0.10   4.29   0.00
SP_RA_OA     0.39 0.06   7.00   0.00
SP_ALZHDMTA  0.35 0.05   6.67   0.00
SP_DEPRESSN  0.34 0.05   6.90   0.00
SP_OSTEOPRS  0.27 0.05   5.21   0.00
ESRD_Y       0.23 0.08   3.00   0.00
gender_2     0.01 0.04   0.20   0.84
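
With the results in a plain DataFrame, terms that do not look statistically significant (only gender_2 here, with p ≈ 0.84) can be flagged with ordinary pandas filtering; a small sketch, assuming result behaves like the DataFrame shown above:

# Terms whose p-values exceed a conventional 0.05 threshold
model.result[model.result['p_val'] > 0.05]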

statsmodels approach


In [16]:
X_train_df = X_train.copy()  # work on a copy so the outcome column is not added to X_train

In [17]:
X_train_df.loc[:, 'TOTAL_LOG_PAID'] = y_train  # the formula interface expects the outcome in the same DataFrame

In [18]:
mod = smf.ols(formula='TOTAL_LOG_PAID ~ gender_2 + ESRD_Y + SP_ALZHDMTA + SP_CHF + SP_CHRNKIDN + SP_CNCR + SP_COPD + SP_DEPRESSN + SP_DIABETES + SP_ISCHMCHT + SP_OSTEOPRS + SP_RA_OA + SP_STRKETIA', data=X_train_df)

res = mod.fit()

print(res.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:         TOTAL_LOG_PAID   R-squared:                       0.501
Model:                            OLS   Adj. R-squared:                  0.499
Method:                 Least Squares   F-statistic:                     216.8
Date:                Thu, 21 Jun 2018   Prob (F-statistic):               0.00
Time:                        10:56:33   Log-Likelihood:                -4302.0
No. Observations:                2820   AIC:                             8632.
Df Residuals:                    2806   BIC:                             8715.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
===============================================================================
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept       6.0646      0.042    143.445      0.000       5.982       6.147
gender_2        0.0086      0.042      0.205      0.838      -0.074       0.092
ESRD_Y          0.2341      0.078      2.997      0.003       0.081       0.387
SP_ALZHDMTA     0.3498      0.052      6.668      0.000       0.247       0.453
SP_CHF          0.4245      0.049      8.600      0.000       0.328       0.521
SP_CHRNKIDN     0.6666      0.063     10.616      0.000       0.543       0.790
SP_CNCR         0.5354      0.079      6.782      0.000       0.381       0.690
SP_COPD         0.4194      0.061      6.839      0.000       0.299       0.540
SP_DEPRESSN     0.3371      0.049      6.897      0.000       0.241       0.433
SP_DIABETES     0.4866      0.049      9.928      0.000       0.390       0.583
SP_ISCHMCHT     0.5945      0.049     12.222      0.000       0.499       0.690
SP_OSTEOPRS     0.2719      0.052      5.210      0.000       0.170       0.374
SP_RA_OA        0.3922      0.056      6.996      0.000       0.282       0.502
SP_STRKETIA     0.4120      0.096      4.286      0.000       0.224       0.601
==============================================================================
Omnibus:                       59.415   Durbin-Watson:                   1.953
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               75.069
Skew:                          -0.273   Prob(JB):                     5.00e-17
Kurtosis:                       3.583   Cond. No.                         7.50
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
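
The statsmodels fit can also be scored on the held-out 40% split for a like-for-like comparison with the scikit-learn numbers above; a quick sketch using the existing X_test, y_test, and r2_score:

# Predict with the formula-based fit on the test rows and compute R^2
y_pred_test_sm = res.predict(X_test)
print('Test R^2: {:,.3}%'.format(r2_score(y_test, y_pred_test_sm) * 100))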
