In [1]:
import os

import pandas as pd
import numpy as np

from sklearn import linear_model
from sklearn.metrics import r2_score

import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split

In [2]:
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
pd.set_option('display.float_format', '{:.2f}'.format)

In [3]:
df = pd.read_pickle('../Chapter 7 - Data Preparation and Visualization/claims_df')

In [4]:
disease = ['SP_ALZHDMTA', 'SP_CHF', 'SP_CHRNKIDN', 'SP_CNCR', 'SP_COPD',
           'SP_DEPRESSN', 'SP_DIABETES', 'SP_ISCHMCHT', 'SP_OSTEOPRS',
           'SP_RA_OA', 'SP_STRKETIA']

In [5]:
gender = ['gender_2']
ESRD = ['ESRD_Y']

In [6]:
X = df[disease+gender+ESRD]
y = df.TOTAL_LOG_PAID

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=314)

In [8]:
X.columns


Out[8]:
Index(['SP_ALZHDMTA', 'SP_CHF', 'SP_CHRNKIDN', 'SP_CNCR', 'SP_COPD',
       'SP_DEPRESSN', 'SP_DIABETES', 'SP_ISCHMCHT', 'SP_OSTEOPRS', 'SP_RA_OA',
       'SP_STRKETIA', 'gender_2', 'ESRD_Y'],
      dtype='object')

Vanilla scikit-learn example


In [9]:
lm = linear_model.LinearRegression()

lm.fit(X_train, y_train)

y_pred_train = lm.predict(X_train)

y_pred_test = lm.predict(X_test)

print('Training R^2: {:,.3}%'.format(r2_score(y_train, y_pred_train)*100))

print('Test R^2: {:,.3}%'.format(r2_score(y_test, y_pred_test)*100))

print('Intercept: {:,.2}'.format(lm.intercept_))

print('Coefficients: ' + ', '.join(['{}: {:,.2}'.format(col, coef_val) for col, coef_val in zip(X.columns, lm.coef_)]))


Training R^2: 50.1%
Test R^2: 49.9%
Intercept: 6.1
Coefficients: SP_ALZHDMTA: 0.35, SP_CHF: 0.42, SP_CHRNKIDN: 0.67, SP_CNCR: 0.54, SP_COPD: 0.42, SP_DEPRESSN: 0.34, SP_DIABETES: 0.49, SP_ISCHMCHT: 0.59, SP_OSTEOPRS: 0.27, SP_RA_OA: 0.39, SP_STRKETIA: 0.41, gender_2: 0.0086, ESRD_Y: 0.23
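
Because the target is TOTAL_LOG_PAID, the fitted coefficients live on a log scale. Assuming the column is the natural log of total paid claims (which the name suggests, but the data preparation chapter would confirm), each coefficient can be read as a multiplicative effect by exponentiating it; a quick sketch:

# Translate log-scale coefficients into multiplicative effects on paid claims
# (assumes TOTAL_LOG_PAID is a natural log; adjust the base if it is not)
for col, coef_val in zip(X.columns, lm.coef_):
    print('{}: x{:.2f}'.format(col, np.exp(coef_val)))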

Using the actuary package (enhanced scikit-learn)


In [10]:
from actuary.regression.linear_regression import lm

In [11]:
help(lm)


Help on function lm in module actuary.regression.linear_regression:

lm(X, y, sample_weight=None)
    lm
    
    Enhanced sklearn code for running a linear regression including pandas support and p-values
    Returns a linear model with R^2 automatically calculated as the r2 attribute and T-statistics/p-values as result attribute
    X: A DataFrame containing the independent variables
    y: A ndarray containing the outcome variable
    sample_weight: An optional argument containing sample weights for each row of X
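
To make the mechanics concrete, the sketch below shows one way such an enhanced helper could be built on top of scikit-learn, computing classical OLS standard errors, t-statistics, and p-values after the fit. The function name lm_with_pvalues is hypothetical, the sample_weight option is omitted, and the actuary package's own implementation may differ.

import numpy as np
import pandas as pd
from scipy import stats
from sklearn import linear_model

def lm_with_pvalues(X, y):
    """Fit OLS with scikit-learn and attach classical t-statistics/p-values (sketch)."""
    model = linear_model.LinearRegression()
    model.fit(X, y)

    # Design matrix with an explicit intercept column for the standard errors
    n, k = X.shape
    X_design = np.column_stack([np.ones(n), X.values])

    residuals = y - model.predict(X)
    dof = n - k - 1                              # residual degrees of freedom
    sigma2 = residuals.dot(residuals) / dof      # residual variance estimate

    # Var(beta_hat) = sigma^2 * (X'X)^-1; the diagonal gives squared standard errors
    cov = sigma2 * np.linalg.inv(X_design.T @ X_design)
    se = np.sqrt(np.diag(cov))

    coefs = np.concatenate([[model.intercept_], model.coef_])
    t_vals = coefs / se
    p_vals = 2 * stats.t.sf(np.abs(t_vals), dof)

    return pd.DataFrame({'coef': coefs, 'se': se, 't_val': t_vals, 'p_val': p_vals},
                        index=['intercept'] + list(X.columns))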


In [12]:
model = lm(X_train, y_train)

In [13]:
model.r2


Out[13]:
'50.1%'

In [14]:
model.result


Out[14]:
             coef   se  t_val  p_val
intercept    6.06 0.04 143.44   0.00
SP_ALZHDMTA  0.35 0.05   6.67   0.00
SP_CHF       0.42 0.05   8.60   0.00
SP_CHRNKIDN  0.67 0.06  10.62   0.00
SP_CNCR      0.54 0.08   6.78   0.00
SP_COPD      0.42 0.06   6.84   0.00
SP_DEPRESSN  0.34 0.05   6.90   0.00
SP_DIABETES  0.49 0.05   9.93   0.00
SP_ISCHMCHT  0.59 0.05  12.22   0.00
SP_OSTEOPRS  0.27 0.05   5.21   0.00
SP_RA_OA     0.39 0.06   7.00   0.00
SP_STRKETIA  0.41 0.10   4.29   0.00
gender_2     0.01 0.04   0.20   0.84
ESRD_Y       0.23 0.08   3.00   0.00

In [15]:
model.result.sort_values(by='coef', ascending=False)


Out[15]:
             coef   se  t_val  p_val
intercept    6.06 0.04 143.44   0.00
SP_CHRNKIDN  0.67 0.06  10.62   0.00
SP_ISCHMCHT  0.59 0.05  12.22   0.00
SP_CNCR      0.54 0.08   6.78   0.00
SP_DIABETES  0.49 0.05   9.93   0.00
SP_CHF       0.42 0.05   8.60   0.00
SP_COPD      0.42 0.06   6.84   0.00
SP_STRKETIA  0.41 0.10   4.29   0.00
SP_RA_OA     0.39 0.06   7.00   0.00
SP_ALZHDMTA  0.35 0.05   6.67   0.00
SP_DEPRESSN  0.34 0.05   6.90   0.00
SP_OSTEOPRS  0.27 0.05   5.21   0.00
ESRD_Y       0.23 0.08   3.00   0.00
gender_2     0.01 0.04   0.20   0.84
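
With the results in a plain DataFrame, terms that do not look statistically significant (only gender_2 here, with p ≈ 0.84) can be flagged with ordinary pandas filtering; a small sketch, assuming result behaves like the DataFrame shown above:

# Terms whose p-values exceed a conventional 0.05 threshold
model.result[model.result['p_val'] > 0.05]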

statsmodels approach


In [16]:
X_train_df = X_train.copy()  # work on a copy so the outcome column is not added to X_train

In [17]:
X_train_df.loc[:, 'TOTAL_LOG_PAID'] = y_train  # the formula interface expects the outcome in the same DataFrame

In [18]:
mod = smf.ols(formula='TOTAL_LOG_PAID ~ gender_2 + ESRD_Y + SP_ALZHDMTA + SP_CHF + SP_CHRNKIDN + SP_CNCR + SP_COPD + SP_DEPRESSN + SP_DIABETES + SP_ISCHMCHT + SP_OSTEOPRS + SP_RA_OA + SP_STRKETIA', data=X_train_df)

res = mod.fit()

print(res.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:         TOTAL_LOG_PAID   R-squared:                       0.501
Model:                            OLS   Adj. R-squared:                  0.499
Method:                 Least Squares   F-statistic:                     216.8
Date:                Thu, 21 Jun 2018   Prob (F-statistic):               0.00
Time:                        10:56:33   Log-Likelihood:                -4302.0
No. Observations:                2820   AIC:                             8632.
Df Residuals:                    2806   BIC:                             8715.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
===============================================================================
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept       6.0646      0.042    143.445      0.000       5.982       6.147
gender_2        0.0086      0.042      0.205      0.838      -0.074       0.092
ESRD_Y          0.2341      0.078      2.997      0.003       0.081       0.387
SP_ALZHDMTA     0.3498      0.052      6.668      0.000       0.247       0.453
SP_CHF          0.4245      0.049      8.600      0.000       0.328       0.521
SP_CHRNKIDN     0.6666      0.063     10.616      0.000       0.543       0.790
SP_CNCR         0.5354      0.079      6.782      0.000       0.381       0.690
SP_COPD         0.4194      0.061      6.839      0.000       0.299       0.540
SP_DEPRESSN     0.3371      0.049      6.897      0.000       0.241       0.433
SP_DIABETES     0.4866      0.049      9.928      0.000       0.390       0.583
SP_ISCHMCHT     0.5945      0.049     12.222      0.000       0.499       0.690
SP_OSTEOPRS     0.2719      0.052      5.210      0.000       0.170       0.374
SP_RA_OA        0.3922      0.056      6.996      0.000       0.282       0.502
SP_STRKETIA     0.4120      0.096      4.286      0.000       0.224       0.601
==============================================================================
Omnibus:                       59.415   Durbin-Watson:                   1.953
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               75.069
Skew:                          -0.273   Prob(JB):                     5.00e-17
Kurtosis:                       3.583   Cond. No.                         7.50
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
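
The statsmodels fit can also be scored on the held-out 40% split for a like-for-like comparison with the scikit-learn numbers above; a quick sketch using the existing X_test, y_test, and r2_score:

# Predict with the formula-based fit on the test rows and compute R^2
y_pred_test_sm = res.predict(X_test)
print('Test R^2: {:,.3}%'.format(r2_score(y_test, y_pred_test_sm) * 100))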
