In [1]:
import os
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.metrics import r2_score
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
In [2]:
# Widen pandas' display limits so wide/long frames are not truncated.
# The fully-qualified 'display.*' keys are required: option names are matched
# as regex patterns, and the bare 'max_columns' / 'max_rows' also match
# 'styler.render.max_columns' / 'styler.render.max_rows' in pandas >= 1.4,
# which raises OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
# Render floats with two decimal places in displayed output.
pd.set_option('display.float_format', '{:.2f}'.format)
In [3]:
# Load the pickled claims frame stored under the Chapter 7 directory.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
claims_pickle_path = '../Chapter 7 - Data Preparation and Visualization/claims_df'
df = pd.read_pickle(claims_pickle_path)
In [4]:
# Names of the SP_* disease columns (one per line for easy diffing).
disease = [
    'SP_ALZHDMTA',
    'SP_CHF',
    'SP_CHRNKIDN',
    'SP_CNCR',
    'SP_COPD',
    'SP_DEPRESSN',
    'SP_DIABETES',
    'SP_ISCHMCHT',
    'SP_OSTEOPRS',
    'SP_RA_OA',
    'SP_STRKETIA',
]
In [5]:
# Predictor column groups. Each group is a list so the groups can be
# concatenated with `+` when selecting columns.
disease = [
    'SP_ALZHDMTA', 'SP_CHF', 'SP_CHRNKIDN', 'SP_CNCR',
    'SP_COPD', 'SP_DEPRESSN', 'SP_DIABETES', 'SP_ISCHMCHT',
    'SP_OSTEOPRS', 'SP_RA_OA', 'SP_STRKETIA',
]
gender, ESRD = ['gender_2'], ['ESRD_Y']
In [6]:
# Predictor matrix: disease columns first, then gender, then ESRD.
feature_cols = disease + gender + ESRD
X = df[feature_cols]
# Response column.
y = df['TOTAL_LOG_PAID']
In [7]:
# Hold out 40% of the rows as the test set; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=314,
)
In [8]:
# Show the predictor column names held in X (displayed as the cell output).
X.columns
Out[8]:
In [9]:
# Fit a linear regression with scikit-learn and report fit quality.
#
# The estimator is bound to `sk_lm` rather than `lm`: a later cell imports a
# callable that is also named `lm`, and sharing the name meant that re-running
# this cell after that import rebinds `lm` to a LinearRegression instance and
# breaks the subsequent `lm(X_train, y_train)` call.
sk_lm = linear_model.LinearRegression()
sk_lm.fit(X_train, y_train)

# Predictions on both splits, used to compare in-sample and out-of-sample R^2.
y_pred_train = sk_lm.predict(X_train)
y_pred_test = sk_lm.predict(X_test)

# R^2 is scaled by 100 and printed with 3 significant digits.
train_r2_pct = r2_score(y_train, y_pred_train)*100
test_r2_pct = r2_score(y_test, y_pred_test)*100
print('Training R^2: {:,.3}%'.format(train_r2_pct))
print('Test R^2: {:,.3}%'.format(test_r2_pct))

# Intercept and per-column coefficients, 2 significant digits each.
print('Intercept: {:,.2}'.format(sk_lm.intercept_))
coef_labels = ('{}: {:,.2}'.format(col, coef_val) for col, coef_val in zip(X.columns, sk_lm.coef_))
print('Coefficients: ' + ', '.join(coef_labels))
In [10]:
from actuary.regression.linear_regression import lm
In [11]:
# Print the help text (docstring and signature) of the imported `lm`.
help(lm)
In [12]:
# Call the imported `lm` on the training split and keep the returned object.
# NOTE(review): presumably this fits a linear regression (it comes from
# actuary.regression.linear_regression) -- confirm against help(lm).
model = lm(X_train, y_train)
In [13]:
# Display the model's `r2` attribute (presumably the training R^2 -- confirm in lm's docs).
model.r2
Out[13]:
In [14]:
# Display the model's `result` attribute as the cell output.
model.result
Out[14]:
In [15]:
# Display `model.result` ordered by its 'coef' column, largest values first.
model.result.sort_values(by='coef', ascending=False)
Out[15]:
In [16]:
# Build a separate frame for the statsmodels formula API.
# copy=True is explicit because the DataFrame constructor does not copy
# DataFrame input by default; this frame gets an extra column assigned to it
# afterwards, and that write must not touch (or warn about) X_train's data.
X_train_df = pd.DataFrame(X_train, columns=X.columns, copy=True)
In [17]:
# Attach the response as a column so the formula below can reference it by
# name; pandas aligns y_train to X_train_df on the row index.
X_train_df.loc[:, 'TOTAL_LOG_PAID'] = y_train
In [18]:
# Fit the same linear model with statsmodels to get the full inference table
# (standard errors, t-stats, p-values).
#
# The right-hand side is built from the predictor lists instead of being typed
# out by hand, so the formula cannot drift from the columns selected into X.
# The gender, ESRD, disease ordering yields exactly:
#   TOTAL_LOG_PAID ~ gender_2 + ESRD_Y + SP_ALZHDMTA + ... + SP_STRKETIA
predictors = gender + ESRD + disease
ols_formula = 'TOTAL_LOG_PAID ~ ' + ' + '.join(predictors)
mod = smf.ols(formula=ols_formula, data=X_train_df)
res = mod.fit()
# summary() returns a Summary object; print() renders its plain-text table.
print(res.summary())
In [ ]: