In [1]:
import os

import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [2]:
# Notebook-wide display settings: show up to 200 rows/columns and render
# floats with two decimals.
# Fully-qualified option names are used on purpose: the short forms
# 'max_columns' / 'max_rows' are treated as regex patterns and raise
# OptionError on pandas versions that also define
# 'styler.render.max_columns' / 'styler.render.max_rows'.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
pd.set_option('display.float_format', '{:.2f}'.format)

In [3]:
# Load the pickled claims table stored under the Chapter 7 directory.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
claims_path = '../Chapter 7 - Data Preparation and Visualization/claims_df'
df = pd.read_pickle(claims_path)

In [4]:
# Model used for the rest of the notebook. With n_estimators=1 the "forest"
# contains a single tree; random_state pins the fit so reruns are repeatable.
forest = RandomForestRegressor(
    random_state=314,
    n_estimators=1,
)

In [5]:
# Column names grouped by kind; the three groups are concatenated (in this
# order) to select the model's feature columns.
disease = [
    'SP_ALZHDMTA',
    'SP_CHF',
    'SP_CHRNKIDN',
    'SP_CNCR',
    'SP_COPD',
    'SP_DEPRESSN',
    'SP_DIABETES',
    'SP_ISCHMCHT',
    'SP_OSTEOPRS',
    'SP_RA_OA',
    'SP_STRKETIA',
]
gender = ['gender_2']
ESRD = ['ESRD_Y']

In [6]:
# Feature matrix: all rows, restricted to the disease, gender and ESRD
# columns (in that order).
X = df.loc[:, disease + gender + ESRD]
# Regression target: the TOTAL_LOG_PAID column.
y = df['TOTAL_LOG_PAID']

In [7]:
# Hold out 40% of the rows for evaluation; the fixed seed keeps the split
# identical across reruns.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=314,
)

In [8]:
# Sanity check: (rows, columns) of the training feature matrix.
X_train.shape


Out[8]:
(2820, 13)

In [9]:
# Fit on the training split only; the test split is reserved for evaluation.
forest.fit(X_train, y_train)


Out[9]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1, n_jobs=1,
           oob_score=False, random_state=314, verbose=0, warm_start=False)

In [10]:
# Predictions for the held-out rows, wrapped in a one-column frame.
# The frame gets a fresh 0..n-1 index, so it is not index-aligned with
# X_test / y_test.
y_pred = pd.DataFrame({'y_pred': forest.predict(X_test)})

In [13]:
# Line plot of the predictions in test-row order. Title and axis labels are
# set so the figure can be read without the surrounding cells; the trailing
# semicolon suppresses the return-value repr.
ax = y_pred.plot(title='Random forest predictions on the test set')
ax.set(xlabel='Test row position', ylabel='Predicted TOTAL_LOG_PAID');



In [14]:
# Keep a handle on the first fitted tree (the only one, since the forest
# was built with n_estimators=1).
first_tree = forest.estimators_[0]

In [15]:
# Pair each importance score from the fitted model with the name of the
# feature column it belongs to.
feat_imp = pd.Series(
    data=forest.feature_importances_,
    index=X.columns,
)

In [16]:
# Rank the features from most to least important.
feat_imp.sort_values(ascending=False)


Out[16]:
SP_ISCHMCHT   0.33
SP_CHRNKIDN   0.14
SP_DIABETES   0.12
SP_DEPRESSN   0.07
SP_ALZHDMTA   0.06
SP_CHF        0.06
SP_RA_OA      0.05
gender_2      0.04
SP_OSTEOPRS   0.04
SP_CNCR       0.03
SP_STRKETIA   0.03
SP_COPD       0.02
ESRD_Y        0.02
dtype: float64

In [17]:
# Raw prediction array for the test split (the same call that feeds y_pred).
forest.predict(X_test)


Out[17]:
array([7.25443132, 8.02943284, 5.6002005 , ..., 5.6002005 , 7.88246977,
       8.03281136])

In [18]:
# Score on the data the model was fitted on (for scikit-learn regressors,
# score() reports R^2). This is an optimistic figure.
forest.score(X_train, y_train)


Out[18]:
0.5615344687244734

In [19]:
# Score on the held-out split; a value well below the training score
# points to overfitting.
forest.score(X_test, y_test)


Out[19]:
0.3597939457853613

In [ ]: