In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
In [2]:
# Notebook-wide pandas display settings: show up to 200 columns / 200 rows
# before truncating, and render floats with two decimals.
#
# The fully-qualified ``display.*`` keys are used on purpose.  ``set_option``
# treats its key as a regex pattern, and in newer pandas the bare patterns
# 'max_columns' / 'max_rows' also match the ``styler.render.*`` options, which
# makes the call raise ``OptionError: Pattern matched multiple keys``.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
pd.set_option('display.float_format', '{:.2f}'.format)
In [3]:
# Load the claims frame that was pickled by the Chapter 7 notebook.  The path
# is relative, so this cell only works when the kernel's working directory is
# this notebook's folder.
# NOTE(review): unpickling executes arbitrary code embedded in the file -- only
# load pickles produced by a trusted source.
df = pd.read_pickle(
    filepath_or_buffer='../Chapter 7 - Data Preparation and Visualization/claims_df',
)
In [4]:
# A random forest restricted to a single tree (n_estimators=1).
# random_state is pinned so that re-running the notebook rebuilds the same model.
forest = RandomForestRegressor(
    random_state=314,
    n_estimators=1,
)
In [5]:
# Column-name groups that are concatenated later to select the model's
# predictor columns from the claims frame.
# NOTE(review): the SP_* names look like per-condition indicator columns and
# gender_2 / ESRD_Y like dummy-encoded columns -- confirm against the
# data-preparation notebook.
disease = [
    'SP_ALZHDMTA',
    'SP_CHF',
    'SP_CHRNKIDN',
    'SP_CNCR',
    'SP_COPD',
    'SP_DEPRESSN',
    'SP_DIABETES',
    'SP_ISCHMCHT',
    'SP_OSTEOPRS',
    'SP_RA_OA',
    'SP_STRKETIA',
]
gender = [
    'gender_2',
]
ESRD = [
    'ESRD_Y',
]
In [6]:
# Predictors: the disease, gender and ESRD columns, in that order.
X = df[[*disease, *gender, *ESRD]]
# Target: the TOTAL_LOG_PAID column of the claims frame.
y = df['TOTAL_LOG_PAID']
In [7]:
# Hold out 40% of the rows for evaluation; the seed is fixed so the same rows
# land in each partition on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=314,
    test_size=0.4,
)
In [8]:
# Sanity check: display the (rows, columns) of the training predictors.
X_train.shape
Out[8]:
In [9]:
# Train the model on the training partition only; the fitted estimator is the
# cell's displayed value.
forest.fit(X=X_train, y=y_train)
Out[9]:
In [10]:
# Predictions for the held-out rows as a one-column frame named 'y_pred'.
# No index is supplied, so the frame carries a fresh 0..n-1 index rather than
# X_test's index -- align positionally, not by label, when comparing to y_test.
y_pred = pd.Series(forest.predict(X_test), name='y_pred').to_frame()
In [13]:
# Line plot of the predicted values in row order; the trailing semicolon
# suppresses the Axes repr in the cell output.
y_pred.plot.line();
In [14]:
# Keep a handle on the first fitted tree of the ensemble (the forest above is
# configured with n_estimators=1, so this is its only tree).
first_tree = forest.estimators_[0]
In [15]:
# Pair each fitted feature importance with the name of its predictor column.
feat_imp = pd.Series(
    data=forest.feature_importances_,
    index=X.columns,
)
In [16]:
# Show the importances from largest to smallest (returns a new Series;
# feat_imp itself is left in its original order).
feat_imp.sort_values(ascending=False)
Out[16]:
In [17]:
# Display the raw prediction array for the held-out rows.  This recomputes the
# same predictions already stored in y_pred.
forest.predict(X_test)
Out[17]:
In [18]:
# Model score on the data it was trained on (for scikit-learn regressors,
# score() is documented as R^2).  Read it alongside the held-out score rather
# than on its own.
forest.score(X_train, y_train)
Out[18]:
In [19]:
# Model score on the held-out rows (for scikit-learn regressors, score() is
# documented as R^2); this is the figure to use for generalisation.
forest.score(X_test, y_test)
Out[19]:
In [ ]: