In [1]:
# Reload edited modules automatically before each cell executes.
%load_ext autoreload
%autoreload 2
import os, sys
# Put the parent directory on the import path.
# NOTE(review): presumably so a local checkout of `merf` is importable — confirm.
sys.path.append('..')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's "talk" plotting context for all figures.
sns.set_context("talk")
# Load the line_profiler extension; no profiling magic is used in the cells visible here.
%load_ext line_profiler
import numpy as np
import pandas as pd
import matplotlib as mpl
# Default figure size (width, height) for every matplotlib figure.
mpl.rcParams['figure.figsize'] = (11,8)
from sklearn.ensemble import RandomForestRegressor
In [2]:
from merf.utils import MERFDataGenerator
from merf.merf import MERF
from merf.viz import plot_merf_training_stats
In [3]:
# Synthetic-data generator used for every split below; the meaning of the three
# parameters is defined by MERFDataGenerator.
dgm = MERFDataGenerator(
    m=0.6,
    sigma_b=np.sqrt(4.5),
    sigma_e=1,
)
In [4]:
# How many clusters are created for each size listed below.
num_clusters_each_size = 20
# Sizes handed to create_cluster_sizes_array for the training split and for the
# two test splits (known clusters / new clusters).
train_sizes = list(range(1, 10, 2))
known_sizes = [9 * size for size in train_sizes]
new_sizes = [10 * size for size in train_sizes]
In [5]:
# Smaller set that can be used for debugging
# num_clusters_each_size = 1
# train_sizes = [1, 3]
# known_sizes = [9, 27]
# new_sizes = [10, 30]
In [6]:
train_cluster_sizes = MERFDataGenerator.create_cluster_sizes_array(train_sizes, num_clusters_each_size)
known_cluster_sizes = MERFDataGenerator.create_cluster_sizes_array(known_sizes, num_clusters_each_size)
new_cluster_sizes = MERFDataGenerator.create_cluster_sizes_array(new_sizes, num_clusters_each_size)
In [7]:
len(train_cluster_sizes), len(known_cluster_sizes), len(new_cluster_sizes)
Out[7]:
In [8]:
train, test_known, test_new, training_cluster_ids, ptev, prev = dgm.generate_split_samples(train_cluster_sizes, known_cluster_sizes, new_cluster_sizes)
In [9]:
len(train), len(test_known), len(test_new)
Out[9]:
In [10]:
train.head()
Out[10]:
In [11]:
# Slice the training frame into the four inputs that MERF.fit receives below.
X_train = train.loc[:, ['X_0', 'X_1', 'X_2']]
Z_train = train.loc[:, ['Z']]
clusters_train = train.loc[:, 'cluster']
y_train = train.loc[:, 'y']
In [12]:
# Validation data = both test frames stacked row-wise (known clusters first, then new).
val = pd.concat([test_known, test_new], axis=0)
X_val = val.loc[:, ['X_0', 'X_1', 'X_2']]
Z_val = val.loc[:, ['Z']]
clusters_val = val.loc[:, 'cluster']
y_val = val.loc[:, 'y']
In [13]:
# Fit a MERF model on the training split only (no validation data passed).
# NOTE(review): max_iterations is presumably the iteration budget of the fitting loop — confirm.
mrf = MERF(max_iterations=5)
mrf.fit(X_train, Z_train, clusters_train, y_train)
Out[13]:
In [14]:
# Plot the training statistics of the fitted model, limited to 10 clusters.
plot_merf_training_stats(mrf, num_clusters_to_plot=10)
In [15]:
# With validation loss
# `mrf` is rebound here, so every later cell that uses `mrf` refers to this
# 15-iteration model fitted with the validation inputs, not the 5-iteration one.
mrf = MERF(max_iterations=15)
mrf.fit(X_train, Z_train, clusters_train, y_train, X_val, Z_val, clusters_val, y_val)
Out[15]:
In [16]:
# Same training-statistics plot, now for the model fitted with validation data.
plot_merf_training_stats(mrf, num_clusters_to_plot=10)
In [17]:
from lightgbm import LGBMRegressor
# LightGBM regressor with default settings, passed as the first argument to MERF below.
lgbm = LGBMRegressor()
In [18]:
# MERF with the LightGBM regressor, fitted on the training split only.
mrf_lgbm = MERF(lgbm, max_iterations=15)
mrf_lgbm.fit(X_train, Z_train, clusters_train, y_train)
Out[18]:
In [19]:
# With validation loss
# Rebinds `mrf_lgbm` (the model fitted in the previous cell is discarded) and
# passes the same `lgbm` instance again.
# NOTE(review): whether MERF copies the estimator it is given is not visible here — confirm.
mrf_lgbm = MERF(lgbm, max_iterations=15)
mrf_lgbm.fit(X_train, Z_train, clusters_train, y_train, X_val, Z_val, clusters_val, y_val)
Out[19]:
In [20]:
# Training statistics of the LightGBM-based model, limited to 10 clusters.
plot_merf_training_stats(mrf_lgbm, num_clusters_to_plot=10)
In [321]:
# Slice the known-cluster test frame into the inputs used for prediction and scoring.
X_known = test_known.loc[:, ['X_0', 'X_1', 'X_2']]
Z_known = test_known.loc[:, ['Z']]
clusters_known = test_known.loc[:, 'cluster']
y_known = test_known.loc[:, 'y']
In [322]:
# Predict on the known-cluster test split with the current `mrf` model and display the result.
y_hat_known = mrf.predict(X_known, Z_known, clusters_known)
y_hat_known
Out[322]:
In [323]:
# Sanity check: one prediction per test row.
assert len(y_hat_known) == len(y_known)
In [324]:
# Slice the new-cluster test frame into the inputs used for prediction and scoring.
X_new = test_new.loc[:, ['X_0', 'X_1', 'X_2']]
Z_new = test_new.loc[:, ['Z']]
clusters_new = test_new.loc[:, 'cluster']
y_new = test_new.loc[:, 'y']
In [325]:
# Predict on the new-cluster test split with the current `mrf` model and display the result.
y_hat_new = mrf.predict(X_new, Z_new, clusters_new)
y_hat_new
Out[325]:
In [326]:
# Sanity check: one prediction per test row.
assert len(y_hat_new) == len(y_new)
In [301]:
# Show the `trained_fe_model` attribute of the fitted MERF object; it is the estimator
# handed to the partial-dependence and SHAP cells below.
mrf.trained_fe_model
Out[301]:
In [302]:
# `plot_partial_dependence` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `PartialDependenceDisplay.from_estimator`, which takes the same leading
# (estimator, X, features) arguments. Bind the old name to whichever is available so
# the call site keeps working on both old and new scikit-learn.
try:
    from sklearn.inspection import PartialDependenceDisplay
    plot_partial_dependence = PartialDependenceDisplay.from_estimator
except (ImportError, AttributeError):
    # Older scikit-learn: the class is missing or has no `from_estimator` yet.
    from sklearn.inspection import plot_partial_dependence
In [303]:
# Partial dependence of `mrf.trained_fe_model` on the columns at positions 0 and 1
# of X_known (i.e. 'X_0' and 'X_1'), evaluated on the known-cluster test split.
features = [0, 1]
plot_partial_dependence(mrf.trained_fe_model, X_known, features)
Out[303]:
In [304]:
import shap
In [135]:
# explain the model's predictions using SHAP
# (same syntax works for LightGBM, CatBoost, scikit-learn and spark models)
# The explainer is built on `mrf.trained_fe_model` and evaluated on X_known only.
explainer = shap.TreeExplainer(mrf.trained_fe_model)
shap_values = explainer.shap_values(X_known)
In [136]:
# summarize the effects of all the features
shap.summary_plot(shap_values, X_known)
In [137]:
# Same summary, drawn with plot_type="bar".
shap.summary_plot(shap_values, X_known, plot_type="bar")
In [138]:
# create a dependence plot to show the effect of a single feature across the whole dataset
shap.dependence_plot('X_0', shap_values, X_known)
In [ ]:
# Train and test using classic random forest.
from sklearn.ensemble import RandomForestRegressor
import numpy as np
In [ ]:
rf = RandomForestRegressor(n_estimators=300, n_jobs=-1)
In [ ]:
rf.fit(X_train, y_train)
In [ ]:
y_hat_known_rf = rf.predict(X_known)
In [ ]:
pmse_known_rf = np.mean((y_known - y_hat_known_rf) ** 2)
pmse_known_rf
In [ ]:
y_hat_known_merf = mrf.predict(X_known, Z_known, clusters_known)
In [ ]:
pmse_known_merf = np.mean((y_known - y_hat_known_merf) ** 2)
pmse_known_merf
In [ ]:
rd_known = 100 * (pmse_known_rf - pmse_known_merf) / pmse_known_rf
rd_known
In [ ]:
y_hat_new_rf = rf.predict(X_new)
In [ ]:
pmse_new_rf = np.mean((y_new - y_hat_new_rf) ** 2)
pmse_new_rf
In [ ]:
y_hat_new_merf = mrf.predict(X_new, Z_new, clusters_new)
In [ ]:
pmse_new_merf = np.mean((y_new - y_hat_new_merf) ** 2)
pmse_new_merf
In [ ]:
rd_new = 100 * (pmse_new_rf - pmse_new_merf) / pmse_new_rf
rd_new
In [ ]:
rf_ohe = RandomForestRegressor(n_estimators=300, n_jobs=-1)
In [ ]:
categories = np.sort(clusters_train.unique())
In [ ]:
clusters_train_prime = pd.Categorical(clusters_train, categories=categories)
In [ ]:
X_ohe = pd.get_dummies(clusters_train_prime, prefix='cluster')
In [ ]:
X_ohe.head()
In [ ]:
X_train_ohe = pd.merge(X_train, X_ohe, left_index=True, right_index=True)
In [ ]:
rf_ohe.fit(X_train_ohe, y_train)
In [ ]:
clusters_known_prime = pd.Categorical(clusters_known, categories=categories)
In [ ]:
X_known_ohe = pd.get_dummies(clusters_known_prime, prefix='cluster')
In [ ]:
X_known_ohe.head()
In [ ]:
X_known_w_ohe = pd.merge(X_known, X_known_ohe, left_index=True, right_index=True)
In [ ]:
y_hat_known_rf_ohe = rf_ohe.predict(X_known_w_ohe)
In [ ]:
mse_known_rf_ohe = np.mean((y_known - y_hat_known_rf_ohe) ** 2)
mse_known_rf_ohe
In [ ]:
clusters_new_prime = pd.Categorical(clusters_new, categories=categories)
In [ ]:
X_new_ohe = pd.get_dummies(clusters_new_prime, prefix='cluster')
In [ ]:
X_new_ohe.head()
In [ ]:
X_new_w_ohe = pd.merge(X_new, X_new_ohe, left_index=True, right_index=True)
In [ ]:
y_hat_new_rf_ohe = rf_ohe.predict(X_new_w_ohe)
In [ ]:
mse_new_rf_ohe = np.mean((y_new - y_hat_new_rf_ohe) ** 2)
mse_new_rf_ohe
In [ ]: