In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from lime.lime_tabular import LimeTabularExplainer
pd.set_option('display.max_columns', None)
In [2]:
# Class to help select categorical vs. continuous data as part of the pipeline (see below)
class DataSelector(BaseEstimator, TransformerMixin):
    '''Select columns of numpy arrays based on attribute_indices.'''
    def __init__(self, attribute_indices):
        self.attribute_indices = attribute_indices
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return np.array(X)[:, self.attribute_indices]
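# Quick illustration of DataSelector on a toy array (illustrative only; this demo
# is not part of the original analysis): keep columns 0 and 2.
demo = np.array([[1, 'a', 10], [2, 'b', 20]], dtype=object)
print(DataSelector([0, 2]).fit_transform(demo))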
In [3]:
# Load (preprocessed) data
#
# The raw data was downloaded from https://data.stanford.edu/hcmst and preprocessed.
# We combined data sets collected across several years, we transformed select variables
# (e.g., partner_education to be at the same level of granularity as education),
# and added variables like the absolute age difference, education difference, etc.
# Finally, we determined whether couples were still together (i.e., our labels).
#
# We provide the preprocessed data as a csv file in the same repo as this notebook.
df = pd.read_csv('couples.csv')
# Order features (numeric first, categorical second) since it's convenient later
feature_order = ['age',
                 'partner_age',
                 'age_diff_abs',
                 'children',
                 'visits_relatives',
                 'education',
                 'marital_status',
                 'partner_education',
                 'gender',
                 'house',
                 'income',
                 'msa',
                 'rent',
                 'political',
                 'religion',
                 'work',
                 'gender_older',
                 'education_difference',
                 'success']
data = df[feature_order]
data = data[data.house != 'boat, rv, van, etc.'] # only one data point with this value, discard
labels = data.pop('success')
In [4]:
# Take a peek at the data
data.head()
Out[4]:
In [5]:
# Define categorical names and indices
categorical_features = list(data.columns[5:])
categorical_idx = list(range(5, len(data.columns)))
continuous_features = list(data.columns[0:5])
continuous_idx = list(range(0,5))
X = data.values
# Get feature names and their values for categorical data (needed for LIME)
categorical_names = {}
for idx, feature in zip(categorical_idx, categorical_features):
    le = LabelEncoder()
    X[:, idx] = le.fit_transform(X[:, idx])
    categorical_names[idx] = le.classes_
# To suppress a warning later (not strictly necessary)
X = X.astype(float)
# Train test split
train, test, labels_train, labels_test = train_test_split(
    X, labels, train_size=0.70, random_state=42
)
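# Optional sanity check (not part of the original notebook): see how LabelEncoder
# mapped a couple of the categorical columns to the integer codes used below.
for idx in categorical_idx[:2]:
    print(data.columns[idx], '->', list(categorical_names[idx]))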
In [6]:
# Preprocessing pipeline
#
# LIME needs a function that takes raw inputs and returns a prediction (see below).
# We use sklearn's Pipeline to handle preprocessing; it simplifies the interaction with LIME (see below).
# There are several ways to build this pipeline. For demo purposes, we show the verbose option here
# (and avoid scaling the one-hot encoded features).
continuous_pipeline = Pipeline([
    ('selector', DataSelector(continuous_idx)),
    ('scaler', StandardScaler()),
])
categorical_pipeline = Pipeline([
    ('selector', DataSelector(categorical_idx)),
    ('encoder', OneHotEncoder(sparse=False)),
])
preprocessing_pipeline = FeatureUnion(transformer_list=[
    ("continuous_pipeline", continuous_pipeline),
    ("categorical_pipeline", categorical_pipeline),
])
# There are less verbose alternatives, especially if we also scale the one-hot encoded features
# (an accepted practice in the machine learning community):
#
# preprocessing_pipeline = Pipeline([
# ('onehotencoder', OneHotEncoder(categorical_features=categorical_idx, sparse=False)),
# ('scaler', StandardScaler())
# ])
#
# Finally, instead of the low-level Pipeline constructor, we can use sklearn's make_pipeline:
#
# preprocessing_pipeline = make_pipeline(
# OneHotEncoder(categorical_features=categorical_idx, sparse=False),
# StandardScaler()
# )
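#
# Note: in newer scikit-learn releases (0.22+) OneHotEncoder no longer accepts the
# categorical_features argument used in the alternatives above; a roughly equivalent
# preprocessing step can be built with a ColumnTransformer instead. A sketch (not used
# in the rest of this notebook; for LIME you would also want dense output from the encoder):
#
# from sklearn.compose import ColumnTransformer
# preprocessing_pipeline = ColumnTransformer([
#     ('scale', StandardScaler(), continuous_idx),
#     ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_idx),
# ])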
In [7]:
# Set up the model and GridSearch for random forest hyperparameter tuning
param_grid = [{'n_estimators': [16, 20, 24],
               'max_features': [4, 8, 12, 16],
               'min_samples_split': [8, 12, 16],
               'max_depth': [15, 20, 25]}]
rf = RandomForestClassifier(class_weight='balanced')
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='neg_mean_squared_error')
# Extend the preprocessing pipeline to include random forest and grid search
pipeline = make_pipeline(preprocessing_pipeline, grid_search)
# Fit the model and tune the hyperparameters
pipeline.fit(train, labels_train)
Out[7]:
In [8]:
# Hyperparameters found by GridSearchCV
print(pipeline.named_steps['gridsearchcv'].best_params_)
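# Optional (a sketch, not in the original notebook): the best cross-validated score
# found by the grid search, using the scoring function defined above.
print('Best CV score:', pipeline.named_steps['gridsearchcv'].best_score_)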
In [9]:
# Evaluate the random forest classifier on training data (it overfits given the small sample size)
y_predict = pipeline.predict(train)
f1 = f1_score(labels_train, y_predict)
print('F1 on train:', f1)
# Evaluate the random forest classifier on test data
y_predict = pipeline.predict(test)
f1 = f1_score(labels_test, y_predict)
print('F1 on test:', f1)
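# Optional: a confusion matrix on the test set gives a fuller picture than F1 alone
# (a quick sketch, not part of the original evaluation).
from sklearn.metrics import confusion_matrix
print(confusion_matrix(labels_test, y_predict))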
In [22]:
from sklearn.metrics import roc_auc_score
roc_auc_score(labels_test, pipeline.predict(test))
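# Note: the line above scores hard 0/1 predictions; ROC AUC is usually computed from
# predicted probabilities instead. A sketch (assuming class 1 is "stayed together"):
# roc_auc_score(labels_test, pipeline.predict_proba(test)[:, 1])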
Out[22]:
In [10]:
# Get feature importances of random forest model ("global interpretability")
best_estimator = pipeline.named_steps['gridsearchcv'].best_estimator_
importances = best_estimator.feature_importances_
std = np.std([tree.feature_importances_ for tree in best_estimator.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
feature_names = data.columns
for f in range(data.shape[1]):
    print("%2d. feature %s (%f)" %
          (f + 1, feature_names[indices[f]], importances[indices[f]]))
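# Optional visualization (a sketch, not in the original notebook): bar chart of the
# sorted importances, with the per-tree standard deviations computed above as error bars.
import matplotlib.pyplot as plt
plt.figure()
plt.bar(range(len(importances)), importances[indices], yerr=std[indices])
plt.xticks(range(len(importances)), feature_names[indices], rotation=90)
plt.tight_layout()
plt.show()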
In [11]:
# Use LIME to explain individual predictions, initialize explainer object
explainer = LimeTabularExplainer(
    train,
    class_names=['BrokeUp', 'StayedTogether'],
    feature_names=list(data.columns),
    categorical_features=categorical_idx,
    categorical_names=categorical_names,
    discretize_continuous=True
)
In [12]:
# Explain a prediction ("local interpretability"):
# This is where the pipeline that takes raw data and returns the trained model's
# prediction comes in handy.
example = 3
exp = explainer.explain_instance(test[example], pipeline.predict_proba, num_features=5)
print("Couple's probability of staying together:", exp.predict_proba[1])
exp.as_pyplot_figure()
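# The explanation can also be inspected programmatically (a sketch):
# exp.as_list() returns the (feature, weight) pairs behind the plot above.
# print(exp.as_list())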
Out[12]:
In [13]:
# Explain another prediction ("local interpretability"):
example = 13
exp = explainer.explain_instance(test[example], pipeline.predict_proba, num_features=5)
print("Couple's probability of staying together:", exp.predict_proba[1])
exp.as_pyplot_figure()
Out[13]:
In [14]:
# Comparing the two examples above, we see that the model's predictions are explained by different features.
In [15]:
# Using LIME for relationship management (not advised): current chance of relationship success.
current = [34, 36, 2, 0, 1, 0, 1, 0, 0, 0, 18, 0, 2, 0, 8, 4, 0, 0]
exp = explainer.explain_instance(np.array(current), pipeline.predict_proba, num_features=5)
print("Couple's probability of staying together:", exp.predict_proba[1])
exp.as_pyplot_figure()
Out[15]:
In [16]:
# Should I ask for a pay increase? It doesn't matter much.
increase_income = [34, 36, 2, 0, 1, 0, 1, 0, 0, 0, 6, 0, 2, 0, 8, 4, 0, 0]
exp = explainer.explain_instance(np.array(increase_income), pipeline.predict_proba, num_features=5)
print("Couple's probability of staying together:", exp.predict_proba[1])
exp.as_pyplot_figure()
Out[16]:
In [17]:
# Should I buy a house? Maybe?
buy_house = [34, 36, 2, 0, 1, 0, 1, 0, 0, 0, 6, 0, 1, 0, 8, 4, 0, 0]
exp = explainer.explain_instance(np.array(buy_house), pipeline.predict_proba, num_features=5)
print("Couple's probability of staying together:", exp.predict_proba[1])
exp.as_pyplot_figure()
Out[17]:
In [18]:
# Really, it's best to get married.
get_married = [34, 36, 2, 0, 1, 0, 2, 0, 0, 0, 6, 0, 1, 0, 8, 4, 0, 0]
exp = explainer.explain_instance(np.array(get_married), pipeline.predict_proba, num_features=5)
print("Couple's probability of staying together:", exp.predict_proba[1])
exp.as_pyplot_figure()
Out[18]: