notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy as np

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

from lime.lime_tabular import LimeTabularExplainer

pd.set_option('display.max_columns', None)



In [2]:

    
# Class to help select categorical vs. continuous data as part of the pipeline (see below)
class DataSelector(BaseEstimator, TransformerMixin):
    '''Select columns of numpy arrays based on attribute_indices.'''

    def __init__(self, attribute_indices):
        self.attribute_indices = attribute_indices

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.array(X)[:,self.attribute_indices]



In [3]:

    
# Load (preprocessed) data
# 
# The raw data was downloaded from https://data.stanford.edu/hcmst and preprocessed.
# We combined data sets collected across several years, we transformed select variables 
# (e.g., partner_education to be at the same level of granularity as education),
# and added variables like the absolute age difference, education difference, etc.
# Finally, we determined whether couples were still together (i.e., our labels).
#
# We provide the preprocessed data as a csv file in the same repo as this notebook.

df = pd.read_csv('couples.csv')

# Order features (numeric first, categorical second) since it's convenient later
feature_order = ['age',
                 'partner_age',
                 'age_diff_abs',
                 'children',
                 'visits_relatives',
                 'education',
                 'marital_status',
                 'partner_education',
                 'gender',
                 'house',
                 'income',
                 'msa',
                 'rent',
                 'political',
                 'religion',
                 'work',
                 'gender_older',
                 'education_difference',
                 'success']

data = df[feature_order]
data = data[data.house != 'boat, rv, van, etc.'] # only one data point with this value, discard

labels = data.pop('success')



In [4]:

    
# Take a peak at the data
data.head()









    Out[4]:







  
    
      
      age
      partner_age
      age_diff_abs
      children
      visits_relatives
      education
      marital_status
      partner_education
      gender
      house
      income
      msa
      rent
      political
      religion
      work
      gender_older
      education_difference
    
  
  
    
      0
      52
      48
      4
      0.0
      0
      bachelor's degree or higher
      living with partner
      some college
      female
      a building with 2 or more apartments
      $20,000 to $24,999
      metro
      rented for cash
      democrat
      catholic
      working - as a paid employee
      1
      1
    
    
      1
      28
      30
      2
      0.0
      0
      bachelor's degree or higher
      living with partner
      bachelor's degree or higher
      female
      a building with 2 or more apartments
      $40,000 to $49,999
      metro
      rented for cash
      democrat
      jewish
      working - as a paid employee
      0
      0
    
    
      2
      31
      40
      9
      0.0
      1
      some college
      never married
      high school
      male
      a building with 2 or more apartments
      $40,000 to $49,999
      metro
      owned or being bought by you or someone in you...
      democrat
      other non-christian, please specify:
      working - as a paid employee
      1
      1
    
    
      3
      53
      55
      2
      0.0
      1
      bachelor's degree or higher
      living with partner
      bachelor's degree or higher
      male
      a one-family house detached from any other house
      $125,000 to $149,999
      metro
      owned or being bought by you or someone in you...
      democrat
      protestant (e.g., methodist, lutheran, presbyt...
      working - as a paid employee
      1
      0
    
    
      4
      58
      51
      7
      0.0
      0
      bachelor's degree or higher
      separated
      bachelor's degree or higher
      male
      a building with 2 or more apartments
      $15,000 to $19,999
      metro
      rented for cash
      democrat
      protestant (e.g., methodist, lutheran, presbyt...
      working - as a paid employee
      0
      0



In [5]:

    
# Define categorical names and indices
categorical_features = list(data.columns[5:])
categorical_idx = list(range(5, len(data.columns)))
continuous_features = list(data.columns[0:5])
continuous_idx = list(range(0,5))

X = data.values

# Get feature names and their values for categorical data (needed for LIME)
categorical_names = {}
for idx, feature in zip(categorical_idx, categorical_features):
    le = LabelEncoder()
    X[:, idx] = le.fit_transform(X[:, idx])
    categorical_names[idx] = le.classes_

# To suppress a warning later (not strictly necessary)
X = X.astype(float)

# Train test split
train, test, labels_train, labels_test = train_test_split(
    X, labels, train_size=0.70, random_state=42
)



In [6]:

    
# Preprocessing pipeline
#      
# LIME needs a function that takes raw inputs and returns a prediction (see below).     
# We use sklearn's pipeline to handle preprocessing, it simplifies the interaction with LIME (see below). 
# There are several ways to build this pipeline. For demo purposes, we here show the verbose option (and we
# avoid scaling one-hot encoded features).

continuous_pipeline = Pipeline([
    ('selector', DataSelector(continuous_idx)),
    ('scaler', StandardScaler()),
    ])

categorical_pipeline = Pipeline([
    ('selector', DataSelector(categorical_idx)),
    ('encoder', OneHotEncoder(sparse=False)),
    ])

preprocessing_pipeline = FeatureUnion(transformer_list=[
    ("continuous_pipeline", continuous_pipeline),
    ("categorical_pipeline", categorical_pipeline),
    ])

# There are less verbose alternatives, especially if we scale one-hot encoded features,
# an accepted practice in the machine learning community:
#
#     preprocessing_pipeline = Pipeline([
#        ('onehotencoder', OneHotEncoder(categorical_features=categorical_idx, sparse=False)),
#        ('scaler', StandardScaler())
#     ])
#
# Finally, instead of the low-level Pipeline constructor, we can use sklearn's makepipeline:
#
#     preprocessing_pipeline = make_pipeline(
#         OneHotEncoder(categorical_features=categorical_idx, sparse=False),
#         StandardScaler()
#     )



In [7]:

    
# Set up the model and GridSearch for random forest hyperparameter tuning
param_grid = [{'n_estimators': [16, 20, 24],
               'max_features': [4, 8, 12, 16],
               'min_samples_split': [8, 12, 16],
               'max_depth': [15, 20, 25]}]

rf = RandomForestClassifier(class_weight='balanced')
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='neg_mean_squared_error')

# Extend the preprocessing pipeline to include random forest and grid search
pipeline = make_pipeline(preprocessing_pipeline, grid_search)

# Fit the model and tune the hyperparameters
pipeline.fit(train, labels_train)









    Out[7]:





Pipeline(steps=[('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('continuous_pipeline', Pipeline(steps=[('selector', DataSelector(attribute_indices=[0, 1, 2, 3, 4])), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True))])), ('categorical_pipeline', Pipeline(steps=[('selector'...*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0))])



In [8]:

    
# Hyperparameters found by GridSearchCV
print(pipeline.named_steps['gridsearchcv'].best_params_)









    



{'max_depth': 15, 'max_features': 8, 'min_samples_split': 12, 'n_estimators': 24}



In [9]:

    
# Evalute random forest classifier on training data (it overfits, small sample size)
y_predict = pipeline.predict(train)
f1 = f1_score(labels_train, y_predict)
print('F1 on train:', f1)

# Evalute random forest classifier on test data
y_predict = pipeline.predict(test)
f1 = f1_score(labels_test, y_predict)
print('F1 on test:', f1)









    



F1 on train: 0.950226244344
F1 on test: 0.891530460624



In [22]:

    
from sklearn.metrics import roc_auc_score
roc_auc_score(labels_test, pipeline.predict(test))









    Out[22]:





0.77190026954177893



In [10]:

    
# Get feature importances of random forest model ("global interpretability")
best_estimator = pipeline.named_steps['gridsearchcv'].best_estimator_

importances = best_estimator.feature_importances_
std = np.std([tree.feature_importances_ for tree in best_estimator.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

print("Feature ranking:")
feature_names = data.columns
for f in range(data.shape[1]):
    print("%2d. feature %s (%f)" %
          (f + 1, feature_names[f], importances[indices[f]]))









    



Feature ranking:
 1. feature age (0.217943)
 2. feature partner_age (0.083152)
 3. feature age_diff_abs (0.077557)
 4. feature children (0.070348)
 5. feature visits_relatives (0.048671)
 6. feature education (0.045273)
 7. feature marital_status (0.030306)
 8. feature partner_education (0.024689)
 9. feature gender (0.018723)
10. feature house (0.016342)
11. feature income (0.014837)
12. feature msa (0.014315)
13. feature rent (0.011337)
14. feature political (0.010854)
15. feature religion (0.010586)
16. feature work (0.010496)
17. feature gender_older (0.010019)
18. feature education_difference (0.009984)



In [11]:

    
# Use LIME to explain individual predictions, initialize explainer object
explainer = LimeTabularExplainer(
    train,
    class_names=['BrokeUp', 'StayedTogether'],
    feature_names=list(data.columns),
    categorical_features=categorical_idx,
    categorical_names=categorical_names,
    discretize_continuous=True
)



In [12]:

    
# Explain a prediction ("local interpretability"): 
# Now we see that the pipeline that takes raw data and returns the prediction 
# of the trained model now comes in conveniently.
example = 3
exp = explainer.explain_instance(test[example], pipeline.predict_proba, num_features=5)
print('Couples probability of staying together:', exp.predict_proba[1])
exp.as_pyplot_figure()









    



Couples probability of staying together: 0.912100802305






    Out[12]:





<matplotlib.figure.Figure at 0x10dbf0748>



In [13]:

    
# Explain another prediction ("local interpretability"): 
example = 13
exp = explainer.explain_instance(test[example], pipeline.predict_proba, num_features=5)
print('Couples probability of staying together:', exp.predict_proba[1])
exp.as_pyplot_figure()









    



Couples probability of staying together: 0.718095959199






    Out[13]:



In [14]:

    
# and we see differences in explaining the model's predictions.



In [15]:

    
# Using LIME for for relationship management (not advised): Current chance of relationship success.
current = [34, 36, 2, 0, 1, 0, 1, 0, 0, 0, 18, 0, 2, 0, 8, 4, 0, 0]
exp = explainer.explain_instance(np.array(current), pipeline.predict_proba, num_features=5)
print('Couples probability of staying together:', exp.predict_proba[1])
exp.as_pyplot_figure()









    



Couples probability of staying together: 0.604344922264






    Out[15]:



In [16]:

    
# Should I ask for a pay increase? It doesn't matter much.
increase_income = [34, 36, 2, 0, 1, 0, 1, 0, 0, 0, 6, 0, 2, 0, 8, 4, 0, 0]
exp = explainer.explain_instance(np.array(increase_income), pipeline.predict_proba, num_features=5)
print('Couples probability of staying together:', exp.predict_proba[1])
exp.as_pyplot_figure()









    



Couples probability of staying together: 0.604344922264






    Out[16]:



In [17]:

    
# Should I buy a house? Maybe?
buy_house = [34, 36, 2, 0, 1, 0, 1, 0, 0, 0, 6, 0, 1, 0, 8, 4, 0, 0]
exp = explainer.explain_instance(np.array(buy_house), pipeline.predict_proba, num_features=5)
print('Couples probability of staying together:', exp.predict_proba[1])
exp.as_pyplot_figure()









    



Couples probability of staying together: 0.69376844127






    Out[17]:



In [18]:

    
# Really, it's best to get married.
get_married = [34, 36, 2, 0, 1, 0, 2, 0, 0, 0, 6, 0, 1, 0, 8, 4, 0, 0]
exp = explainer.explain_instance(np.array(get_married), pipeline.predict_proba, num_features=5)
print('Couples probability of staying together:', exp.predict_proba[1])
exp.as_pyplot_figure()









    



Couples probability of staying together: 0.709874246329






    Out[18]:



In [ ]:

	age	partner_age	age_diff_abs	visits_relatives	education	marital_status	partner_education	gender	house	income	msa	rent	political	religion	work	gender_older	education_difference
0	52	48	4	0	bachelor's degree or higher	living with partner	some college	female	a building with 2 or more apartments	$20,000 to $24,999	metro	rented for cash	democrat	catholic	working - as a paid employee	1	1
1	28	30	2	0	bachelor's degree or higher	living with partner	bachelor's degree or higher	female	a building with 2 or more apartments	$40,000 to $49,999	metro	rented for cash	democrat	jewish	working - as a paid employee	0	0
2	31	40	9	1	some college	never married	high school	male	a building with 2 or more apartments	$40,000 to $49,999	metro	owned or being bought by you or someone in you...	democrat	other non-christian, please specify:	working - as a paid employee	1	1
3	53	55	2	1	bachelor's degree or higher	living with partner	bachelor's degree or higher	male	a one-family house detached from any other house	$125,000 to $149,999	metro	owned or being bought by you or someone in you...	democrat	protestant (e.g., methodist, lutheran, presbyt...	working - as a paid employee	1	0
4	58	51	7	0	bachelor's degree or higher	separated	bachelor's degree or higher	male	a building with 2 or more apartments	$15,000 to $19,999	metro	rented for cash	democrat	protestant (e.g., methodist, lutheran, presbyt...	working - as a paid employee	0	0