This is a companion notebook to the blog post Feature importance for data frame analytics with Elastic machine learning.
In [1]:
%load_ext autoreload
%autoreload 2
from itertools import groupby
from operator import itemgetter
import pprint
import eland
from elasticsearch import Elasticsearch, helpers
import ipywidgets as widgets
import numpy as np
import matplotlib.pyplot as pl
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
import requests
import seaborn as sns
This notebook assumes that you have Elasticsearch 7.6 running locally on port 9200.
In [2]:
# Some general notebook settings
sns.set_style("whitegrid")
sns.set_color_codes("muted")
sns.set_context("talk")
host = 'http://localhost:9200'
es = Elasticsearch(host)
analysis_job_id = "world-happiness-report"
dataset_index = "world-happiness-report"
result_index = "whr-regression-results"
write_images = False # this variable controls writing images for the blog post
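To confirm that the connection works before proceeding, a quick check such as the following can be run (the exact version string will depend on your installation):
# Print the Elasticsearch version to verify the connection
print(es.info()['version']['number'])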
In [3]:
# Get the dataset directly from the official source
df = pd.read_excel('https://s3.amazonaws.com/happiness-report/2019/Chapter2OnlineData.xls', 'Table2.1')
dataset = df.loc[:, :'Negative affect']
dataset = dataset.dropna()
dataset.columns = [col.lower().replace(' ', '_') for col in dataset.columns]
input_features = list(dataset.columns)
input_features.remove('country_name')
input_features.remove('year')
input_features.remove('life_ladder')
In [4]:
dataset.head()
Out[4]:
We use eland to upload the dataset to Elasticsearch.
In [5]:
eland.pandas_to_eland(dataset, es, dataset_index, es_if_exists='replace', es_refresh=True)
Out[5]:
Now, we create a data frame analytics regression job to analyze the data. We exclude the fields year, country_name, and country_name.keyword from the analysis, since we don't want those to influence our model. Note that we set the parameter num_top_feature_importance_values to 8, meaning we want to get feature importance values for all input features.
In [6]:
api = '/_ml/data_frame/analytics/{}'.format(analysis_job_id)
config = {
    "id": analysis_job_id,
    "description": "",
    "source": {
        "index": [
            dataset_index
        ],
        "query": {
            "match_all": {}
        }
    },
    "dest": {
        "index": result_index,
        "results_field": "ml"
    },
    "analysis": {
        "regression": {
            "dependent_variable": "life_ladder",
            "num_top_feature_importance_values": 8,
        }
    },
    "analyzed_fields": {
        "includes": [],
        "excludes": [
            "year",
            "country_name"
        ]
    }
}
pprint.pprint(requests.put(host+api, json=config).json())
Let's start the data frame analytics job. If everything goes smoothly, we receive {'acknowledged': True} as a result.
In [7]:
api = "/_ml/data_frame/analytics/{}/_start".format(analysis_job_id)
print(requests.post(host+api).json())
The analysis can take a couple of minutes. We can query the _stats API for progress. Once all phases are at 100% and the state is "stopped", the job is done.
In [8]:
api = "/_ml/data_frame/analytics/{}/_stats".format(analysis_job_id)
pprint.pprint(requests.get(host+api).json())
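Rather than re-running the cell above by hand, one can also poll the _stats API in a small loop until the job reports the "stopped" state. A minimal sketch (the response layout is assumed from the 7.6 _stats API; adjust the polling interval to your data size):
import time
api = "/_ml/data_frame/analytics/{}/_stats".format(analysis_job_id)
while True:
    stats = requests.get(host+api).json()['data_frame_analytics'][0]
    # print the current state and the per-phase progress
    print(stats['state'], [(p['phase'], p['progress_percent']) for p in stats['progress']])
    if stats['state'] == 'stopped':
        break
    time.sleep(10)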
We use eland to read out the results as a data frame.
In [9]:
result = eland.DataFrame(host, result_index)
display(result.describe())
For our analysis we focus only on the data from the latest survey in 2018.
In [10]:
dataset = eland.eland_to_pandas(result[result.year==2018])
dataset.index = dataset.country_name
# Note that data frame analytics reports only the feature importance values that are different from zero.
# Hence, we need to fill the missing values in the data frame with zeros.
dataset = dataset.fillna(0)
# feature importance baseline is the prediction average for the training set
base_line = result[result['ml.is_training'] == True]['ml.life_ladder_prediction'].mean()
For convenience, we create a data frame consisting solely of the feature importance values.
In [11]:
feature_importance_df = dataset[['ml.feature_importance.{}'.format(feature) for feature in input_features]]
feature_importance_df.columns = input_features
feature_importance_df.head()
Out[11]:
The figure below shows the decision path our model takes when predicting the happiness score of Argentina. The path starts at the baseline of 5.41 and then incrementally adds the feature importance values until it finally arrives at the predicted happiness score of 5.83. If the decision path goes left, the feature has a negative effect on the model prediction (e.g., “generosity”). If the decision path goes right, the feature has a positive effect (e.g., “healthy_life_expectancy_at_birth”).
In [12]:
def plot_decision(countries, annotate=False):
    y = np.arange(0, len(input_features)+1)
    x = {c:[base_line] for c in countries}
    pl.plot([base_line]*y.size, y, c='k', label='baseline')
    for feature in input_features[::-1]:
        feature_importance_column = 'ml.feature_importance.'+feature
        for country in countries:
            x[country].append(x[country][-1] + dataset.loc[country, feature_importance_column])
    for country in countries:
        pl.plot(x[country], y, '.-', label=country)
        if annotate:
            pl.annotate("{:.2f}".format(x[country][-1]),
                        xy=(x[country][-1], y[-1]),
                        xytext=(0,4),
                        textcoords="offset points", ha='center')
    pl.yticks(ticks=np.arange(len(input_features)+1), labels=['baseline'] + input_features[::-1])
    pl.xlim(dataset['ml.life_ladder_prediction'].min(), dataset['ml.life_ladder_prediction'].max())
    pl.xlabel("Prediction")
    pl.title("Decision path")
    pl.legend()
    pl.grid(True)
pl.figure(figsize=(15, 10))  # pl.figure (lowercase) sets up the current figure; figsize is in inches
plot_decision(["Argentina"])
if write_images:
pl.savefig("images/decision.png", bbox_inches = "tight")
pl.show()
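As a quick numerical sanity check of this additive relationship (using the base_line and feature_importance_df computed above), we can verify that the baseline plus the sum of Argentina's feature importance values reproduces the model prediction up to rounding:
# baseline plus the sum of feature importance values should match the prediction
country = "Argentina"
reconstructed = base_line + feature_importance_df.loc[country].sum()
print("{:.2f} reconstructed vs. {:.2f} predicted".format(
    reconstructed, dataset.loc[country, 'ml.life_ladder_prediction']))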
Since feature importance is computed for individual data points, we can aggregate the magnitudes into averages over the complete dataset and get a quick summary of which features are, in general, more important than others.
In [13]:
# aggregate the feature importance values
total_feature_importance = feature_importance_df.abs().mean().sort_values(ascending=False).to_frame().reset_index()
total_feature_importance.columns = ['feature', 'value']
total_feature_importance.head()
Out[13]:
In [14]:
barplot = sns.barplot(data=total_feature_importance, x='value', y='feature', color='b')
barplot.set_xlabel("Feature importance avg. magnitude", fontsize='small')
barplot.set_ylabel("")
barplot.set_title("Feature importance summary")
if write_images:
    fig = barplot.get_figure()
    fig.savefig("images/total_feature_importance.png", bbox_inches = "tight")
Here, we see a combination of a scatter plot and a violin plot showing the importance values for individual features. Features with higher overall feature importance tend to have a wider spread between their minimum and maximum values.
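To put a rough number on that spread, one could compare the range of importance values per feature, for example:
# range (max - min) of feature importance values for each feature
importance_range = feature_importance_df.max() - feature_importance_df.min()
print(importance_range.sort_values(ascending=False))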
In [15]:
# Prepare data in the right format
fi = feature_importance_df.stack().reset_index()
fi.columns = ['country_name', 'feature', 'importance']
fi.head()
Out[15]:
In [16]:
# Estimate optimal kernel density bandwidth for the violin plot
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.neighbors import KernelDensity
# We can have only one bandwidth, so we choose the best one for the most
# important feature
most_important_feature = total_feature_importance['feature'][0]
x = fi[fi.feature==most_important_feature]['importance'].to_numpy()
bandwidths = 10 ** np.linspace(-1, 1, 100)
grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                    {'bandwidth': bandwidths},
                    cv=LeaveOneOut())
grid.fit(x.reshape(-1,1));
bw = grid.best_params_['bandwidth']
In [17]:
pl.figure(figsize=(15, 10))
ax = sns.violinplot(x='importance', y='feature', data=fi, order=total_feature_importance['feature'],
                    inner=None, color='.8', scale='count', linewidth=0, bw=bw)
ax = sns.stripplot(x='importance', y='feature', data=fi, order=total_feature_importance['feature'],
                   color='b')
ax.set_title("Feature importance distribution")
ax.set_xlabel("Feature importance")
ax.set_ylabel("")
if write_images:
    fig = ax.get_figure()
    fig.savefig("images/feature_importance_distribution.png", bbox_inches = "tight")
The dependence plot shows the feature importance values as a function of the feature values. Additionally, the color of the markers indicates the target score.
In [18]:
def dependence_plot(feature, annotations):
    text = ["{}: {:.1f}".format(row.name, row['life_ladder']) for _, row in dataset.iterrows()]
    text = [t.replace('\n', '<br>') for t in text]
    fig = go.Figure()
    feature_importance_column = 'ml.feature_importance.'+feature
    target = dataset['life_ladder']
    fig.add_trace(go.Scatter(x = dataset[feature], y=dataset[feature_importance_column],
                             mode='markers',
                             text=text,
                             marker=dict(size=8, color=target, showscale=True, line_width=1,
                                         colorscale='Bluered',
                                         colorbar=dict(title="Happiness",
                                                       tickvals=[target.min(), target.max()],
                                                       ticktext=['low', 'high']))))
    fig.update_layout(
        title='Dependence plot',
        xaxis_title='Value of \"{}\"'.format(feature),
        yaxis_title="Importance of \"{}\"".format(feature),
        template='plotly_white',
        font=dict(size=16)
    )
    for annotation, ay in annotations:
        fig.add_annotation(
            go.layout.Annotation(
                x=dataset.loc[annotation, feature],
                y=dataset.loc[annotation, feature_importance_column],
                text=annotation,
                xref="x",
                yref="y",
                showarrow=True,
                arrowhead=7,
                ax=0,
                ay=ay,
                font = dict(size=12)
            )
        )
    if write_images:
        fig.write_image('images/dependence_plot_{}.png'.format(feature), width=800, height=600)
    fig.show()
In [19]:
dependence_plot("healthy_life_expectancy_at_birth", [('Switzerland', -20), ('Japan', 10),
('South Korea', 40), ('Spain',10), ('France', -10),
('Chad', 10), ('Ivory Coast', -10), ('Nigeria', 10),
('Mali', 10), ('Sierra Leone',-15)])
In [22]:
dependence_plot("log_gdp_per_capita", [("Luxembourg", -10), ("United Kingdom", -40)])