This is a companion notebook to the blog post Feature importance for data frame analytics with Elastic machine learning.
In [1]:
%load_ext autoreload
%autoreload 2
from itertools import groupby
from operator import itemgetter
import pprint
import eland
from elasticsearch import Elasticsearch, helpers
import ipywidgets as widgets
import numpy as np
import matplotlib.pyplot as pl
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
import requests
import seaborn as sns
This notebook assumes that you have Elasticsearch 7.6 running locally on port 9200.
In [2]:
# Some general notebook settings
sns.set_style("whitegrid")
sns.set_color_codes("muted")
sns.set_context("talk")
host = 'http://localhost:9200'
es = Elasticsearch(host)
analysis_job_id = "world-happiness-report"
dataset_index = "world-happiness-report"
result_index = "whr-regression-results"
write_images = False # this variable controls writing images for the blog post
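To confirm that the connection works before proceeding, a quick check such as the following can be run (the exact version string will depend on your installation):
# Print the Elasticsearch version to verify the connection
print(es.info()['version']['number'])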
In [3]:
# Get the dataset directly from the official source
df = pd.read_excel('https://s3.amazonaws.com/happiness-report/2019/Chapter2OnlineData.xls', 'Table2.1')
dataset = df.loc[:, :'Negative affect']
dataset = dataset.dropna()
dataset.columns = [col.lower().replace(' ', '_') for col in dataset.columns]
input_features = list(dataset.columns)
input_features.remove('country_name')
input_features.remove('year')
input_features.remove('life_ladder')
In [4]:
dataset.head()
Out[4]:
We use eland to upload the dataset to Elasticsearch.
In [5]:
eland.pandas_to_eland(dataset, es, dataset_index, es_if_exists='replace', es_refresh=True)
Out[5]:
Now, we create a data frame analytics regression job to analyze the data. We exclude the fields year, country_name, and country_name.keyword from the analysis, since we don't want those to influence our model. Note that we set the parameter num_top_feature_importance_values to 8, meaning we want to get feature importance values for all input features.
In [6]:
api = '/_ml/data_frame/analytics/{}'.format(analysis_job_id)
config = {
    "id": analysis_job_id,
    "description": "",
    "source": {
        "index": [
            dataset_index
        ],
        "query": {
            "match_all": {}
        }
    },
    "dest": {
        "index": result_index,
        "results_field": "ml"
    },
    "analysis": {
        "regression": {
            "dependent_variable": "life_ladder",
            "num_top_feature_importance_values": 8,
        }
    },
    "analyzed_fields": {
        "includes": [],
        "excludes": [
            "year",
            "country_name"
        ]
    }
}
pprint.pprint(requests.put(host+api, json=config).json())
Let's start the data frame analytics job. If everything goes smoothly, we receive {'acknowledged': True} as a result.
In [7]:
api = "/_ml/data_frame/analytics/{}/_start".format(analysis_job_id)
print(requests.post(host+api).json())
The analysis can take a couple of minutes. We can query the _stats API for progress. Once all phases are at 100% and the state is "stopped", the job is done.
In [8]:
api = "/_ml/data_frame/analytics/{}/_stats".format(analysis_job_id)
pprint.pprint(requests.get(host+api).json())
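Rather than re-running the cell above by hand, one can also poll the _stats API in a small loop until the job reports the "stopped" state. A minimal sketch (the response layout is assumed from the 7.6 _stats API; adjust the polling interval to your data size):
import time
api = "/_ml/data_frame/analytics/{}/_stats".format(analysis_job_id)
while True:
    stats = requests.get(host+api).json()['data_frame_analytics'][0]
    # print the current state and the per-phase progress
    print(stats['state'], [(p['phase'], p['progress_percent']) for p in stats['progress']])
    if stats['state'] == 'stopped':
        break
    time.sleep(10)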
We use eland to read out the results as a data frame.
In [9]:
result = eland.DataFrame(host, result_index)
display(result.describe())
For our analysis we focus only on the data from the latest survey in 2018.
In [10]:
dataset = eland.eland_to_pandas(result[result.year==2018])
dataset.index = dataset.country_name
# Note that data frame analytics reports only the feature importance values that are different from zero.
# Hence, we need to fill the missing values in the data frame with zeros.
dataset = dataset.fillna(0)
# feature importance baseline is the prediction average for the training set
base_line = result[result['ml.is_training'] == True]['ml.life_ladder_prediction'].mean()
For convenience, we create a data frame consisting solely of the feature importance values.
In [11]:
feature_importance_df = dataset[['ml.feature_importance.{}'.format(feature) for feature in input_features]]
feature_importance_df.columns = input_features
feature_importance_df.head()
Out[11]:
The figure below shows the decision path our model takes when predicting the happiness score of Argentina. The path starts at the baseline of 5.41 and then incrementally adds the feature importance values until it finally arrives at the predicted happiness score of 5.83. If the decision path goes left, the feature has a negative effect on the model prediction (e.g., “generosity”). If the decision path goes right, the feature has a positive effect (e.g., “healthy_life_expectancy_at_birth”).
In [12]:
def plot_decision(countries, annotate=False):
    y = np.arange(0, len(input_features)+1)
    x = {c:[base_line] for c in countries}
    pl.plot([base_line]*y.size, y, c='k', label='baseline')
    for feature in input_features[::-1]:
        feature_importance_column = 'ml.feature_importance.'+feature
        for country in countries:
            x[country].append(x[country][-1] + dataset.loc[country, feature_importance_column])
    for country in countries:
        pl.plot(x[country], y, '.-', label=country)
        if annotate:
            pl.annotate("{:.2f}".format(x[country][-1]),
                        xy=(x[country][-1], y[-1]),
                        xytext=(0,4),
                        textcoords="offset points", ha='center')
    pl.yticks(ticks=np.arange(len(input_features)+1), labels=['baseline'] + input_features[::-1])
    pl.xlim(dataset['ml.life_ladder_prediction'].min(), dataset['ml.life_ladder_prediction'].max())
    pl.xlabel("Prediction")
    pl.title("Decision path")
    pl.legend()
    pl.grid(True)
pl.figure(figsize=(15, 10))  # pl.figure (lowercase) sets up the current figure; figsize is in inches
plot_decision(["Argentina"])
if write_images:
pl.savefig("images/decision.png", bbox_inches = "tight")
pl.show()
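As a quick numerical sanity check of this additive relationship (using the base_line and feature_importance_df computed above), we can verify that the baseline plus the sum of Argentina's feature importance values reproduces the model prediction up to rounding:
# baseline plus the sum of feature importance values should match the prediction
country = "Argentina"
reconstructed = base_line + feature_importance_df.loc[country].sum()
print("{:.2f} reconstructed vs. {:.2f} predicted".format(
    reconstructed, dataset.loc[country, 'ml.life_ladder_prediction']))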
Since feature importance is computed for individual data points, we can aggregate the magnitudes into averages over the complete dataset and get a quick summary of which features are, in general, more important than others.
In [13]:
# aggregate the feature importance values
total_feature_importance = feature_importance_df.abs().mean().sort_values(ascending=False).to_frame().reset_index()
total_feature_importance.columns = ['feature', 'value']
total_feature_importance.head()
Out[13]:
In [14]:
barplot = sns.barplot(data=total_feature_importance, x='value', y='feature', color='b')
barplot.set_xlabel("Feature importance avg. magnitude", fontsize='small')
barplot.set_ylabel("")
barplot.set_title("Feature importance summary")
if write_images:
    fig = barplot.get_figure()
    fig.savefig("images/total_feature_importance.png", bbox_inches = "tight")
Here, we see a combination of a scatter plot and a violin plot showing the importance values for individual features. Features with higher overall feature importance tend to have a wider spread between their minimum and maximum values.
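To put a rough number on that spread, one could compare the range of importance values per feature, for example:
# range (max - min) of feature importance values for each feature
importance_range = feature_importance_df.max() - feature_importance_df.min()
print(importance_range.sort_values(ascending=False))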
In [15]:
# Prepare data in the right format
fi = feature_importance_df.stack().reset_index()
fi.columns = ['country_name', 'feature', 'importance']
fi.head()
Out[15]:
In [16]:
# Estimate optimal kernel density bandwidth for the violin plot
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.neighbors import KernelDensity
# We can have only one bandwidth, so we choose the best one for the most
# important feature
most_important_feature = total_feature_importance['feature'][0]
x = fi[fi.feature==most_important_feature]['importance'].to_numpy()
bandwidths = 10 ** np.linspace(-1, 1, 100)
grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                    {'bandwidth': bandwidths},
                    cv=LeaveOneOut())
grid.fit(x.reshape(-1,1));
bw = grid.best_params_['bandwidth']
In [17]:
pl.figure(figsize=(15, 10))
ax = sns.violinplot(x='importance', y='feature', data=fi, order=total_feature_importance['feature'],
                    inner=None, color='.8', scale='count', linewidth=0, bw=bw)
ax = sns.stripplot(x='importance', y='feature', data=fi, order=total_feature_importance['feature'],
                   color='b')
ax.set_title("Feature importance distribution")
ax.set_xlabel("Feature importance")
ax.set_ylabel("")
if write_images:
    fig = ax.get_figure()
    fig.savefig("images/feature_importance_distribution.png", bbox_inches = "tight")
The dependence plot shows the feature importance values as a function of the feature values. Additionally, the color of the markers indicates the target score.
In [18]:
def dependence_plot(feature, annotations):
    text = ["{}: {:.1f}".format(row.name, row['life_ladder']) for _, row in dataset.iterrows()]
    text = [t.replace('\n', '<br>') for t in text]
    fig = go.Figure()
    feature_importance_column = 'ml.feature_importance.'+feature
    target = dataset['life_ladder']
    fig.add_trace(go.Scatter(x = dataset[feature], y=dataset[feature_importance_column],
                             mode='markers',
                             text=text,
                             marker=dict(size=8, color=target, showscale=True, line_width=1,
                                         colorscale='Bluered',
                                         colorbar=dict(title="Happiness",
                                                       tickvals=[target.min(), target.max()],
                                                       ticktext=['low', 'high']))))
    fig.update_layout(
        title='Dependence plot',
        xaxis_title='Value of \"{}\"'.format(feature),
        yaxis_title="Importance of \"{}\"".format(feature),
        template='plotly_white',
        font=dict(size=16)
    )
    for annotation, ay in annotations:
        fig.add_annotation(
            go.layout.Annotation(
                x=dataset.loc[annotation, feature],
                y=dataset.loc[annotation, feature_importance_column],
                text=annotation,
                xref="x",
                yref="y",
                showarrow=True,
                arrowhead=7,
                ax=0,
                ay=ay,
                font = dict(size=12)
            )
        )
    if write_images:
        fig.write_image('images/dependence_plot_{}.png'.format(feature), width=800, height=600)
    fig.show()
In [19]:
dependence_plot("healthy_life_expectancy_at_birth", [('Switzerland', -20), ('Japan', 10),
('South Korea', 40), ('Spain',10), ('France', -10),
('Chad', 10), ('Ivory Coast', -10), ('Nigeria', 10),
('Mali', 10), ('Sierra Leone',-15)])
In [22]:
dependence_plot("log_gdp_per_capita", [("Luxembourg", -10), ("United Kingdom", -40)])